feat: Adding file metadata to hybrid search (#19095)

* Added metadata to hybrid search

* And config and env plus refac

* consistency

---------

Co-authored-by: Tim Baek <tim@openwebui.com>
This commit is contained in:
Jacob Leksan 2025-11-18 15:29:07 -05:00 committed by GitHub
parent 42071cb8e8
commit 07ef295a77
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 88 additions and 1 deletions

View file

@ -2622,6 +2622,13 @@ ENABLE_RAG_HYBRID_SEARCH = PersistentConfig(
os.environ.get("ENABLE_RAG_HYBRID_SEARCH", "").lower() == "true", os.environ.get("ENABLE_RAG_HYBRID_SEARCH", "").lower() == "true",
) )
# Opt-in switch for metadata-enriched BM25 texts in hybrid search.
# Only the (case-insensitive) string "true" in the environment enables it;
# an unset variable falls back to "False", i.e. disabled.
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = PersistentConfig(
    "ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS",
    "rag.enable_hybrid_search_enriched_texts",
    os.getenv("ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS", "False").lower() == "true",
)
RAG_FULL_CONTEXT = PersistentConfig( RAG_FULL_CONTEXT = PersistentConfig(
"RAG_FULL_CONTEXT", "RAG_FULL_CONTEXT",
"rag.full_context", "rag.full_context",

View file

@ -337,6 +337,7 @@ from open_webui.config import (
ENABLE_ONEDRIVE_PERSONAL, ENABLE_ONEDRIVE_PERSONAL,
ENABLE_ONEDRIVE_BUSINESS, ENABLE_ONEDRIVE_BUSINESS,
ENABLE_RAG_HYBRID_SEARCH, ENABLE_RAG_HYBRID_SEARCH,
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
ENABLE_RAG_LOCAL_WEB_FETCH, ENABLE_RAG_LOCAL_WEB_FETCH,
ENABLE_WEB_LOADER_SSL_VERIFICATION, ENABLE_WEB_LOADER_SSL_VERIFICATION,
ENABLE_GOOGLE_DRIVE_INTEGRATION, ENABLE_GOOGLE_DRIVE_INTEGRATION,
@ -841,6 +842,9 @@ app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT = FILE_IMAGE_COMPRESSION_HEIGHT
app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT
app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = BYPASS_EMBEDDING_AND_RETRIEVAL app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = BYPASS_EMBEDDING_AND_RETRIEVAL
app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = (
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
)
app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE

View file

@ -148,6 +148,44 @@ def get_doc(collection_name: str, user: UserModel = None):
raise e raise e
def get_enriched_texts(collection_result: GetResult) -> list[str]:
    """Build BM25 input texts by appending searchable metadata to each chunk.

    For every document in the first result set of ``collection_result`` the
    chunk text is followed by whichever of these metadata fields are present
    and truthy: ``name`` (as "Filename: ..."), ``title``, ``headings`` (only
    when it is a list), ``source`` and ``snippet``.

    Args:
        collection_result: Object exposing ``documents`` and ``metadatas``;
            only index ``0`` of each is read.

    Returns:
        One enriched string per document, in document order. The list always
        has the same length as ``collection_result.documents[0]`` so it stays
        aligned with the metadata list handed to the BM25 retriever.
    """
    documents = collection_result.documents[0]
    metadatas = (collection_result.metadatas or [None])[0] or []

    # Single pass that turns the usual filename separators into spaces so the
    # individual words of a filename become separate BM25 tokens.
    separators_to_spaces = str.maketrans("_-.", "   ")

    enriched_texts = []
    for idx, text in enumerate(documents):
        # A chunk may have no metadata (missing entry, None, or a non-dict
        # value); treat that as "nothing to add" instead of crashing.
        raw_metadata = metadatas[idx] if idx < len(metadatas) else None
        metadata = raw_metadata if isinstance(raw_metadata, dict) else {}

        # Coerce to str so the final join cannot fail on a non-string chunk.
        metadata_parts = ["" if text is None else str(text)]

        # Add filename; the tokenized form is repeated twice to give filename
        # words extra weight in BM25 scoring.
        if metadata.get("name"):
            filename = str(metadata["name"])
            filename_tokens = filename.translate(separators_to_spaces)
            metadata_parts.append(
                f"Filename: {filename} {filename_tokens} {filename_tokens}"
            )

        # Add title if available
        if metadata.get("title"):
            metadata_parts.append(f"Title: {metadata['title']}")

        # Add document section headings if available (from markdown splitter)
        if metadata.get("headings") and isinstance(metadata["headings"], list):
            headings = " > ".join(str(h) for h in metadata["headings"])
            metadata_parts.append(f"Section: {headings}")

        # Add source URL/path if available
        if metadata.get("source"):
            metadata_parts.append(f"Source: {metadata['source']}")

        # Add snippet for web search results
        if metadata.get("snippet"):
            metadata_parts.append(f"Snippet: {metadata['snippet']}")

        enriched_texts.append(" ".join(metadata_parts))

    return enriched_texts
def query_doc_with_hybrid_search( def query_doc_with_hybrid_search(
collection_name: str, collection_name: str,
collection_result: GetResult, collection_result: GetResult,
@ -158,6 +196,7 @@ def query_doc_with_hybrid_search(
k_reranker: int, k_reranker: int,
r: float, r: float,
hybrid_bm25_weight: float, hybrid_bm25_weight: float,
enable_enriched_texts: bool = False,
) -> dict: ) -> dict:
try: try:
# First check if collection_result has the required attributes # First check if collection_result has the required attributes
@ -180,8 +219,14 @@ def query_doc_with_hybrid_search(
log.debug(f"query_doc_with_hybrid_search:doc {collection_name}") log.debug(f"query_doc_with_hybrid_search:doc {collection_name}")
bm25_texts = (
get_enriched_texts(collection_result)
if enable_enriched_texts
else collection_result.documents[0]
)
bm25_retriever = BM25Retriever.from_texts( bm25_retriever = BM25Retriever.from_texts(
texts=collection_result.documents[0], texts=bm25_texts,
metadatas=collection_result.metadatas[0], metadatas=collection_result.metadatas[0],
) )
bm25_retriever.k = k bm25_retriever.k = k
@ -397,6 +442,7 @@ def query_collection_with_hybrid_search(
k_reranker: int, k_reranker: int,
r: float, r: float,
hybrid_bm25_weight: float, hybrid_bm25_weight: float,
enable_enriched_texts: bool = False,
) -> dict: ) -> dict:
results = [] results = []
error = False error = False
@ -431,6 +477,7 @@ def query_collection_with_hybrid_search(
k_reranker=k_reranker, k_reranker=k_reranker,
r=r, r=r,
hybrid_bm25_weight=hybrid_bm25_weight, hybrid_bm25_weight=hybrid_bm25_weight,
enable_enriched_texts=enable_enriched_texts,
) )
return result, None return result, None
except Exception as e: except Exception as e:
@ -762,6 +809,7 @@ def get_sources_from_items(
k_reranker=k_reranker, k_reranker=k_reranker,
r=r, r=r,
hybrid_bm25_weight=hybrid_bm25_weight, hybrid_bm25_weight=hybrid_bm25_weight,
enable_enriched_texts=request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
) )
except Exception as e: except Exception as e:
log.debug( log.debug(

View file

@ -431,6 +431,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT, "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
# Hybrid search settings # Hybrid search settings
"ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH, "ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH,
"ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
"TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER, "TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER,
"RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD, "RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD,
"HYBRID_BM25_WEIGHT": request.app.state.config.HYBRID_BM25_WEIGHT, "HYBRID_BM25_WEIGHT": request.app.state.config.HYBRID_BM25_WEIGHT,
@ -615,6 +616,7 @@ class ConfigForm(BaseModel):
# Hybrid search settings # Hybrid search settings
ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS: Optional[bool] = None
TOP_K_RERANKER: Optional[int] = None TOP_K_RERANKER: Optional[int] = None
RELEVANCE_THRESHOLD: Optional[float] = None RELEVANCE_THRESHOLD: Optional[float] = None
HYBRID_BM25_WEIGHT: Optional[float] = None HYBRID_BM25_WEIGHT: Optional[float] = None
@ -721,6 +723,11 @@ async def update_rag_config(
if form_data.ENABLE_RAG_HYBRID_SEARCH is not None if form_data.ENABLE_RAG_HYBRID_SEARCH is not None
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH
) )
request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = (
form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
if form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS is not None
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
)
request.app.state.config.TOP_K_RERANKER = ( request.app.state.config.TOP_K_RERANKER = (
form_data.TOP_K_RERANKER form_data.TOP_K_RERANKER
@ -2324,6 +2331,7 @@ class QueryCollectionsForm(BaseModel):
r: Optional[float] = None r: Optional[float] = None
hybrid: Optional[bool] = None hybrid: Optional[bool] = None
hybrid_bm25_weight: Optional[float] = None hybrid_bm25_weight: Optional[float] = None
enable_enriched_texts: Optional[bool] = None
@router.post("/query/collection") @router.post("/query/collection")
@ -2364,6 +2372,11 @@ def query_collection_handler(
if form_data.hybrid_bm25_weight if form_data.hybrid_bm25_weight
else request.app.state.config.HYBRID_BM25_WEIGHT else request.app.state.config.HYBRID_BM25_WEIGHT
), ),
enable_enriched_texts=(
form_data.enable_enriched_texts
if form_data.enable_enriched_texts is not None
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
),
) )
else: else:
return query_collection( return query_collection(

View file

@ -1152,6 +1152,21 @@
</div> </div>
{#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true} {#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true}
<div class="mb-2.5 flex w-full justify-between">
<div class="self-center text-xs font-medium">
{$i18n.t('Enrich Hybrid Search Text')}
</div>
<div class="flex items-center relative">
<Tooltip
content={$i18n.t(
'Adds filenames, titles, sections, and snippets into the BM25 text to improve lexical recall.'
)}
>
<Switch bind:state={RAGConfig.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS} />
</Tooltip>
</div>
</div>
<div class=" mb-2.5 flex flex-col w-full justify-between"> <div class=" mb-2.5 flex flex-col w-full justify-between">
<div class="flex w-full justify-between"> <div class="flex w-full justify-between">
<div class=" self-center text-xs font-medium"> <div class=" self-center text-xs font-medium">