diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 981d5326c3..93ed6b75ef 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2622,6 +2622,13 @@ ENABLE_RAG_HYBRID_SEARCH = PersistentConfig( os.environ.get("ENABLE_RAG_HYBRID_SEARCH", "").lower() == "true", ) +ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = PersistentConfig( + "ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS", + "rag.enable_hybrid_search_enriched_texts", + os.environ.get("ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS", "False").lower() + == "true", +) + RAG_FULL_CONTEXT = PersistentConfig( "RAG_FULL_CONTEXT", "rag.full_context", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index c71cb755e1..c694008696 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -337,6 +337,7 @@ from open_webui.config import ( ENABLE_ONEDRIVE_PERSONAL, ENABLE_ONEDRIVE_BUSINESS, ENABLE_RAG_HYBRID_SEARCH, + ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS, ENABLE_RAG_LOCAL_WEB_FETCH, ENABLE_WEB_LOADER_SSL_VERIFICATION, ENABLE_GOOGLE_DRIVE_INTEGRATION, @@ -841,6 +842,9 @@ app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT = FILE_IMAGE_COMPRESSION_HEIGHT app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = BYPASS_EMBEDDING_AND_RETRIEVAL app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH +app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = ( + ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS +) app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE diff --git a/backend/open_webui/retrieval/utils.py b/backend/open_webui/retrieval/utils.py index f20884a4d2..370737ba55 100644 --- a/backend/open_webui/retrieval/utils.py +++ b/backend/open_webui/retrieval/utils.py @@ -148,6 +148,44 @@ def get_doc(collection_name: str, user: UserModel = None): raise e +def get_enriched_texts(collection_result: GetResult) -> list[str]: + enriched_texts = [] + for idx, text in enumerate(collection_result.documents[0]): + metadata = collection_result.metadatas[0][idx] + metadata_parts = [text] + + # Add filename (repeat twice for extra weight in BM25 scoring) + if metadata.get("name"): + filename = metadata["name"] + filename_tokens = ( + filename.replace("_", " ").replace("-", " ").replace(".", " ") + ) + metadata_parts.append( + f"Filename: {filename} {filename_tokens} {filename_tokens}" + ) + + # Add title if available + if metadata.get("title"): + metadata_parts.append(f"Title: {metadata['title']}") + + # Add document section headings if available (from markdown splitter) + if metadata.get("headings") and isinstance(metadata["headings"], list): + headings = " > ".join(str(h) for h in metadata["headings"]) + metadata_parts.append(f"Section: {headings}") + + # Add source URL/path if available + if metadata.get("source"): + metadata_parts.append(f"Source: {metadata['source']}") + + # Add snippet for web search results + if metadata.get("snippet"): + metadata_parts.append(f"Snippet: {metadata['snippet']}") + + enriched_texts.append(" ".join(metadata_parts)) + + return enriched_texts + + def query_doc_with_hybrid_search( collection_name: str, collection_result: GetResult, @@ -158,6 +196,7 @@ def query_doc_with_hybrid_search( k_reranker: int, r: float, hybrid_bm25_weight: float, + enable_enriched_texts: bool = False, ) -> dict: try: # First check if collection_result has the required attributes @@ -180,8 +219,14 @@ def query_doc_with_hybrid_search( log.debug(f"query_doc_with_hybrid_search:doc {collection_name}") + bm25_texts = ( + get_enriched_texts(collection_result) + if enable_enriched_texts + else collection_result.documents[0] + ) + bm25_retriever = BM25Retriever.from_texts( - texts=collection_result.documents[0], + texts=bm25_texts, metadatas=collection_result.metadatas[0], ) bm25_retriever.k = k @@ -397,6 +442,7 @@ def query_collection_with_hybrid_search( k_reranker: int, r: float, hybrid_bm25_weight: float, + enable_enriched_texts: bool = False, ) -> dict: results = [] error = False @@ -431,6 +477,7 @@ def query_collection_with_hybrid_search( k_reranker=k_reranker, r=r, hybrid_bm25_weight=hybrid_bm25_weight, + enable_enriched_texts=enable_enriched_texts, ) return result, None except Exception as e: @@ -762,6 +809,7 @@ def get_sources_from_items( k_reranker=k_reranker, r=r, hybrid_bm25_weight=hybrid_bm25_weight, + enable_enriched_texts=request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS, ) except Exception as e: log.debug( diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 7ebeb05bfd..80ef02caf8 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -431,6 +431,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT, # Hybrid search settings "ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH, + "ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS, "TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER, "RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD, "HYBRID_BM25_WEIGHT": request.app.state.config.HYBRID_BM25_WEIGHT, @@ -615,6 +616,7 @@ class ConfigForm(BaseModel): # Hybrid search settings ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None + ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS: Optional[bool] = None TOP_K_RERANKER: Optional[int] = None RELEVANCE_THRESHOLD: Optional[float] = None HYBRID_BM25_WEIGHT: Optional[float] = None @@ -721,6 +723,11 @@ async def update_rag_config( if form_data.ENABLE_RAG_HYBRID_SEARCH is not None else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH ) + request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = ( + form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS + if form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS is not None + else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS + ) request.app.state.config.TOP_K_RERANKER = ( form_data.TOP_K_RERANKER @@ -2324,6 +2331,7 @@ class QueryCollectionsForm(BaseModel): r: Optional[float] = None hybrid: Optional[bool] = None hybrid_bm25_weight: Optional[float] = None + enable_enriched_texts: Optional[bool] = None @router.post("/query/collection") @@ -2364,6 +2372,11 @@ def query_collection_handler( if form_data.hybrid_bm25_weight else request.app.state.config.HYBRID_BM25_WEIGHT ), + enable_enriched_texts=( + form_data.enable_enriched_texts + if form_data.enable_enriched_texts is not None + else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS + ), ) else: return query_collection( diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 640a3a58d6..aae75dfbd8 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -1152,6 +1152,21 @@ {#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true} +
+
+ {$i18n.t('Enrich Hybrid Search Text')} +
+
+ + + +
+
+