mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-12 04:15:25 +00:00
feat: Adding file metadata to hybrid search (#19095)
* Added metadata to hybrid search * And config and env plus refac * consistency --------- Co-authored-by: Tim Baek <tim@openwebui.com>
This commit is contained in:
parent
42071cb8e8
commit
07ef295a77
5 changed files with 88 additions and 1 deletions
|
|
@ -2622,6 +2622,13 @@ ENABLE_RAG_HYBRID_SEARCH = PersistentConfig(
|
|||
os.environ.get("ENABLE_RAG_HYBRID_SEARCH", "").lower() == "true",
|
||||
)
|
||||
|
||||
# Toggle for enriching the BM25 side of hybrid search with file metadata.
# Disabled unless the environment variable is set to "true" (case-insensitive);
# the setting is registered under the "rag.enable_hybrid_search_enriched_texts"
# config path.
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = PersistentConfig(
    "ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS",
    "rag.enable_hybrid_search_enriched_texts",
    (
        os.environ.get("ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS", "False").lower()
        == "true"
    ),
)
|
||||
|
||||
RAG_FULL_CONTEXT = PersistentConfig(
|
||||
"RAG_FULL_CONTEXT",
|
||||
"rag.full_context",
|
||||
|
|
|
|||
|
|
@ -337,6 +337,7 @@ from open_webui.config import (
|
|||
ENABLE_ONEDRIVE_PERSONAL,
|
||||
ENABLE_ONEDRIVE_BUSINESS,
|
||||
ENABLE_RAG_HYBRID_SEARCH,
|
||||
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
|
||||
ENABLE_RAG_LOCAL_WEB_FETCH,
|
||||
ENABLE_WEB_LOADER_SSL_VERIFICATION,
|
||||
ENABLE_GOOGLE_DRIVE_INTEGRATION,
|
||||
|
|
@ -841,6 +842,9 @@ app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT = FILE_IMAGE_COMPRESSION_HEIGHT
|
|||
app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT
|
||||
app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = BYPASS_EMBEDDING_AND_RETRIEVAL
|
||||
app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
|
||||
app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = (
|
||||
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
||||
)
|
||||
app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION
|
||||
|
||||
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
||||
|
|
|
|||
|
|
@ -148,6 +148,44 @@ def get_doc(collection_name: str, user: UserModel = None):
|
|||
raise e
|
||||
|
||||
|
||||
def get_enriched_texts(collection_result: "GetResult") -> list[str]:
    """Build one metadata-enriched text per document for BM25 indexing.

    Every text in ``collection_result.documents[0]`` is followed by labelled
    fragments taken from the metadata at the same index of
    ``collection_result.metadatas[0]``, in this order: filename, title,
    section headings, source and snippet. Metadata keys that are missing or
    falsy are skipped, and ``headings`` is only used when it is a list.

    A metadata entry that is ``None`` or missing (including an absent
    metadata list) is treated as empty, so the result always contains exactly
    one string per document and stays index-aligned with the documents.

    Args:
        collection_result: Result object whose ``documents[0]`` holds the
            chunk texts and whose ``metadatas[0]`` holds the matching
            metadata mappings.

    Returns:
        The enriched texts, in the same order as ``documents[0]``.
    """
    documents = collection_result.documents[0]
    metadata_lists = collection_result.metadatas or [None]
    metadatas = metadata_lists[0] or []

    # Built once: maps "_", "-" and "." to spaces so a filename is split into
    # separate word tokens.
    separators_to_spaces = str.maketrans("_-.", "   ")

    enriched_texts = []
    for idx, text in enumerate(documents):
        # Tolerate a short metadata list or a None entry instead of crashing.
        metadata = (metadatas[idx] if idx < len(metadatas) else None) or {}
        metadata_parts = [text]

        # Add the raw filename once plus its tokenised form twice, giving the
        # filename words extra weight in BM25 scoring.
        if metadata.get("name"):
            filename = str(metadata["name"])
            filename_tokens = filename.translate(separators_to_spaces)
            metadata_parts.append(
                f"Filename: {filename} {filename_tokens} {filename_tokens}"
            )

        # Add title if available
        if metadata.get("title"):
            metadata_parts.append(f"Title: {metadata['title']}")

        # Add document section headings if available (from markdown splitter)
        headings = metadata.get("headings")
        if headings and isinstance(headings, list):
            heading_path = " > ".join(str(h) for h in headings)
            metadata_parts.append(f"Section: {heading_path}")

        # Add source URL/path if available
        if metadata.get("source"):
            metadata_parts.append(f"Source: {metadata['source']}")

        # Add snippet for web search results
        if metadata.get("snippet"):
            metadata_parts.append(f"Snippet: {metadata['snippet']}")

        enriched_texts.append(" ".join(metadata_parts))

    return enriched_texts
|
||||
|
||||
|
||||
def query_doc_with_hybrid_search(
|
||||
collection_name: str,
|
||||
collection_result: GetResult,
|
||||
|
|
@ -158,6 +196,7 @@ def query_doc_with_hybrid_search(
|
|||
k_reranker: int,
|
||||
r: float,
|
||||
hybrid_bm25_weight: float,
|
||||
enable_enriched_texts: bool = False,
|
||||
) -> dict:
|
||||
try:
|
||||
# First check if collection_result has the required attributes
|
||||
|
|
@ -180,8 +219,14 @@ def query_doc_with_hybrid_search(
|
|||
|
||||
log.debug(f"query_doc_with_hybrid_search:doc {collection_name}")
|
||||
|
||||
bm25_texts = (
|
||||
get_enriched_texts(collection_result)
|
||||
if enable_enriched_texts
|
||||
else collection_result.documents[0]
|
||||
)
|
||||
|
||||
bm25_retriever = BM25Retriever.from_texts(
|
||||
texts=collection_result.documents[0],
|
||||
texts=bm25_texts,
|
||||
metadatas=collection_result.metadatas[0],
|
||||
)
|
||||
bm25_retriever.k = k
|
||||
|
|
@ -397,6 +442,7 @@ def query_collection_with_hybrid_search(
|
|||
k_reranker: int,
|
||||
r: float,
|
||||
hybrid_bm25_weight: float,
|
||||
enable_enriched_texts: bool = False,
|
||||
) -> dict:
|
||||
results = []
|
||||
error = False
|
||||
|
|
@ -431,6 +477,7 @@ def query_collection_with_hybrid_search(
|
|||
k_reranker=k_reranker,
|
||||
r=r,
|
||||
hybrid_bm25_weight=hybrid_bm25_weight,
|
||||
enable_enriched_texts=enable_enriched_texts,
|
||||
)
|
||||
return result, None
|
||||
except Exception as e:
|
||||
|
|
@ -762,6 +809,7 @@ def get_sources_from_items(
|
|||
k_reranker=k_reranker,
|
||||
r=r,
|
||||
hybrid_bm25_weight=hybrid_bm25_weight,
|
||||
enable_enriched_texts=request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
|
||||
)
|
||||
except Exception as e:
|
||||
log.debug(
|
||||
|
|
|
|||
|
|
@ -431,6 +431,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
|||
"RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
|
||||
# Hybrid search settings
|
||||
"ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH,
|
||||
"ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
|
||||
"TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER,
|
||||
"RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD,
|
||||
"HYBRID_BM25_WEIGHT": request.app.state.config.HYBRID_BM25_WEIGHT,
|
||||
|
|
@ -615,6 +616,7 @@ class ConfigForm(BaseModel):
|
|||
|
||||
# Hybrid search settings
|
||||
ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None
|
||||
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS: Optional[bool] = None
|
||||
TOP_K_RERANKER: Optional[int] = None
|
||||
RELEVANCE_THRESHOLD: Optional[float] = None
|
||||
HYBRID_BM25_WEIGHT: Optional[float] = None
|
||||
|
|
@ -721,6 +723,11 @@ async def update_rag_config(
|
|||
if form_data.ENABLE_RAG_HYBRID_SEARCH is not None
|
||||
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH
|
||||
)
|
||||
request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = (
|
||||
form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
||||
if form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS is not None
|
||||
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
||||
)
|
||||
|
||||
request.app.state.config.TOP_K_RERANKER = (
|
||||
form_data.TOP_K_RERANKER
|
||||
|
|
@ -2324,6 +2331,7 @@ class QueryCollectionsForm(BaseModel):
|
|||
r: Optional[float] = None
|
||||
hybrid: Optional[bool] = None
|
||||
hybrid_bm25_weight: Optional[float] = None
|
||||
enable_enriched_texts: Optional[bool] = None
|
||||
|
||||
|
||||
@router.post("/query/collection")
|
||||
|
|
@ -2364,6 +2372,11 @@ def query_collection_handler(
|
|||
if form_data.hybrid_bm25_weight
|
||||
else request.app.state.config.HYBRID_BM25_WEIGHT
|
||||
),
|
||||
enable_enriched_texts=(
|
||||
form_data.enable_enriched_texts
|
||||
if form_data.enable_enriched_texts is not None
|
||||
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
||||
),
|
||||
)
|
||||
else:
|
||||
return query_collection(
|
||||
|
|
|
|||
|
|
@ -1152,6 +1152,21 @@
|
|||
</div>
|
||||
|
||||
{#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true}
|
||||
<div class="mb-2.5 flex w-full justify-between">
|
||||
<div class="self-center text-xs font-medium">
|
||||
{$i18n.t('Enrich Hybrid Search Text')}
|
||||
</div>
|
||||
<div class="flex items-center relative">
|
||||
<Tooltip
|
||||
content={$i18n.t(
|
||||
'Adds filenames, titles, sections, and snippets into the BM25 text to improve lexical recall.'
|
||||
)}
|
||||
>
|
||||
<Switch bind:state={RAGConfig.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS} />
|
||||
</Tooltip>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class=" mb-2.5 flex flex-col w-full justify-between">
|
||||
<div class="flex w-full justify-between">
|
||||
<div class=" self-center text-xs font-medium">
|
||||
|
|
|
|||
Loading…
Reference in a new issue