mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-11 20:05:19 +00:00
feat: Adding file metadata to hybrid search (#19095)
* Added metadata to hybrid search * And config and env plus refac * consistency --------- Co-authored-by: Tim Baek <tim@openwebui.com>
This commit is contained in:
parent
42071cb8e8
commit
07ef295a77
5 changed files with 88 additions and 1 deletions
|
|
@ -2622,6 +2622,13 @@ ENABLE_RAG_HYBRID_SEARCH = PersistentConfig(
|
||||||
os.environ.get("ENABLE_RAG_HYBRID_SEARCH", "").lower() == "true",
|
os.environ.get("ENABLE_RAG_HYBRID_SEARCH", "").lower() == "true",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = PersistentConfig(
|
||||||
|
"ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS",
|
||||||
|
"rag.enable_hybrid_search_enriched_texts",
|
||||||
|
os.environ.get("ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS", "False").lower()
|
||||||
|
== "true",
|
||||||
|
)
|
||||||
|
|
||||||
RAG_FULL_CONTEXT = PersistentConfig(
|
RAG_FULL_CONTEXT = PersistentConfig(
|
||||||
"RAG_FULL_CONTEXT",
|
"RAG_FULL_CONTEXT",
|
||||||
"rag.full_context",
|
"rag.full_context",
|
||||||
|
|
|
||||||
|
|
@ -337,6 +337,7 @@ from open_webui.config import (
|
||||||
ENABLE_ONEDRIVE_PERSONAL,
|
ENABLE_ONEDRIVE_PERSONAL,
|
||||||
ENABLE_ONEDRIVE_BUSINESS,
|
ENABLE_ONEDRIVE_BUSINESS,
|
||||||
ENABLE_RAG_HYBRID_SEARCH,
|
ENABLE_RAG_HYBRID_SEARCH,
|
||||||
|
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
|
||||||
ENABLE_RAG_LOCAL_WEB_FETCH,
|
ENABLE_RAG_LOCAL_WEB_FETCH,
|
||||||
ENABLE_WEB_LOADER_SSL_VERIFICATION,
|
ENABLE_WEB_LOADER_SSL_VERIFICATION,
|
||||||
ENABLE_GOOGLE_DRIVE_INTEGRATION,
|
ENABLE_GOOGLE_DRIVE_INTEGRATION,
|
||||||
|
|
@ -841,6 +842,9 @@ app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT = FILE_IMAGE_COMPRESSION_HEIGHT
|
||||||
app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT
|
app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT
|
||||||
app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = BYPASS_EMBEDDING_AND_RETRIEVAL
|
app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = BYPASS_EMBEDDING_AND_RETRIEVAL
|
||||||
app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
|
app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
|
||||||
|
app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = (
|
||||||
|
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
||||||
|
)
|
||||||
app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION
|
app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION
|
||||||
|
|
||||||
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
||||||
|
|
|
||||||
|
|
@ -148,6 +148,44 @@ def get_doc(collection_name: str, user: UserModel = None):
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
|
||||||
|
def get_enriched_texts(collection_result: GetResult) -> list[str]:
    """Build BM25 input texts by appending selected metadata to each chunk.

    For every text in ``collection_result.documents[0]`` the metadata entry at
    the same index of ``collection_result.metadatas[0]`` is consulted, and the
    following fields (when present and truthy) are appended after the text,
    separated by single spaces:

    * ``name``     -> ``"Filename: <name> <tokens> <tokens>"`` where *tokens*
      is the name with ``_``, ``-`` and ``.`` replaced by spaces
    * ``title``    -> ``"Title: <title>"``
    * ``headings`` -> ``"Section: a > b > c"`` (only when it is a list)
    * ``source``   -> ``"Source: <source>"``
    * ``snippet``  -> ``"Snippet: <snippet>"``

    Args:
        collection_result: Object exposing ``documents`` and ``metadatas``;
            only the first inner list of each is read.

    Returns:
        One enriched string per input text, in the original order.
    """
    # One-pass replacement of filename separators with spaces; built once,
    # outside the loop, instead of three chained ``str.replace`` calls.
    separators_to_spaces = str.maketrans("_-.", "   ")

    documents = collection_result.documents[0]
    metadatas = collection_result.metadatas[0]

    enriched_texts = []
    for idx, text in enumerate(documents):
        # A metadata entry may be None (presumably when a chunk was stored
        # without metadata); treat it as empty instead of raising.
        metadata = metadatas[idx] or {}
        metadata_parts = [text]

        # Add filename; the tokenized form is repeated twice for extra weight
        # in BM25 scoring.
        name = metadata.get("name")
        if name:
            # Coerce to str so a non-string scalar name cannot break
            # tokenization.
            filename = str(name)
            filename_tokens = filename.translate(separators_to_spaces)
            metadata_parts.append(
                f"Filename: {filename} {filename_tokens} {filename_tokens}"
            )

        # Add title if available
        title = metadata.get("title")
        if title:
            metadata_parts.append(f"Title: {title}")

        # Add document section headings if available (from markdown splitter)
        headings = metadata.get("headings")
        if headings and isinstance(headings, list):
            heading_path = " > ".join(str(h) for h in headings)
            metadata_parts.append(f"Section: {heading_path}")

        # Add source URL/path if available
        source = metadata.get("source")
        if source:
            metadata_parts.append(f"Source: {source}")

        # Add snippet for web search results
        snippet = metadata.get("snippet")
        if snippet:
            metadata_parts.append(f"Snippet: {snippet}")

        enriched_texts.append(" ".join(metadata_parts))

    return enriched_texts
|
||||||
|
|
||||||
|
|
||||||
def query_doc_with_hybrid_search(
|
def query_doc_with_hybrid_search(
|
||||||
collection_name: str,
|
collection_name: str,
|
||||||
collection_result: GetResult,
|
collection_result: GetResult,
|
||||||
|
|
@ -158,6 +196,7 @@ def query_doc_with_hybrid_search(
|
||||||
k_reranker: int,
|
k_reranker: int,
|
||||||
r: float,
|
r: float,
|
||||||
hybrid_bm25_weight: float,
|
hybrid_bm25_weight: float,
|
||||||
|
enable_enriched_texts: bool = False,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
try:
|
try:
|
||||||
# First check if collection_result has the required attributes
|
# First check if collection_result has the required attributes
|
||||||
|
|
@ -180,8 +219,14 @@ def query_doc_with_hybrid_search(
|
||||||
|
|
||||||
log.debug(f"query_doc_with_hybrid_search:doc {collection_name}")
|
log.debug(f"query_doc_with_hybrid_search:doc {collection_name}")
|
||||||
|
|
||||||
|
bm25_texts = (
|
||||||
|
get_enriched_texts(collection_result)
|
||||||
|
if enable_enriched_texts
|
||||||
|
else collection_result.documents[0]
|
||||||
|
)
|
||||||
|
|
||||||
bm25_retriever = BM25Retriever.from_texts(
|
bm25_retriever = BM25Retriever.from_texts(
|
||||||
texts=collection_result.documents[0],
|
texts=bm25_texts,
|
||||||
metadatas=collection_result.metadatas[0],
|
metadatas=collection_result.metadatas[0],
|
||||||
)
|
)
|
||||||
bm25_retriever.k = k
|
bm25_retriever.k = k
|
||||||
|
|
@ -397,6 +442,7 @@ def query_collection_with_hybrid_search(
|
||||||
k_reranker: int,
|
k_reranker: int,
|
||||||
r: float,
|
r: float,
|
||||||
hybrid_bm25_weight: float,
|
hybrid_bm25_weight: float,
|
||||||
|
enable_enriched_texts: bool = False,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
results = []
|
results = []
|
||||||
error = False
|
error = False
|
||||||
|
|
@ -431,6 +477,7 @@ def query_collection_with_hybrid_search(
|
||||||
k_reranker=k_reranker,
|
k_reranker=k_reranker,
|
||||||
r=r,
|
r=r,
|
||||||
hybrid_bm25_weight=hybrid_bm25_weight,
|
hybrid_bm25_weight=hybrid_bm25_weight,
|
||||||
|
enable_enriched_texts=enable_enriched_texts,
|
||||||
)
|
)
|
||||||
return result, None
|
return result, None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -762,6 +809,7 @@ def get_sources_from_items(
|
||||||
k_reranker=k_reranker,
|
k_reranker=k_reranker,
|
||||||
r=r,
|
r=r,
|
||||||
hybrid_bm25_weight=hybrid_bm25_weight,
|
hybrid_bm25_weight=hybrid_bm25_weight,
|
||||||
|
enable_enriched_texts=request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.debug(
|
log.debug(
|
||||||
|
|
|
||||||
|
|
@ -431,6 +431,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
||||||
"RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
|
"RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
|
||||||
# Hybrid search settings
|
# Hybrid search settings
|
||||||
"ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH,
|
"ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH,
|
||||||
|
"ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
|
||||||
"TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER,
|
"TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER,
|
||||||
"RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD,
|
"RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD,
|
||||||
"HYBRID_BM25_WEIGHT": request.app.state.config.HYBRID_BM25_WEIGHT,
|
"HYBRID_BM25_WEIGHT": request.app.state.config.HYBRID_BM25_WEIGHT,
|
||||||
|
|
@ -615,6 +616,7 @@ class ConfigForm(BaseModel):
|
||||||
|
|
||||||
# Hybrid search settings
|
# Hybrid search settings
|
||||||
ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None
|
ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None
|
||||||
|
ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS: Optional[bool] = None
|
||||||
TOP_K_RERANKER: Optional[int] = None
|
TOP_K_RERANKER: Optional[int] = None
|
||||||
RELEVANCE_THRESHOLD: Optional[float] = None
|
RELEVANCE_THRESHOLD: Optional[float] = None
|
||||||
HYBRID_BM25_WEIGHT: Optional[float] = None
|
HYBRID_BM25_WEIGHT: Optional[float] = None
|
||||||
|
|
@ -721,6 +723,11 @@ async def update_rag_config(
|
||||||
if form_data.ENABLE_RAG_HYBRID_SEARCH is not None
|
if form_data.ENABLE_RAG_HYBRID_SEARCH is not None
|
||||||
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH
|
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH
|
||||||
)
|
)
|
||||||
|
request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = (
|
||||||
|
form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
||||||
|
if form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS is not None
|
||||||
|
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
||||||
|
)
|
||||||
|
|
||||||
request.app.state.config.TOP_K_RERANKER = (
|
request.app.state.config.TOP_K_RERANKER = (
|
||||||
form_data.TOP_K_RERANKER
|
form_data.TOP_K_RERANKER
|
||||||
|
|
@ -2324,6 +2331,7 @@ class QueryCollectionsForm(BaseModel):
|
||||||
r: Optional[float] = None
|
r: Optional[float] = None
|
||||||
hybrid: Optional[bool] = None
|
hybrid: Optional[bool] = None
|
||||||
hybrid_bm25_weight: Optional[float] = None
|
hybrid_bm25_weight: Optional[float] = None
|
||||||
|
enable_enriched_texts: Optional[bool] = None
|
||||||
|
|
||||||
|
|
||||||
@router.post("/query/collection")
|
@router.post("/query/collection")
|
||||||
|
|
@ -2364,6 +2372,11 @@ def query_collection_handler(
|
||||||
if form_data.hybrid_bm25_weight
|
if form_data.hybrid_bm25_weight
|
||||||
else request.app.state.config.HYBRID_BM25_WEIGHT
|
else request.app.state.config.HYBRID_BM25_WEIGHT
|
||||||
),
|
),
|
||||||
|
enable_enriched_texts=(
|
||||||
|
form_data.enable_enriched_texts
|
||||||
|
if form_data.enable_enriched_texts is not None
|
||||||
|
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
||||||
|
),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return query_collection(
|
return query_collection(
|
||||||
|
|
|
||||||
|
|
@ -1152,6 +1152,21 @@
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true}
|
{#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true}
|
||||||
|
<div class="mb-2.5 flex w-full justify-between">
|
||||||
|
<div class="self-center text-xs font-medium">
|
||||||
|
{$i18n.t('Enrich Hybrid Search Text')}
|
||||||
|
</div>
|
||||||
|
<div class="flex items-center relative">
|
||||||
|
<Tooltip
|
||||||
|
content={$i18n.t(
|
||||||
|
'Adds filenames, titles, sections, and snippets into the BM25 text to improve lexical recall.'
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
<Switch bind:state={RAGConfig.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS} />
|
||||||
|
</Tooltip>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class=" mb-2.5 flex flex-col w-full justify-between">
|
<div class=" mb-2.5 flex flex-col w-full justify-between">
|
||||||
<div class="flex w-full justify-between">
|
<div class="flex w-full justify-between">
|
||||||
<div class=" self-center text-xs font-medium">
|
<div class=" self-center text-xs font-medium">
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue