Merge pull request #16523 from expruc/perf/hybrid_search_bm_25

perf: skip collection retrieval and BM25 calculation when the BM25 weight is 0
This commit is contained in:
Tim Jaeryang Baek 2025-08-12 23:55:26 +04:00 committed by GitHub
commit 1e67035bd3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -124,12 +124,14 @@ def query_doc_with_hybrid_search(
hybrid_bm25_weight: float, hybrid_bm25_weight: float,
) -> dict: ) -> dict:
try: try:
log.debug(f"query_doc_with_hybrid_search:doc {collection_name}") # BM_25 required only if weight is greater than 0
bm25_retriever = BM25Retriever.from_texts( if hybrid_bm25_weight > 0:
texts=collection_result.documents[0], log.debug(f"query_doc_with_hybrid_search:doc {collection_name}")
metadatas=collection_result.metadatas[0], bm25_retriever = BM25Retriever.from_texts(
) texts=collection_result.documents[0],
bm25_retriever.k = k metadatas=collection_result.metadatas[0],
)
bm25_retriever.k = k
vector_search_retriever = VectorSearchRetriever( vector_search_retriever = VectorSearchRetriever(
collection_name=collection_name, collection_name=collection_name,
@@ -337,18 +339,22 @@ def query_collection_with_hybrid_search(
# Fetch collection data once per collection sequentially # Fetch collection data once per collection sequentially
# Avoid fetching the same data multiple times later # Avoid fetching the same data multiple times later
collection_results = {} collection_results = {}
for collection_name in collection_names: # Only retrieve entire collection if bm_25 calculation is required
try: if hybrid_bm25_weight > 0:
log.debug( for collection_name in collection_names:
f"query_collection_with_hybrid_search:VECTOR_DB_CLIENT.get:collection {collection_name}" try:
) log.debug(
collection_results[collection_name] = VECTOR_DB_CLIENT.get( f"query_collection_with_hybrid_search:VECTOR_DB_CLIENT.get:collection {collection_name}"
collection_name=collection_name )
) collection_results[collection_name] = VECTOR_DB_CLIENT.get(
except Exception as e: collection_name=collection_name
log.exception(f"Failed to fetch collection {collection_name}: {e}") )
collection_results[collection_name] = None except Exception as e:
log.exception(f"Failed to fetch collection {collection_name}: {e}")
collection_results[collection_name] = None
else:
for collection_name in collection_names:
collection_results[collection_name] = []
log.info( log.info(
f"Starting hybrid search for {len(queries)} queries in {len(collection_names)} collections..." f"Starting hybrid search for {len(queries)} queries in {len(collection_names)} collections..."
) )