From 033d07ee2332a40bf226e6bb655907854c526b72 Mon Sep 17 00:00:00 2001 From: Timothy Jaeryang Baek Date: Fri, 11 Jul 2025 12:29:17 +0400 Subject: [PATCH] refac: file handling --- backend/open_webui/retrieval/utils.py | 148 ++++++++++++-------------- 1 file changed, 69 insertions(+), 79 deletions(-) diff --git a/backend/open_webui/retrieval/utils.py b/backend/open_webui/retrieval/utils.py index 9d6a2a79b2..63f513dcf2 100644 --- a/backend/open_webui/retrieval/utils.py +++ b/backend/open_webui/retrieval/utils.py @@ -468,8 +468,10 @@ def get_sources_from_items( for item in items: query_result = None + collection_names = [] + if item.get("type") == "text": - # Text File + # Raw Text # Used during temporary chat file uploads query_result = { "documents": [[item.get("content")]], @@ -487,24 +489,57 @@ def get_sources_from_items( "metadatas": [[{"file_id": note.id, "name": note.title}]], } - elif item.get("docs"): - # BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL - query_result = { - "documents": [[doc.get("content") for doc in item.get("docs")]], - "metadatas": [[doc.get("metadata") for doc in item.get("docs")]], - } + elif item.get("type") == "file": + if ( + item.get("context") == "full" + or request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL + ): + if item.get("file").get("data", {}): + # Manual Full Mode Toggle + # Used from chat file modal, we can assume that the file content will be available from item.get("file").get("data", {}).get("content") + query_result = { + "documents": [ + [item.get("file").get("data", {}).get("content", "")] + ], + "metadatas": [ + [ + { + "file_id": item.get("id"), + "name": item.get("name"), + **item.get("file") + .get("data", {}) + .get("metadata", {}), + } + ] + ], + } + elif item.get("id"): + file_object = Files.get_file_by_id(item.get("id")) + if file_object: + query_result = { + "documents": [[file_object.data.get("content", "")]], + "metadatas": [ + [ + { + "file_id": item.get("id"), + "name": file_object.filename, + "source": file_object.filename, + } + ] + ], + } + else: + # Fallback to collection names + if item.get("legacy"): + collection_names.append(f"{item['id']}") + else: + collection_names.append(f"file-{item['id']}") - elif item.get("context") == "full": - if item.get("type") == "file": - # Manual Full Mode Toggle - # Used from chat file modal, we can assume that the file content will be available from item.get("file").get("data", {}).get("content") - query_result = { - "documents": [[item.get("file").get("data", {}).get("content")]], - "metadatas": [ - [{"file_id": item.get("id"), "name": item.get("name")}] - ], - } - elif item.get("type") == "collection": + elif item.get("type") == "collection": + if ( + item.get("context") == "full" + or request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL + ): # Manual Full Mode Toggle for Collection knowledge_base = Knowledges.get_knowledge_by_id(item.get("id")) @@ -534,71 +569,26 @@ def get_sources_from_items( "documents": [documents], "metadatas": [metadatas], } - elif ( - item.get("type") != "web_search" - and request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL - ): - # BYPASS_EMBEDDING_AND_RETRIEVAL - if item.get("type") == "collection": - file_ids = item.get("data", {}).get("file_ids", []) - - documents = [] - metadatas = [] - for file_id in file_ids: - file_object = Files.get_file_by_id(file_id) - - if file_object: - documents.append(file_object.data.get("content", "")) - metadatas.append( - { - "file_id": file_id, - "name": file_object.filename, - "source": file_object.filename, - } - ) - - query_result = { - "documents": [documents], - "metadatas": [metadatas], - } - - elif item.get("id"): - file_object = Files.get_file_by_id(item.get("id")) - if file_object: - query_result = { - "documents": [[file_object.data.get("content", "")]], - "metadatas": [ - [ - { - "file_id": item.get("id"), - "name": file_object.filename, - "source": file_object.filename, - } - ] - ], - } - elif item.get("file").get("data"): - query_result = { - "documents": [[item.get("file").get("data", {}).get("content")]], - "metadatas": [ - [item.get("file").get("data", {}).get("metadata", {})] - ], - } - else: - collection_names = [] - if item.get("type") == "collection": + else: + # Fallback to collection names if item.get("legacy"): collection_names = item.get("collection_names", []) else: collection_names.append(item["id"]) - elif item.get("collection_name"): - collection_names.append(item["collection_name"]) - elif item.get("id"): - if item.get("legacy"): - collection_names.append(f"{item['id']}") - else: - collection_names.append(f"file-{item['id']}") + elif item.get("docs"): + # BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL + query_result = { + "documents": [[doc.get("content") for doc in item.get("docs")]], + "metadatas": [[doc.get("metadata") for doc in item.get("docs")]], + } + elif item.get("collection_name"): + # Direct Collection Name + collection_names.append(item["collection_name"]) + + # If query_result is None + # Fallback to collection names and vector search the collections + if query_result is None and collection_names: collection_names = set(collection_names).difference(extracted_collections) if not collection_names: log.debug(f"skipping {item} as it has already been extracted") @@ -609,12 +599,12 @@ def get_sources_from_items( query_result = get_all_items_from_collections(collection_names) except Exception as e: log.exception(e) - else: try: query_result = None if item.get("type") == "text": # Not sure when this is used, but it seems to be a fallback + # TODO: remove? query_result = { "documents": [ [item.get("file").get("data", {}).get("content")]