mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-17 06:45:24 +00:00
refac: file handling
This commit is contained in:
parent
0db8bedf45
commit
033d07ee23
1 changed file with 69 additions and 79 deletions
|
|
@@ -468,8 +468,10 @@ def get_sources_from_items(
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
query_result = None
|
query_result = None
|
||||||
|
collection_names = []
|
||||||
|
|
||||||
if item.get("type") == "text":
|
if item.get("type") == "text":
|
||||||
# Text File
|
# Raw Text
|
||||||
# Used during temporary chat file uploads
|
# Used during temporary chat file uploads
|
||||||
query_result = {
|
query_result = {
|
||||||
"documents": [[item.get("content")]],
|
"documents": [[item.get("content")]],
|
||||||
|
|
@@ -487,24 +489,57 @@ def get_sources_from_items(
|
||||||
"metadatas": [[{"file_id": note.id, "name": note.title}]],
|
"metadatas": [[{"file_id": note.id, "name": note.title}]],
|
||||||
}
|
}
|
||||||
|
|
||||||
elif item.get("docs"):
|
elif item.get("type") == "file":
|
||||||
# BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
|
if (
|
||||||
query_result = {
|
item.get("context") == "full"
|
||||||
"documents": [[doc.get("content") for doc in item.get("docs")]],
|
or request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
|
||||||
"metadatas": [[doc.get("metadata") for doc in item.get("docs")]],
|
):
|
||||||
}
|
if item.get("file").get("data", {}):
|
||||||
|
# Manual Full Mode Toggle
|
||||||
|
# Used from chat file modal, we can assume that the file content will be available from item.get("file").get("data", {}).get("content")
|
||||||
|
query_result = {
|
||||||
|
"documents": [
|
||||||
|
[item.get("file").get("data", {}).get("content", "")]
|
||||||
|
],
|
||||||
|
"metadatas": [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"file_id": item.get("id"),
|
||||||
|
"name": item.get("name"),
|
||||||
|
**item.get("file")
|
||||||
|
.get("data", {})
|
||||||
|
.get("metadata", {}),
|
||||||
|
}
|
||||||
|
]
|
||||||
|
],
|
||||||
|
}
|
||||||
|
elif item.get("id"):
|
||||||
|
file_object = Files.get_file_by_id(item.get("id"))
|
||||||
|
if file_object:
|
||||||
|
query_result = {
|
||||||
|
"documents": [[file_object.data.get("content", "")]],
|
||||||
|
"metadatas": [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"file_id": item.get("id"),
|
||||||
|
"name": file_object.filename,
|
||||||
|
"source": file_object.filename,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
],
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# Fallback to collection names
|
||||||
|
if item.get("legacy"):
|
||||||
|
collection_names.append(f"{item['id']}")
|
||||||
|
else:
|
||||||
|
collection_names.append(f"file-{item['id']}")
|
||||||
|
|
||||||
elif item.get("context") == "full":
|
elif item.get("type") == "collection":
|
||||||
if item.get("type") == "file":
|
if (
|
||||||
# Manual Full Mode Toggle
|
item.get("context") == "full"
|
||||||
# Used from chat file modal, we can assume that the file content will be available from item.get("file").get("data", {}).get("content")
|
or request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
|
||||||
query_result = {
|
):
|
||||||
"documents": [[item.get("file").get("data", {}).get("content")]],
|
|
||||||
"metadatas": [
|
|
||||||
[{"file_id": item.get("id"), "name": item.get("name")}]
|
|
||||||
],
|
|
||||||
}
|
|
||||||
elif item.get("type") == "collection":
|
|
||||||
# Manual Full Mode Toggle for Collection
|
# Manual Full Mode Toggle for Collection
|
||||||
knowledge_base = Knowledges.get_knowledge_by_id(item.get("id"))
|
knowledge_base = Knowledges.get_knowledge_by_id(item.get("id"))
|
||||||
|
|
||||||
|
|
@@ -534,71 +569,26 @@ def get_sources_from_items(
|
||||||
"documents": [documents],
|
"documents": [documents],
|
||||||
"metadatas": [metadatas],
|
"metadatas": [metadatas],
|
||||||
}
|
}
|
||||||
elif (
|
else:
|
||||||
item.get("type") != "web_search"
|
# Fallback to collection names
|
||||||
and request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
|
|
||||||
):
|
|
||||||
# BYPASS_EMBEDDING_AND_RETRIEVAL
|
|
||||||
if item.get("type") == "collection":
|
|
||||||
file_ids = item.get("data", {}).get("file_ids", [])
|
|
||||||
|
|
||||||
documents = []
|
|
||||||
metadatas = []
|
|
||||||
for file_id in file_ids:
|
|
||||||
file_object = Files.get_file_by_id(file_id)
|
|
||||||
|
|
||||||
if file_object:
|
|
||||||
documents.append(file_object.data.get("content", ""))
|
|
||||||
metadatas.append(
|
|
||||||
{
|
|
||||||
"file_id": file_id,
|
|
||||||
"name": file_object.filename,
|
|
||||||
"source": file_object.filename,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
query_result = {
|
|
||||||
"documents": [documents],
|
|
||||||
"metadatas": [metadatas],
|
|
||||||
}
|
|
||||||
|
|
||||||
elif item.get("id"):
|
|
||||||
file_object = Files.get_file_by_id(item.get("id"))
|
|
||||||
if file_object:
|
|
||||||
query_result = {
|
|
||||||
"documents": [[file_object.data.get("content", "")]],
|
|
||||||
"metadatas": [
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"file_id": item.get("id"),
|
|
||||||
"name": file_object.filename,
|
|
||||||
"source": file_object.filename,
|
|
||||||
}
|
|
||||||
]
|
|
||||||
],
|
|
||||||
}
|
|
||||||
elif item.get("file").get("data"):
|
|
||||||
query_result = {
|
|
||||||
"documents": [[item.get("file").get("data", {}).get("content")]],
|
|
||||||
"metadatas": [
|
|
||||||
[item.get("file").get("data", {}).get("metadata", {})]
|
|
||||||
],
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
collection_names = []
|
|
||||||
if item.get("type") == "collection":
|
|
||||||
if item.get("legacy"):
|
if item.get("legacy"):
|
||||||
collection_names = item.get("collection_names", [])
|
collection_names = item.get("collection_names", [])
|
||||||
else:
|
else:
|
||||||
collection_names.append(item["id"])
|
collection_names.append(item["id"])
|
||||||
elif item.get("collection_name"):
|
|
||||||
collection_names.append(item["collection_name"])
|
|
||||||
elif item.get("id"):
|
|
||||||
if item.get("legacy"):
|
|
||||||
collection_names.append(f"{item['id']}")
|
|
||||||
else:
|
|
||||||
collection_names.append(f"file-{item['id']}")
|
|
||||||
|
|
||||||
|
elif item.get("docs"):
|
||||||
|
# BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
|
||||||
|
query_result = {
|
||||||
|
"documents": [[doc.get("content") for doc in item.get("docs")]],
|
||||||
|
"metadatas": [[doc.get("metadata") for doc in item.get("docs")]],
|
||||||
|
}
|
||||||
|
elif item.get("collection_name"):
|
||||||
|
# Direct Collection Name
|
||||||
|
collection_names.append(item["collection_name"])
|
||||||
|
|
||||||
|
# If query_result is None
|
||||||
|
# Fallback to collection names and vector search the collections
|
||||||
|
if query_result is None and collection_names:
|
||||||
collection_names = set(collection_names).difference(extracted_collections)
|
collection_names = set(collection_names).difference(extracted_collections)
|
||||||
if not collection_names:
|
if not collection_names:
|
||||||
log.debug(f"skipping {item} as it has already been extracted")
|
log.debug(f"skipping {item} as it has already been extracted")
|
||||||
|
|
@@ -609,12 +599,12 @@ def get_sources_from_items(
|
||||||
query_result = get_all_items_from_collections(collection_names)
|
query_result = get_all_items_from_collections(collection_names)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception(e)
|
log.exception(e)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
query_result = None
|
query_result = None
|
||||||
if item.get("type") == "text":
|
if item.get("type") == "text":
|
||||||
# Not sure when this is used, but it seems to be a fallback
|
# Not sure when this is used, but it seems to be a fallback
|
||||||
|
# TODO: remove?
|
||||||
query_result = {
|
query_result = {
|
||||||
"documents": [
|
"documents": [
|
||||||
[item.get("file").get("data", {}).get("content")]
|
[item.get("file").get("data", {}).get("content")]
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue