enh/fix: filter content metadata
parent 1c418a7f83
commit 118549caf3
9 changed files with 38 additions and 23 deletions
@@ -11,7 +11,7 @@ from open_webui.retrieval.vector.main import (
     SearchResult,
     GetResult,
 )
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata

 from open_webui.config import (
     CHROMA_DATA_PATH,
@@ -146,7 +146,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [stringify_metadata(item["metadata"]) for item in items]
+        metadatas = [process_metadata(item["metadata"]) for item in items]

         for batch in create_batches(
             api=self.client,
@@ -166,7 +166,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [stringify_metadata(item["metadata"]) for item in items]
+        metadatas = [process_metadata(item["metadata"]) for item in items]

         collection.upsert(
             ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas

@@ -3,7 +3,7 @@ from typing import Optional
 import ssl
 from elasticsearch.helpers import bulk, scan

-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -245,7 +245,7 @@ class ElasticsearchClient(VectorDBBase):
                     "collection": collection_name,
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
             }
             for item in batch
@@ -266,7 +266,7 @@ class ElasticsearchClient(VectorDBBase):
                     "collection": collection_name,
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
                 "doc_as_upsert": True,
             }

@@ -6,7 +6,7 @@ import json
 import logging
 from typing import Optional

-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -289,7 +289,7 @@ class MilvusClient(VectorDBBase):
                 "id": item["id"],
                 "vector": item["vector"],
                 "data": {"text": item["text"]},
-                "metadata": stringify_metadata(item["metadata"]),
+                "metadata": process_metadata(item["metadata"]),
             }
             for item in items
         ],
@@ -325,7 +325,7 @@ class MilvusClient(VectorDBBase):
                 "id": item["id"],
                 "vector": item["vector"],
                 "data": {"text": item["text"]},
-                "metadata": stringify_metadata(item["metadata"]),
+                "metadata": process_metadata(item["metadata"]),
             }
             for item in items
         ],

@@ -2,7 +2,7 @@ from opensearchpy import OpenSearch
 from opensearchpy.helpers import bulk
 from typing import Optional

-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -201,7 +201,7 @@ class OpenSearchClient(VectorDBBase):
                 "_source": {
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
             }
             for item in batch
@@ -223,7 +223,7 @@ class OpenSearchClient(VectorDBBase):
                 "doc": {
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
                 "doc_as_upsert": True,
             }

@@ -27,7 +27,7 @@ from sqlalchemy.ext.mutable import MutableDict
 from sqlalchemy.exc import NoSuchTableError


-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -265,7 +265,7 @@ class PgvectorClient(VectorDBBase):
                     vector=vector,
                     collection_name=collection_name,
                     text=item["text"],
-                    vmetadata=stringify_metadata(item["metadata"]),
+                    vmetadata=process_metadata(item["metadata"]),
                 )
                 new_items.append(new_chunk)
             self.session.bulk_save_objects(new_items)
@@ -323,7 +323,7 @@ class PgvectorClient(VectorDBBase):
                 if existing:
                     existing.vector = vector
                     existing.text = item["text"]
-                    existing.vmetadata = stringify_metadata(item["metadata"])
+                    existing.vmetadata = process_metadata(item["metadata"])
                     existing.collection_name = (
                         collection_name  # Update collection_name if necessary
                     )
@@ -333,7 +333,7 @@ class PgvectorClient(VectorDBBase):
                         vector=vector,
                         collection_name=collection_name,
                         text=item["text"],
-                        vmetadata=stringify_metadata(item["metadata"]),
+                        vmetadata=process_metadata(item["metadata"]),
                     )
                     self.session.add(new_chunk)
             self.session.commit()

@@ -32,7 +32,7 @@ from open_webui.config import (
     PINECONE_CLOUD,
 )
 from open_webui.env import SRC_LOG_LEVELS
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata


 NO_LIMIT = 10000  # Reasonable limit to avoid overwhelming the system
@@ -185,7 +185,7 @@ class PineconeClient(VectorDBBase):
             point = {
                 "id": item["id"],
                 "values": item["vector"],
-                "metadata": stringify_metadata(metadata),
+                "metadata": process_metadata(metadata),
             }
             points.append(point)
         return points

@@ -1,4 +1,4 @@
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -185,7 +185,7 @@ class S3VectorClient(VectorDBBase):
             metadata["text"] = item["text"]

             # Convert metadata to string format for consistency
-            metadata = stringify_metadata(metadata)
+            metadata = process_metadata(metadata)

             # Filter metadata to comply with S3 Vector API limit of 10 keys
             metadata = self._filter_metadata(metadata, item["id"])
@@ -256,7 +256,7 @@ class S3VectorClient(VectorDBBase):
             metadata["text"] = item["text"]

             # Convert metadata to string format for consistency
-            metadata = stringify_metadata(metadata)
+            metadata = process_metadata(metadata)

             # Filter metadata to comply with S3 Vector API limit of 10 keys
             metadata = self._filter_metadata(metadata, item["id"])

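The body of _filter_metadata is not part of this diff; all the commit shows is the comment that it enforces the S3 Vector API limit of 10 metadata keys per item. Purely as an illustration of that constraint, a hypothetical key-capping helper could look like the following (the name, the cutoff strategy, and the logging are assumptions, not the project's implementation):

import logging

log = logging.getLogger(__name__)

MAX_S3_VECTOR_METADATA_KEYS = 10  # per the comment in the diff above


def cap_metadata_keys(metadata: dict, item_id: str) -> dict:
    # Hypothetical stand-in for _filter_metadata: keep at most 10 keys.
    if len(metadata) <= MAX_S3_VECTOR_METADATA_KEYS:
        return metadata
    kept = dict(list(metadata.items())[:MAX_S3_VECTOR_METADATA_KEYS])
    log.warning(
        "Dropped %d metadata key(s) for item %s to satisfy the S3 Vector key limit",
        len(metadata) - len(kept),
        item_id,
    )
    return kept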
@@ -1,10 +1,24 @@
 from datetime import datetime

+KEYS_TO_EXCLUDE = ["content", "pages", "tables", "paragraphs", "sections", "figures"]
+

-def stringify_metadata(
+def filter_metadata(metadata: dict[str, any]) -> dict[str, any]:
+    metadata = {
+        key: value for key, value in metadata.items() if key not in KEYS_TO_EXCLUDE
+    }
+    return metadata
+
+
+def process_metadata(
     metadata: dict[str, any],
 ) -> dict[str, any]:
     for key, value in metadata.items():
+        # Remove large fields
+        if key in KEYS_TO_EXCLUDE:
+            del metadata[key]
+
+        # Convert non-serializable fields to strings
         if (
             isinstance(value, datetime)
             or isinstance(value, list)

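For reference, a minimal, self-contained sketch of how the two helpers fit together. The tail of process_metadata is not visible in the hunk above, so the string-conversion branch and the return statement are assumptions; the sketch also iterates over a copy of the items so keys can be deleted safely inside the loop.

# Sketch only; assumes the unshown tail of process_metadata stringifies
# datetime/list values before returning the dict.
from datetime import datetime
from typing import Any

KEYS_TO_EXCLUDE = ["content", "pages", "tables", "paragraphs", "sections", "figures"]


def filter_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
    # Drop large loader artifacts (full page content, tables, ...) entirely.
    return {key: value for key, value in metadata.items() if key not in KEYS_TO_EXCLUDE}


def process_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
    # Iterate over a copy so keys can be removed while looping.
    for key, value in list(metadata.items()):
        if key in KEYS_TO_EXCLUDE:
            del metadata[key]
            continue
        # Assumed continuation: make non-serializable values storable as strings.
        if isinstance(value, (datetime, list)):
            metadata[key] = str(value)
    return metadata


if __name__ == "__main__":
    meta = {
        "name": "report.pdf",
        "created": datetime(2025, 1, 1),
        "content": "full extracted text ...",
        "pages": [1, 2, 3],
    }
    print(filter_metadata(dict(meta)))   # keeps name/created, drops content/pages
    print(process_metadata(dict(meta)))  # drops content/pages, stringifies created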
@@ -78,6 +78,7 @@ from open_webui.retrieval.utils import (
     query_doc,
     query_doc_with_hybrid_search,
 )
+from open_webui.retrieval.vector.utils import filter_metadata
 from open_webui.utils.misc import (
     calculate_sha256_string,
 )
@@ -1535,7 +1536,7 @@ def process_file(
                 Document(
                     page_content=doc.page_content,
                     metadata={
-                        **doc.metadata,
+                        **filter_metadata(doc.metadata),
                         "name": file.filename,
                         "created_by": file.user_id,
                         "file_id": file.id,

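In this commit, filter_metadata is applied at the ingestion call site above, so oversized loader fields are pruned before the Document ever reaches a vector DB client; the clients then run process_metadata on whatever metadata remains. A small illustration of the effect, using hypothetical field names a document loader might emit:

from open_webui.retrieval.vector.utils import filter_metadata

# Hypothetical loader output; only the shape matters here.
doc_metadata = {
    "source": "report.pdf",
    "page": 3,
    "content": "entire page text duplicated by the loader ...",
    "tables": [["a", "b"], ["c", "d"]],
}

filter_metadata(doc_metadata)
# -> {"source": "report.pdf", "page": 3}
#    ("content" and "tables" are listed in KEYS_TO_EXCLUDE and are dropped)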