From 6a17ba5b7a74b788e339034b29254ca539ec865e Mon Sep 17 00:00:00 2001
From: Timothy Jaeryang Baek
Date: Thu, 31 Jul 2025 17:45:06 +0400
Subject: [PATCH] refac: metadata handling in vectordb

---
Review notes (v2; this section is not part of the commit message):
- stringify_metadata now builds and returns a new dict instead of rewriting
  the caller's item["metadata"] in place; every call site in this patch
  already uses the return value.
- The annotation used the builtin function `any`; it is now typing.Any.
- NOTE(review): "embedding_config" used to be stored via json.dumps() and is
  now a dict that stringify_metadata turns into str(dict) - confirm that
  nothing parses the stored value as JSON.
- NOTE(review): the conversion loop is removed from save_docs_to_vector_db,
  so any VECTOR_DB_CLIENT backend not updated in this patch would receive
  unconverted metadata - confirm the remaining backends.
- The blob id on the utils.py index line predates the v2 edit.

 .../open_webui/retrieval/vector/dbs/chroma.py |  6 ++++--
 .../open_webui/retrieval/vector/dbs/milvus.py |  6 ++++--
 .../retrieval/vector/dbs/pgvector.py          |  8 ++++---
 backend/open_webui/retrieval/vector/utils.py  | 17 +++++++++++++++
 backend/open_webui/routers/retrieval.py       | 21 ++++---------------
 5 files changed, 34 insertions(+), 24 deletions(-)
 create mode 100644 backend/open_webui/retrieval/vector/utils.py

diff --git a/backend/open_webui/retrieval/vector/dbs/chroma.py b/backend/open_webui/retrieval/vector/dbs/chroma.py
index f9adc9c95f..9675e141e7 100755
--- a/backend/open_webui/retrieval/vector/dbs/chroma.py
+++ b/backend/open_webui/retrieval/vector/dbs/chroma.py
@@ -11,6 +11,8 @@ from open_webui.retrieval.vector.main import (
     SearchResult,
     GetResult,
 )
+from open_webui.retrieval.vector.utils import stringify_metadata
+
 from open_webui.config import (
     CHROMA_DATA_PATH,
     CHROMA_HTTP_HOST,
@@ -144,7 +146,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [item["metadata"] for item in items]
+        metadatas = [stringify_metadata(item["metadata"]) for item in items]
 
         for batch in create_batches(
             api=self.client,
@@ -164,7 +166,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [item["metadata"] for item in items]
+        metadatas = [stringify_metadata(item["metadata"]) for item in items]
 
         collection.upsert(
             ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas
diff --git a/backend/open_webui/retrieval/vector/dbs/milvus.py b/backend/open_webui/retrieval/vector/dbs/milvus.py
index a4bad13d00..6e07c28016 100644
--- a/backend/open_webui/retrieval/vector/dbs/milvus.py
+++ b/backend/open_webui/retrieval/vector/dbs/milvus.py
@@ -3,6 +3,8 @@ from pymilvus import FieldSchema, DataType
 import json
 import logging
 from typing import Optional
+
+from open_webui.retrieval.vector.utils import stringify_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -311,7 +313,7 @@ class MilvusClient(VectorDBBase):
                     "id": item["id"],
                     "vector": item["vector"],
                     "data": {"text": item["text"]},
-                    "metadata": item["metadata"],
+                    "metadata": stringify_metadata(item["metadata"]),
                 }
                 for item in items
             ],
@@ -347,7 +349,7 @@ class MilvusClient(VectorDBBase):
                     "id": item["id"],
                     "vector": item["vector"],
                     "data": {"text": item["text"]},
-                    "metadata": item["metadata"],
+                    "metadata": stringify_metadata(item["metadata"]),
                 }
                 for item in items
             ],
diff --git a/backend/open_webui/retrieval/vector/dbs/pgvector.py b/backend/open_webui/retrieval/vector/dbs/pgvector.py
index 64f12aa6d0..465ab45dde 100644
--- a/backend/open_webui/retrieval/vector/dbs/pgvector.py
+++ b/backend/open_webui/retrieval/vector/dbs/pgvector.py
@@ -26,6 +26,8 @@ from pgvector.sqlalchemy import Vector
 from sqlalchemy.ext.mutable import MutableDict
 from sqlalchemy.exc import NoSuchTableError
 
+
+from open_webui.retrieval.vector.utils import stringify_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -235,7 +237,7 @@ class PgvectorClient(VectorDBBase):
                         vector=vector,
                         collection_name=collection_name,
                         text=item["text"],
-                        vmetadata=item["metadata"],
+                        vmetadata=stringify_metadata(item["metadata"]),
                     )
                     new_items.append(new_chunk)
                 self.session.bulk_save_objects(new_items)
@@ -292,7 +294,7 @@ class PgvectorClient(VectorDBBase):
                     if existing:
                         existing.vector = vector
                         existing.text = item["text"]
-                        existing.vmetadata = item["metadata"]
+                        existing.vmetadata = stringify_metadata(item["metadata"])
                         existing.collection_name = (
                             collection_name  # Update collection_name if necessary
                         )
@@ -302,7 +304,7 @@ class PgvectorClient(VectorDBBase):
                             vector=vector,
                             collection_name=collection_name,
                             text=item["text"],
-                            vmetadata=item["metadata"],
+                            vmetadata=stringify_metadata(item["metadata"]),
                         )
                         self.session.add(new_chunk)
                 self.session.commit()
diff --git a/backend/open_webui/retrieval/vector/utils.py b/backend/open_webui/retrieval/vector/utils.py
new file mode 100644
index 0000000000..1d9698c6b1
--- /dev/null
+++ b/backend/open_webui/retrieval/vector/utils.py
@@ -0,0 +1,17 @@
+from datetime import datetime
+from typing import Any
+
+
+def stringify_metadata(
+    metadata: dict[str, Any],
+) -> dict[str, Any]:
+    """Return a copy of ``metadata`` with datetime/list/dict values stringified.
+
+    ``datetime``, ``list`` and ``dict`` values are replaced by ``str(value)``;
+    every other value is passed through unchanged. The input mapping is not
+    modified, so the caller's items keep their original metadata.
+    """
+    return {
+        key: str(value) if isinstance(value, (datetime, list, dict)) else value
+        for key, value in metadata.items()
+    }
diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index fac5706f03..731e93c50f 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -1229,27 +1229,14 @@ def save_docs_to_vector_db(
         {
             **doc.metadata,
             **(metadata if metadata else {}),
-            "embedding_config": json.dumps(
-                {
-                    "engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
-                    "model": request.app.state.config.RAG_EMBEDDING_MODEL,
-                }
-            ),
+            "embedding_config": {
+                "engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
+                "model": request.app.state.config.RAG_EMBEDDING_MODEL,
+            },
         }
         for doc in docs
     ]
 
-    # ChromaDB does not like datetime formats
-    # for meta-data so convert them to string.
-    for metadata in metadatas:
-        for key, value in metadata.items():
-            if (
-                isinstance(value, datetime)
-                or isinstance(value, list)
-                or isinstance(value, dict)
-            ):
-                metadata[key] = str(value)
-
     try:
         if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
             log.info(f"collection {collection_name} already exists")