From d463a29ba17d5eb7d3ad0f38980a1e38eefbb4b3 Mon Sep 17 00:00:00 2001 From: "0xThresh.eth" <0xthresh@protonmail.com> Date: Tue, 22 Jul 2025 21:36:35 -0600 Subject: [PATCH] feat: S3 vector support tested --- backend/dev.sh | 2 +- .../retrieval/vector/{s3 => dbs}/s3vector.py | 42 ++----------------- .../open_webui/retrieval/vector/factory.py | 2 +- 3 files changed, 5 insertions(+), 41 deletions(-) rename backend/open_webui/retrieval/vector/{s3 => dbs}/s3vector.py (93%) diff --git a/backend/dev.sh b/backend/dev.sh index 0164c1940e..504b8f7554 100755 --- a/backend/dev.sh +++ b/backend/dev.sh @@ -1,3 +1,3 @@ -export CORS_ALLOW_ORIGIN=http://localhost:5173/ +export CORS_ALLOW_ORIGIN="http://localhost:5173" PORT="${PORT:-8080}" uvicorn open_webui.main:app --port $PORT --host 0.0.0.0 --forwarded-allow-ips '*' --reload diff --git a/backend/open_webui/retrieval/vector/s3/s3vector.py b/backend/open_webui/retrieval/vector/dbs/s3vector.py similarity index 93% rename from backend/open_webui/retrieval/vector/s3/s3vector.py rename to backend/open_webui/retrieval/vector/dbs/s3vector.py index 88392d09ba..e0fc4cb1a2 100644 --- a/backend/open_webui/retrieval/vector/s3/s3vector.py +++ b/backend/open_webui/retrieval/vector/dbs/s3vector.py @@ -38,15 +38,16 @@ class S3VectorClient(VectorDBBase): def _filter_metadata(self, metadata: Dict[str, Any], item_id: str) -> Dict[str, Any]: """ - Filter metadata to comply with S3 Vector API limit of 10 keys maximum. + Filter vector metadata keys to comply with S3 Vector API limit of 10 keys maximum. If AWS S3 Vector feature starts supporting more than 10 keys, this should be adjusted, and preferably removed. + Limitation is documented here: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-vectors-limitations.html """ if not isinstance(metadata, dict) or len(metadata) <= 10: return metadata # Keep only the first 10 keys, prioritizing important ones based on actual Open WebUI metadata important_keys = [ - 'text', # THE MOST IMPORTANT - the actual document content + 'text', # The actual document content 'file_id', # File ID 'source', # Document source file 'title', # Document title @@ -77,34 +78,6 @@ class S3VectorClient(VectorDBBase): log.warning(f"Metadata for key '{item_id}' had {len(metadata)} keys, limited to 10 keys") return filtered_metadata - def _check_for_duplicate_file_collections(self, knowledge_collection_name: str, new_items: List[Dict[str, Any]]) -> None: - """ - Check for existing file-specific collections that might create duplicates. - """ - # Extract file IDs from the new items to find corresponding file collections - file_ids = set() - for item in new_items: - metadata = item.get("metadata", {}) - file_id = metadata.get("file_id") - if file_id: - file_ids.add(file_id) - - # Check for existing file-specific collections - duplicate_collections = [] - for file_id in file_ids: - file_collection_name = f"file-{file_id}" - if self.has_collection(file_collection_name): - duplicate_collections.append(file_collection_name) - - if duplicate_collections: - log.warning(f"Found existing file-specific collections that may contain duplicate vectors: {duplicate_collections}") - log.warning(f"Consider manually deleting these collections to avoid duplicate storage:") - for collection in duplicate_collections: - log.warning(f" - {collection}") - log.warning(f"Continuing with insertion to knowledge collection '{knowledge_collection_name}'") - else: - log.info(f"No duplicate file-specific collections found for knowledge collection '{knowledge_collection_name}'") - def has_collection(self, collection_name: str) -> bool: """ Check if a vector index (collection) exists in the S3 vector bucket. @@ -157,9 +130,6 @@ class S3VectorClient(VectorDBBase): distance_metric="cosine", ) - # Check for any existing file-specific collections that might create duplicates - self._check_for_duplicate_file_collections(collection_name, items) - # Prepare vectors for insertion vectors = [] for item in items: @@ -202,9 +172,6 @@ class S3VectorClient(VectorDBBase): def upsert(self, collection_name: str, items: List[Dict[str, Any]]) -> None: """ Insert or update vector items in the S3 Vector index. Create index if it does not exist. - - Supports both knowledge collections and file-specific collections for compatibility - with existing Open WebUI backend logic. """ if not items: log.warning("No items to upsert") @@ -223,9 +190,6 @@ class S3VectorClient(VectorDBBase): distance_metric="cosine", ) - # Check for any existing file-specific collections that might create duplicates - self._check_for_duplicate_file_collections(collection_name, items) - # Prepare vectors for upsert vectors = [] for item in items: diff --git a/backend/open_webui/retrieval/vector/factory.py b/backend/open_webui/retrieval/vector/factory.py index 28f647ad26..629a9debf8 100644 --- a/backend/open_webui/retrieval/vector/factory.py +++ b/backend/open_webui/retrieval/vector/factory.py @@ -30,7 +30,7 @@ class Vector: from open_webui.retrieval.vector.dbs.pinecone import PineconeClient return PineconeClient() case VectorType.S3VECTOR: - from open_webui.retrieval.vector.s3.s3vector import S3VectorClient + from open_webui.retrieval.vector.dbs.s3vector import S3VectorClient return S3VectorClient() case VectorType.OPENSEARCH: from open_webui.retrieval.vector.dbs.opensearch import OpenSearchClient