feat: S3 vector support tested

0xThresh.eth 2025-07-22 21:36:35 -06:00
parent f6ee1965cb
commit d463a29ba1
3 changed files with 5 additions and 41 deletions

View file

@@ -1,3 +1,3 @@
-export CORS_ALLOW_ORIGIN=http://localhost:5173/
+export CORS_ALLOW_ORIGIN="http://localhost:5173"
 PORT="${PORT:-8080}"
 uvicorn open_webui.main:app --port $PORT --host 0.0.0.0 --forwarded-allow-ips '*' --reload
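
Both fixes on that line matter: the browser's Origin request header is just scheme://host:port with no trailing slash or path, so an exact-match allowlist entry ending in a slash can never match, and the quotes make the shell assignment robust. A tiny illustration (hypothetical check, values taken from the diff):

    # Origin headers carry no trailing slash, so exact-match allowlists must omit it.
    origin_header = "http://localhost:5173"            # what the browser actually sends
    assert origin_header != "http://localhost:5173/"   # old value: never matches
    assert origin_header == "http://localhost:5173"    # fixed value: matches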

View file

@@ -38,15 +38,16 @@ class S3VectorClient(VectorDBBase):
     def _filter_metadata(self, metadata: Dict[str, Any], item_id: str) -> Dict[str, Any]:
         """
-        Filter metadata to comply with S3 Vector API limit of 10 keys maximum.
+        Filter vector metadata keys to comply with S3 Vector API limit of 10 keys maximum.
         If AWS S3 Vector feature starts supporting more than 10 keys, this should be adjusted, and preferably removed.
+        Limitation is documented here: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-vectors-limitations.html
         """
         if not isinstance(metadata, dict) or len(metadata) <= 10:
             return metadata
         # Keep only the first 10 keys, prioritizing important ones based on actual Open WebUI metadata
         important_keys = [
-            'text',  # THE MOST IMPORTANT - the actual document content
+            'text',  # The actual document content
             'file_id',  # File ID
             'source',  # Document source file
             'title',  # Document title
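
For readers skimming the hunk: when a payload carries more than 10 metadata keys, the priority list above is applied first and the remaining slots are filled from the other keys. A minimal standalone sketch of that logic (names simplified; the fill order beyond the priority list is an assumption, and the real method on S3VectorClient also logs via item_id):

    from typing import Any, Dict

    IMPORTANT_KEYS = ["text", "file_id", "source", "title"]  # priority order from the diff

    def filter_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Cap metadata at 10 keys (S3 Vectors per-vector limit); logging omitted."""
        if not isinstance(metadata, dict) or len(metadata) <= 10:
            return metadata
        keep = [k for k in IMPORTANT_KEYS if k in metadata]
        keep += [k for k in metadata if k not in keep][: 10 - len(keep)]
        return {k: metadata[k] for k in keep}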
@@ -77,34 +78,6 @@ class S3VectorClient(VectorDBBase):
         log.warning(f"Metadata for key '{item_id}' had {len(metadata)} keys, limited to 10 keys")
         return filtered_metadata
-    def _check_for_duplicate_file_collections(self, knowledge_collection_name: str, new_items: List[Dict[str, Any]]) -> None:
-        """
-        Check for existing file-specific collections that might create duplicates.
-        """
-        # Extract file IDs from the new items to find corresponding file collections
-        file_ids = set()
-        for item in new_items:
-            metadata = item.get("metadata", {})
-            file_id = metadata.get("file_id")
-            if file_id:
-                file_ids.add(file_id)
-        # Check for existing file-specific collections
-        duplicate_collections = []
-        for file_id in file_ids:
-            file_collection_name = f"file-{file_id}"
-            if self.has_collection(file_collection_name):
-                duplicate_collections.append(file_collection_name)
-        if duplicate_collections:
-            log.warning(f"Found existing file-specific collections that may contain duplicate vectors: {duplicate_collections}")
-            log.warning(f"Consider manually deleting these collections to avoid duplicate storage:")
-            for collection in duplicate_collections:
-                log.warning(f"  - {collection}")
-            log.warning(f"Continuing with insertion to knowledge collection '{knowledge_collection_name}'")
-        else:
-            log.info(f"No duplicate file-specific collections found for knowledge collection '{knowledge_collection_name}'")
     def has_collection(self, collection_name: str) -> bool:
         """
         Check if a vector index (collection) exists in the S3 vector bucket.
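
The existence check maps an Open WebUI collection onto an S3 Vectors index lookup. A hedged sketch of how that can be done with boto3 (the s3vectors client and get_index operation exist in recent boto3 releases, but this is an assumed shape, not the commit's exact code; catching ClientError broadly is a simplification that also swallows auth errors):

    import boto3
    from botocore.exceptions import ClientError

    def has_collection(bucket_name: str, index_name: str) -> bool:
        # get_index raises NotFoundException (surfaced as ClientError) for missing indexes
        client = boto3.client("s3vectors")
        try:
            client.get_index(vectorBucketName=bucket_name, indexName=index_name)
            return True
        except ClientError:
            return False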
@@ -157,9 +130,6 @@ class S3VectorClient(VectorDBBase):
                 distance_metric="cosine",
             )
-            # Check for any existing file-specific collections that might create duplicates
-            self._check_for_duplicate_file_collections(collection_name, items)
         # Prepare vectors for insertion
         vectors = []
         for item in items:
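
After index creation, both code paths batch items into a PutVectors payload. A sketch of the assumed payload shape, continuing the hedged names from the sketches above (client, bucket_name, filter_metadata); the field names follow the public S3 Vectors API, and the id/vector/metadata item keys mirror Open WebUI's vector item dicts:

    # Assumed shape; not the commit's literal code.
    vectors = [
        {
            "key": item["id"],                                        # vector key within the index
            "data": {"float32": [float(x) for x in item["vector"]]},  # embedding values
            "metadata": filter_metadata(item.get("metadata", {})),    # capped at 10 keys
        }
        for item in items
    ]
    client.put_vectors(vectorBucketName=bucket_name, indexName=collection_name, vectors=vectors)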
@@ -202,9 +172,6 @@ class S3VectorClient(VectorDBBase):
     def upsert(self, collection_name: str, items: List[Dict[str, Any]]) -> None:
         """
         Insert or update vector items in the S3 Vector index. Create index if it does not exist.
-        Supports both knowledge collections and file-specific collections for compatibility
-        with existing Open WebUI backend logic.
         """
         if not items:
             log.warning("No items to upsert")
@@ -223,9 +190,6 @@ class S3VectorClient(VectorDBBase):
                 distance_metric="cosine",
             )
-            # Check for any existing file-specific collections that might create duplicates
-            self._check_for_duplicate_file_collections(collection_name, items)
         # Prepare vectors for upsert
         vectors = []
         for item in items:
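
The upsert path shares the create-if-missing step seen in both hunks above. For completeness, a hedged sketch of the index creation call with the cosine metric from the diff, again reusing client and bucket_name from the earlier sketches (parameter names per the public CreateIndex API; inferring the dimension from the first item is illustrative, not the commit's code):

    # Assumed call; dimension inference is an illustration only.
    client.create_index(
        vectorBucketName=bucket_name,
        indexName=collection_name,
        dataType="float32",
        dimension=len(items[0]["vector"]),
        distanceMetric="cosine",
    )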

View file

@@ -30,7 +30,7 @@ class Vector:
                 from open_webui.retrieval.vector.dbs.pinecone import PineconeClient
                 return PineconeClient()
             case VectorType.S3VECTOR:
-                from open_webui.retrieval.vector.s3.s3vector import S3VectorClient
+                from open_webui.retrieval.vector.dbs.s3vector import S3VectorClient
                 return S3VectorClient()
             case VectorType.OPENSEARCH:
                 from open_webui.retrieval.vector.dbs.opensearch import OpenSearchClient