mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-12 20:35:19 +00:00
feat: S3 vector support tested
This commit is contained in:
parent
f6ee1965cb
commit
d463a29ba1
3 changed files with 5 additions and 41 deletions
|
|
@ -1,3 +1,3 @@
|
|||
export CORS_ALLOW_ORIGIN=http://localhost:5173/
|
||||
export CORS_ALLOW_ORIGIN="http://localhost:5173"
|
||||
PORT="${PORT:-8080}"
|
||||
uvicorn open_webui.main:app --port $PORT --host 0.0.0.0 --forwarded-allow-ips '*' --reload
|
||||
|
|
|
|||
|
|
@ -38,15 +38,16 @@ class S3VectorClient(VectorDBBase):
|
|||
|
||||
def _filter_metadata(self, metadata: Dict[str, Any], item_id: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Filter metadata to comply with S3 Vector API limit of 10 keys maximum.
|
||||
Filter vector metadata keys to comply with S3 Vector API limit of 10 keys maximum.
|
||||
If AWS S3 Vector feature starts supporting more than 10 keys, this should be adjusted, and preferably removed.
|
||||
Limitation is documented here: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-vectors-limitations.html
|
||||
"""
|
||||
if not isinstance(metadata, dict) or len(metadata) <= 10:
|
||||
return metadata
|
||||
|
||||
# Keep only the first 10 keys, prioritizing important ones based on actual Open WebUI metadata
|
||||
important_keys = [
|
||||
'text', # THE MOST IMPORTANT - the actual document content
|
||||
'text', # The actual document content
|
||||
'file_id', # File ID
|
||||
'source', # Document source file
|
||||
'title', # Document title
|
||||
|
|
@ -77,34 +78,6 @@ class S3VectorClient(VectorDBBase):
|
|||
log.warning(f"Metadata for key '{item_id}' had {len(metadata)} keys, limited to 10 keys")
|
||||
return filtered_metadata
|
||||
|
||||
def _check_for_duplicate_file_collections(self, knowledge_collection_name: str, new_items: List[Dict[str, Any]]) -> None:
|
||||
"""
|
||||
Check for existing file-specific collections that might create duplicates.
|
||||
"""
|
||||
# Extract file IDs from the new items to find corresponding file collections
|
||||
file_ids = set()
|
||||
for item in new_items:
|
||||
metadata = item.get("metadata", {})
|
||||
file_id = metadata.get("file_id")
|
||||
if file_id:
|
||||
file_ids.add(file_id)
|
||||
|
||||
# Check for existing file-specific collections
|
||||
duplicate_collections = []
|
||||
for file_id in file_ids:
|
||||
file_collection_name = f"file-{file_id}"
|
||||
if self.has_collection(file_collection_name):
|
||||
duplicate_collections.append(file_collection_name)
|
||||
|
||||
if duplicate_collections:
|
||||
log.warning(f"Found existing file-specific collections that may contain duplicate vectors: {duplicate_collections}")
|
||||
log.warning(f"Consider manually deleting these collections to avoid duplicate storage:")
|
||||
for collection in duplicate_collections:
|
||||
log.warning(f" - {collection}")
|
||||
log.warning(f"Continuing with insertion to knowledge collection '{knowledge_collection_name}'")
|
||||
else:
|
||||
log.info(f"No duplicate file-specific collections found for knowledge collection '{knowledge_collection_name}'")
|
||||
|
||||
def has_collection(self, collection_name: str) -> bool:
|
||||
"""
|
||||
Check if a vector index (collection) exists in the S3 vector bucket.
|
||||
|
|
@ -157,9 +130,6 @@ class S3VectorClient(VectorDBBase):
|
|||
distance_metric="cosine",
|
||||
)
|
||||
|
||||
# Check for any existing file-specific collections that might create duplicates
|
||||
self._check_for_duplicate_file_collections(collection_name, items)
|
||||
|
||||
# Prepare vectors for insertion
|
||||
vectors = []
|
||||
for item in items:
|
||||
|
|
@ -202,9 +172,6 @@ class S3VectorClient(VectorDBBase):
|
|||
def upsert(self, collection_name: str, items: List[Dict[str, Any]]) -> None:
|
||||
"""
|
||||
Insert or update vector items in the S3 Vector index. Create index if it does not exist.
|
||||
|
||||
Supports both knowledge collections and file-specific collections for compatibility
|
||||
with existing Open WebUI backend logic.
|
||||
"""
|
||||
if not items:
|
||||
log.warning("No items to upsert")
|
||||
|
|
@ -223,9 +190,6 @@ class S3VectorClient(VectorDBBase):
|
|||
distance_metric="cosine",
|
||||
)
|
||||
|
||||
# Check for any existing file-specific collections that might create duplicates
|
||||
self._check_for_duplicate_file_collections(collection_name, items)
|
||||
|
||||
# Prepare vectors for upsert
|
||||
vectors = []
|
||||
for item in items:
|
||||
|
|
@ -30,7 +30,7 @@ class Vector:
|
|||
from open_webui.retrieval.vector.dbs.pinecone import PineconeClient
|
||||
return PineconeClient()
|
||||
case VectorType.S3VECTOR:
|
||||
from open_webui.retrieval.vector.s3.s3vector import S3VectorClient
|
||||
from open_webui.retrieval.vector.dbs.s3vector import S3VectorClient
|
||||
return S3VectorClient()
|
||||
case VectorType.OPENSEARCH:
|
||||
from open_webui.retrieval.vector.dbs.opensearch import OpenSearchClient
|
||||
|
|
|
|||
Loading…
Reference in a new issue