mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-15 05:45:19 +00:00
feat: S3 vector support tested
This commit is contained in:
parent
f6ee1965cb
commit
d463a29ba1
3 changed files with 5 additions and 41 deletions
|
|
@ -1,3 +1,3 @@
|
||||||
export CORS_ALLOW_ORIGIN=http://localhost:5173/
|
export CORS_ALLOW_ORIGIN="http://localhost:5173"
|
||||||
PORT="${PORT:-8080}"
|
PORT="${PORT:-8080}"
|
||||||
uvicorn open_webui.main:app --port $PORT --host 0.0.0.0 --forwarded-allow-ips '*' --reload
|
uvicorn open_webui.main:app --port $PORT --host 0.0.0.0 --forwarded-allow-ips '*' --reload
|
||||||
|
|
|
||||||
|
|
@ -38,15 +38,16 @@ class S3VectorClient(VectorDBBase):
|
||||||
|
|
||||||
def _filter_metadata(self, metadata: Dict[str, Any], item_id: str) -> Dict[str, Any]:
|
def _filter_metadata(self, metadata: Dict[str, Any], item_id: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Filter metadata to comply with S3 Vector API limit of 10 keys maximum.
|
Filter vector metadata keys to comply with S3 Vector API limit of 10 keys maximum.
|
||||||
If AWS S3 Vector feature starts supporting more than 10 keys, this should be adjusted, and preferably removed.
|
If AWS S3 Vector feature starts supporting more than 10 keys, this should be adjusted, and preferably removed.
|
||||||
|
Limitation is documented here: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-vectors-limitations.html
|
||||||
"""
|
"""
|
||||||
if not isinstance(metadata, dict) or len(metadata) <= 10:
|
if not isinstance(metadata, dict) or len(metadata) <= 10:
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
# Keep only the first 10 keys, prioritizing important ones based on actual Open WebUI metadata
|
# Keep only the first 10 keys, prioritizing important ones based on actual Open WebUI metadata
|
||||||
important_keys = [
|
important_keys = [
|
||||||
'text', # THE MOST IMPORTANT - the actual document content
|
'text', # The actual document content
|
||||||
'file_id', # File ID
|
'file_id', # File ID
|
||||||
'source', # Document source file
|
'source', # Document source file
|
||||||
'title', # Document title
|
'title', # Document title
|
||||||
|
|
@ -77,34 +78,6 @@ class S3VectorClient(VectorDBBase):
|
||||||
log.warning(f"Metadata for key '{item_id}' had {len(metadata)} keys, limited to 10 keys")
|
log.warning(f"Metadata for key '{item_id}' had {len(metadata)} keys, limited to 10 keys")
|
||||||
return filtered_metadata
|
return filtered_metadata
|
||||||
|
|
||||||
def _check_for_duplicate_file_collections(self, knowledge_collection_name: str, new_items: List[Dict[str, Any]]) -> None:
|
|
||||||
"""
|
|
||||||
Check for existing file-specific collections that might create duplicates.
|
|
||||||
"""
|
|
||||||
# Extract file IDs from the new items to find corresponding file collections
|
|
||||||
file_ids = set()
|
|
||||||
for item in new_items:
|
|
||||||
metadata = item.get("metadata", {})
|
|
||||||
file_id = metadata.get("file_id")
|
|
||||||
if file_id:
|
|
||||||
file_ids.add(file_id)
|
|
||||||
|
|
||||||
# Check for existing file-specific collections
|
|
||||||
duplicate_collections = []
|
|
||||||
for file_id in file_ids:
|
|
||||||
file_collection_name = f"file-{file_id}"
|
|
||||||
if self.has_collection(file_collection_name):
|
|
||||||
duplicate_collections.append(file_collection_name)
|
|
||||||
|
|
||||||
if duplicate_collections:
|
|
||||||
log.warning(f"Found existing file-specific collections that may contain duplicate vectors: {duplicate_collections}")
|
|
||||||
log.warning(f"Consider manually deleting these collections to avoid duplicate storage:")
|
|
||||||
for collection in duplicate_collections:
|
|
||||||
log.warning(f" - {collection}")
|
|
||||||
log.warning(f"Continuing with insertion to knowledge collection '{knowledge_collection_name}'")
|
|
||||||
else:
|
|
||||||
log.info(f"No duplicate file-specific collections found for knowledge collection '{knowledge_collection_name}'")
|
|
||||||
|
|
||||||
def has_collection(self, collection_name: str) -> bool:
|
def has_collection(self, collection_name: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if a vector index (collection) exists in the S3 vector bucket.
|
Check if a vector index (collection) exists in the S3 vector bucket.
|
||||||
|
|
@ -157,9 +130,6 @@ class S3VectorClient(VectorDBBase):
|
||||||
distance_metric="cosine",
|
distance_metric="cosine",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check for any existing file-specific collections that might create duplicates
|
|
||||||
self._check_for_duplicate_file_collections(collection_name, items)
|
|
||||||
|
|
||||||
# Prepare vectors for insertion
|
# Prepare vectors for insertion
|
||||||
vectors = []
|
vectors = []
|
||||||
for item in items:
|
for item in items:
|
||||||
|
|
@ -202,9 +172,6 @@ class S3VectorClient(VectorDBBase):
|
||||||
def upsert(self, collection_name: str, items: List[Dict[str, Any]]) -> None:
|
def upsert(self, collection_name: str, items: List[Dict[str, Any]]) -> None:
|
||||||
"""
|
"""
|
||||||
Insert or update vector items in the S3 Vector index. Create index if it does not exist.
|
Insert or update vector items in the S3 Vector index. Create index if it does not exist.
|
||||||
|
|
||||||
Supports both knowledge collections and file-specific collections for compatibility
|
|
||||||
with existing Open WebUI backend logic.
|
|
||||||
"""
|
"""
|
||||||
if not items:
|
if not items:
|
||||||
log.warning("No items to upsert")
|
log.warning("No items to upsert")
|
||||||
|
|
@ -223,9 +190,6 @@ class S3VectorClient(VectorDBBase):
|
||||||
distance_metric="cosine",
|
distance_metric="cosine",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check for any existing file-specific collections that might create duplicates
|
|
||||||
self._check_for_duplicate_file_collections(collection_name, items)
|
|
||||||
|
|
||||||
# Prepare vectors for upsert
|
# Prepare vectors for upsert
|
||||||
vectors = []
|
vectors = []
|
||||||
for item in items:
|
for item in items:
|
||||||
|
|
@ -30,7 +30,7 @@ class Vector:
|
||||||
from open_webui.retrieval.vector.dbs.pinecone import PineconeClient
|
from open_webui.retrieval.vector.dbs.pinecone import PineconeClient
|
||||||
return PineconeClient()
|
return PineconeClient()
|
||||||
case VectorType.S3VECTOR:
|
case VectorType.S3VECTOR:
|
||||||
from open_webui.retrieval.vector.s3.s3vector import S3VectorClient
|
from open_webui.retrieval.vector.dbs.s3vector import S3VectorClient
|
||||||
return S3VectorClient()
|
return S3VectorClient()
|
||||||
case VectorType.OPENSEARCH:
|
case VectorType.OPENSEARCH:
|
||||||
from open_webui.retrieval.vector.dbs.opensearch import OpenSearchClient
|
from open_webui.retrieval.vector.dbs.opensearch import OpenSearchClient
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue