mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-13 12:55:19 +00:00
chore: final cleanup
This commit is contained in:
parent
6c283cdd93
commit
8dcf668448
1 changed file with 34 additions and 52 deletions
|
|
@@ -11,19 +11,36 @@ log.setLevel(SRC_LOG_LEVELS["RAG"])
|
||||||
class S3VectorClient(VectorDBBase):
|
class S3VectorClient(VectorDBBase):
|
||||||
"""
|
"""
|
||||||
AWS S3 Vector integration for Open WebUI Knowledge.
|
AWS S3 Vector integration for Open WebUI Knowledge.
|
||||||
Assumes AWS credentials are available via environment variables or IAM roles.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.bucket_name = S3_VECTOR_BUCKET_NAME
|
self.bucket_name = S3_VECTOR_BUCKET_NAME
|
||||||
self.region = S3_VECTOR_REGION
|
self.region = S3_VECTOR_REGION
|
||||||
self.client = boto3.client("s3vectors", region_name=self.region)
|
|
||||||
|
# Simple validation - log warnings instead of raising exceptions
|
||||||
|
if not self.bucket_name:
|
||||||
|
log.warning("S3_VECTOR_BUCKET_NAME not set - S3Vector will not work")
|
||||||
|
if not self.region:
|
||||||
|
log.warning("S3_VECTOR_REGION not set - S3Vector will not work")
|
||||||
|
|
||||||
|
if self.bucket_name and self.region:
|
||||||
|
try:
|
||||||
|
self.client = boto3.client("s3vectors", region_name=self.region)
|
||||||
|
log.info(f"S3Vector client initialized for bucket '{self.bucket_name}' in region '{self.region}'")
|
||||||
|
except Exception as e:
|
||||||
|
log.error(f"Failed to initialize S3Vector client: {e}")
|
||||||
|
self.client = None
|
||||||
|
else:
|
||||||
|
self.client = None
|
||||||
|
|
||||||
def _create_index(self, index_name: str, dimension: int, data_type: str = "float32", distance_metric: str = "cosine"):
|
def _create_index(self, index_name: str, dimension: int, data_type: str = "float32", distance_metric: str = "cosine") -> None:
|
||||||
"""
|
"""
|
||||||
Create a new index in the S3 vector bucket for the given collection if it does not exist.
|
Create a new index in the S3 vector bucket for the given collection if it does not exist.
|
||||||
"""
|
"""
|
||||||
if self.has_collection(index_name):
|
if self.has_collection(index_name):
|
||||||
|
log.debug(f"Index '{index_name}' already exists, skipping creation")
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.client.create_index(
|
self.client.create_index(
|
||||||
vectorBucketName=self.bucket_name,
|
vectorBucketName=self.bucket_name,
|
||||||
|
|
@@ -35,12 +52,11 @@ class S3VectorClient(VectorDBBase):
|
||||||
log.info(f"Created S3 index: {index_name} (dim={dimension}, type={data_type}, metric={distance_metric})")
|
log.info(f"Created S3 index: {index_name} (dim={dimension}, type={data_type}, metric={distance_metric})")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Error creating S3 index '{index_name}': {e}")
|
log.error(f"Error creating S3 index '{index_name}': {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
def _filter_metadata(self, metadata: Dict[str, Any], item_id: str) -> Dict[str, Any]:
|
def _filter_metadata(self, metadata: Dict[str, Any], item_id: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Filter vector metadata keys to comply with S3 Vector API limit of 10 keys maximum.
|
Filter vector metadata keys to comply with S3 Vector API limit of 10 keys maximum.
|
||||||
If AWS S3 Vector feature starts supporting more than 10 keys, this should be adjusted, and preferably removed.
|
|
||||||
Limitation is documented here: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-vectors-limitations.html
|
|
||||||
"""
|
"""
|
||||||
if not isinstance(metadata, dict) or len(metadata) <= 10:
|
if not isinstance(metadata, dict) or len(metadata) <= 10:
|
||||||
return metadata
|
return metadata
|
||||||
|
|
@@ -82,6 +98,7 @@ class S3VectorClient(VectorDBBase):
|
||||||
"""
|
"""
|
||||||
Check if a vector index (collection) exists in the S3 vector bucket.
|
Check if a vector index (collection) exists in the S3 vector bucket.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = self.client.list_indexes(vectorBucketName=self.bucket_name)
|
response = self.client.list_indexes(vectorBucketName=self.bucket_name)
|
||||||
indexes = response.get("indexes", [])
|
indexes = response.get("indexes", [])
|
||||||
|
|
@@ -89,10 +106,12 @@ class S3VectorClient(VectorDBBase):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Error listing indexes: {e}")
|
log.error(f"Error listing indexes: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def delete_collection(self, collection_name: str) -> None:
|
def delete_collection(self, collection_name: str) -> None:
|
||||||
"""
|
"""
|
||||||
Delete an entire S3 Vector index/collection.
|
Delete an entire S3 Vector index/collection.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not self.has_collection(collection_name):
|
if not self.has_collection(collection_name):
|
||||||
log.warning(f"Collection '{collection_name}' does not exist, nothing to delete")
|
log.warning(f"Collection '{collection_name}' does not exist, nothing to delete")
|
||||||
return
|
return
|
||||||
|
|
@@ -108,11 +127,9 @@ class S3VectorClient(VectorDBBase):
|
||||||
log.error(f"Error deleting collection '{collection_name}': {e}")
|
log.error(f"Error deleting collection '{collection_name}': {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def insert(self, collection_name: str, items: List[Dict[str, Any]]) -> None:
|
def insert(self, collection_name: str, items: List[VectorItem]) -> None:
|
||||||
"""
|
"""
|
||||||
Insert vector items into the S3 Vector index. Create index if it does not exist.
|
Insert vector items into the S3 Vector index. Create index if it does not exist.
|
||||||
|
|
||||||
Supports both knowledge collection indexes and file-specific indexes (file-{file_id}).
|
|
||||||
"""
|
"""
|
||||||
if not items:
|
if not items:
|
||||||
log.warning("No items to insert")
|
log.warning("No items to insert")
|
||||||
|
|
@@ -143,10 +160,7 @@ class S3VectorClient(VectorDBBase):
|
||||||
metadata = item.get("metadata", {}).copy()
|
metadata = item.get("metadata", {}).copy()
|
||||||
|
|
||||||
# Add the text field to metadata so it's available for retrieval
|
# Add the text field to metadata so it's available for retrieval
|
||||||
if "text" in item:
|
metadata["text"] = item["text"]
|
||||||
metadata["text"] = item["text"]
|
|
||||||
else:
|
|
||||||
log.warning(f"No 'text' field found in item with ID: {item.get('id')}")
|
|
||||||
|
|
||||||
# Filter metadata to comply with S3 Vector API limit of 10 keys
|
# Filter metadata to comply with S3 Vector API limit of 10 keys
|
||||||
metadata = self._filter_metadata(metadata, item["id"])
|
metadata = self._filter_metadata(metadata, item["id"])
|
||||||
|
|
@@ -169,7 +183,7 @@ class S3VectorClient(VectorDBBase):
|
||||||
log.error(f"Error inserting vectors: {e}")
|
log.error(f"Error inserting vectors: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def upsert(self, collection_name: str, items: List[Dict[str, Any]]) -> None:
|
def upsert(self, collection_name: str, items: List[VectorItem]) -> None:
|
||||||
"""
|
"""
|
||||||
Insert or update vector items in the S3 Vector index. Create index if it does not exist.
|
Insert or update vector items in the S3 Vector index. Create index if it does not exist.
|
||||||
"""
|
"""
|
||||||
|
|
@@ -202,8 +216,7 @@ class S3VectorClient(VectorDBBase):
|
||||||
# Prepare metadata, ensuring the text field is preserved
|
# Prepare metadata, ensuring the text field is preserved
|
||||||
metadata = item.get("metadata", {}).copy()
|
metadata = item.get("metadata", {}).copy()
|
||||||
# Add the text field to metadata so it's available for retrieval
|
# Add the text field to metadata so it's available for retrieval
|
||||||
if "text" in item:
|
metadata["text"] = item["text"]
|
||||||
metadata["text"] = item["text"]
|
|
||||||
|
|
||||||
# Filter metadata to comply with S3 Vector API limit of 10 keys
|
# Filter metadata to comply with S3 Vector API limit of 10 keys
|
||||||
metadata = self._filter_metadata(metadata, item["id"])
|
metadata = self._filter_metadata(metadata, item["id"])
|
||||||
|
|
@@ -230,17 +243,8 @@ class S3VectorClient(VectorDBBase):
|
||||||
def search(self, collection_name: str, vectors: List[List[Union[float, int]]], limit: int) -> Optional[SearchResult]:
|
def search(self, collection_name: str, vectors: List[List[Union[float, int]]], limit: int) -> Optional[SearchResult]:
|
||||||
"""
|
"""
|
||||||
Search for similar vectors in a collection using multiple query vectors.
|
Search for similar vectors in a collection using multiple query vectors.
|
||||||
|
|
||||||
Uses S3 Vector's query_vectors API to perform similarity search.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
collection_name: Name of the collection to search in
|
|
||||||
vectors: List of query vectors to search with
|
|
||||||
limit: Maximum number of results to return per query
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
SearchResult containing IDs, documents, metadatas, and distances
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not self.has_collection(collection_name):
|
if not self.has_collection(collection_name):
|
||||||
log.warning(f"Collection '{collection_name}' does not exist")
|
log.warning(f"Collection '{collection_name}' does not exist")
|
||||||
return None
|
return None
|
||||||
|
|
@@ -343,18 +347,8 @@ class S3VectorClient(VectorDBBase):
|
||||||
def query(self, collection_name: str, filter: Dict, limit: Optional[int] = None) -> Optional[GetResult]:
|
def query(self, collection_name: str, filter: Dict, limit: Optional[int] = None) -> Optional[GetResult]:
|
||||||
"""
|
"""
|
||||||
Query vectors from a collection using metadata filter.
|
Query vectors from a collection using metadata filter.
|
||||||
|
|
||||||
For S3 Vector, this uses the list_vectors API with metadata filters.
|
|
||||||
Note: S3 Vector supports metadata filtering, but the exact filter syntax may vary.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
collection_name: Name of the collection to query
|
|
||||||
filter: Dictionary containing metadata filter conditions
|
|
||||||
limit: Maximum number of results to return (optional)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
GetResult containing IDs, documents, and metadatas
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not self.has_collection(collection_name):
|
if not self.has_collection(collection_name):
|
||||||
log.warning(f"Collection '{collection_name}' does not exist")
|
log.warning(f"Collection '{collection_name}' does not exist")
|
||||||
return GetResult(ids=[[]], documents=[[]], metadatas=[[]])
|
return GetResult(ids=[[]], documents=[[]], metadatas=[[]])
|
||||||
|
|
@@ -423,10 +417,8 @@ class S3VectorClient(VectorDBBase):
|
||||||
def get(self, collection_name: str) -> Optional[GetResult]:
|
def get(self, collection_name: str) -> Optional[GetResult]:
|
||||||
"""
|
"""
|
||||||
Retrieve all vectors from a collection.
|
Retrieve all vectors from a collection.
|
||||||
|
|
||||||
Uses S3 Vector's list_vectors API to get all vectors with their data and metadata.
|
|
||||||
Handles pagination automatically to retrieve all vectors.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not self.has_collection(collection_name):
|
if not self.has_collection(collection_name):
|
||||||
log.warning(f"Collection '{collection_name}' does not exist")
|
log.warning(f"Collection '{collection_name}' does not exist")
|
||||||
return GetResult(ids=[[]], documents=[[]], metadatas=[[]])
|
return GetResult(ids=[[]], documents=[[]], metadatas=[[]])
|
||||||
|
|
@@ -519,10 +511,8 @@ class S3VectorClient(VectorDBBase):
|
||||||
def delete(self, collection_name: str, ids: Optional[List[str]] = None, filter: Optional[Dict] = None) -> None:
|
def delete(self, collection_name: str, ids: Optional[List[str]] = None, filter: Optional[Dict] = None) -> None:
|
||||||
"""
|
"""
|
||||||
Delete vectors by ID or filter from a collection.
|
Delete vectors by ID or filter from a collection.
|
||||||
|
|
||||||
For S3 Vector, we support deletion by IDs. Filter-based deletion requires querying first.
|
|
||||||
For knowledge collections, also handles cleanup of related file-specific collections.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not self.has_collection(collection_name):
|
if not self.has_collection(collection_name):
|
||||||
log.warning(f"Collection '{collection_name}' does not exist, nothing to delete")
|
log.warning(f"Collection '{collection_name}' does not exist, nothing to delete")
|
||||||
return
|
return
|
||||||
|
|
@@ -578,9 +568,9 @@ class S3VectorClient(VectorDBBase):
|
||||||
|
|
||||||
def reset(self) -> None:
|
def reset(self) -> None:
|
||||||
"""
|
"""
|
||||||
Reset/clear all vector data. For S3 Vector, this would mean deleting all indexes.
|
Reset/clear all vector data. For S3 Vector, this deletes all indexes.
|
||||||
Use with caution as this is destructive.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
log.warning("Reset called - this will delete all vector indexes in the S3 bucket")
|
log.warning("Reset called - this will delete all vector indexes in the S3 bucket")
|
||||||
|
|
||||||
|
|
@@ -616,14 +606,6 @@ class S3VectorClient(VectorDBBase):
|
||||||
def _matches_filter(self, metadata: Dict[str, Any], filter: Dict[str, Any]) -> bool:
|
def _matches_filter(self, metadata: Dict[str, Any], filter: Dict[str, Any]) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if metadata matches the given filter conditions.
|
Check if metadata matches the given filter conditions.
|
||||||
Supports basic equality matching and simple logical operations.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
metadata: The metadata to check
|
|
||||||
filter: The filter conditions to match against
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if metadata matches all filter conditions, False otherwise
|
|
||||||
"""
|
"""
|
||||||
if not isinstance(metadata, dict) or not isinstance(filter, dict):
|
if not isinstance(metadata, dict) or not isinstance(filter, dict):
|
||||||
return False
|
return False
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue