This commit is contained in:
Classic298 2025-08-22 19:17:52 +02:00 committed by GitHub
parent 46288924a2
commit 8231588eb4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -461,30 +461,157 @@ class ChromaDatabaseCleaner(VectorDatabaseCleaner):
class PGVectorDatabaseCleaner(VectorDatabaseCleaner):
    """
    PGVector database cleanup implementation.

    Orphaned collections are discovered by listing the distinct
    ``collection_name`` values in the ``document_chunk`` table and subtracting
    the names that active files / knowledge bases are expected to use.
    Deletion itself is delegated to the existing PGVector client's
    ``delete()`` method.

    Every public method is best-effort: failures are logged and reported
    through the return value (``0`` / ``False``) instead of being raised.
    """

    def __init__(self):
        """Bind to the shared PGVector client's session, if one is available.

        ``self.session`` is left as ``None`` when the client or its session
        cannot be used; all other methods treat that as "nothing to do".
        """
        self.session = None
        try:
            if VECTOR_DB_CLIENT is None:
                raise RuntimeError("VECTOR_DB_CLIENT is not available")

            session = getattr(VECTOR_DB_CLIENT, "session", None)
            if not session:
                raise RuntimeError("PGVector client session not available")

            self.session = session
            log.debug("PGVector cleaner initialized successfully")
        except Exception as e:
            log.error(f"Failed to initialize PGVector client for cleanup: {e}")
            self.session = None

    def count_orphaned_collections(self, active_file_ids: Set[str], active_kb_ids: Set[str]) -> int:
        """Count orphaned PGVector collections for preview.

        Args:
            active_file_ids: IDs of files that still exist.
            active_kb_ids: IDs of knowledge bases that still exist.

        Returns:
            Number of collections that would be deleted, or 0 when the
            session is unavailable or the lookup fails.
        """
        if not self.session:
            log.warning("PGVector session not available for counting orphaned collections")
            return 0

        try:
            orphaned_collections = self._get_orphaned_collections(active_file_ids, active_kb_ids)
            # Read-only operation: close the transaction opened by the SELECT.
            self.session.rollback()
            return len(orphaned_collections)
        except Exception as e:
            self._rollback_quietly()
            log.error(f"Error counting orphaned PGVector collections: {e}")
            return 0

    def cleanup_orphaned_collections(self, active_file_ids: Set[str], active_kb_ids: Set[str]) -> int:
        """Delete orphaned PGVector collections via the client's ``delete()``.

        A failure to delete one collection is logged and does not stop the
        remaining deletions. After the deletions a best-effort
        ``VACUUM ANALYZE`` is run on ``document_chunk``.

        Args:
            active_file_ids: IDs of files that still exist.
            active_kb_ids: IDs of knowledge bases that still exist.

        Returns:
            Number of collections successfully deleted (0 on failure or when
            the session is unavailable).
        """
        if not self.session:
            log.warning("PGVector session not available for cleanup")
            return 0

        try:
            orphaned_collections = self._get_orphaned_collections(active_file_ids, active_kb_ids)
            if not orphaned_collections:
                log.debug("No orphaned PGVector collections found")
                return 0

            deleted_count = 0
            log.info(f"Deleting {len(orphaned_collections)} orphaned PGVector collections")

            for collection_name in orphaned_collections:
                try:
                    VECTOR_DB_CLIENT.delete(collection_name)
                    deleted_count += 1
                    log.debug(f"Deleted PGVector collection: {collection_name}")
                except Exception as e:
                    # Keep going: one bad collection must not block the rest.
                    log.error(f"Failed to delete PGVector collection '{collection_name}': {e}")

            self._vacuum_document_chunk()

            if deleted_count > 0:
                log.info(f"Successfully deleted {deleted_count} orphaned PGVector collections")
            return deleted_count
        except Exception as e:
            self._rollback_quietly()
            log.error(f"Error cleaning orphaned PGVector collections: {e}")
            return 0

    def delete_collection(self, collection_name: str) -> bool:
        """Delete a single PGVector collection via the client's ``delete()``.

        Args:
            collection_name: Name of the collection to remove.

        Returns:
            True when the client call completed, False when it raised.
        """
        try:
            VECTOR_DB_CLIENT.delete(collection_name)
            log.debug(f"Deleted PGVector collection: {collection_name}")
            return True
        except Exception as e:
            log.error(f"Error deleting PGVector collection '{collection_name}': {e}")
            return False

    def _get_orphaned_collections(self, active_file_ids: Set[str], active_kb_ids: Set[str]) -> Set[str]:
        """Return collection names present in ``document_chunk`` but not expected.

        NOTE(review): every collection whose name is neither ``file-<id>`` nor
        an active knowledge-base ID is treated as orphaned. Confirm that no
        other naming scheme stores rows in ``document_chunk`` before relying
        on this for destructive cleanup.

        Returns:
            The set of orphaned collection names; an empty set when the
            query fails (the error is logged and the session rolled back).
        """
        try:
            expected_collections = self._build_expected_collections(active_file_ids, active_kb_ids)

            rows = self.session.execute(
                text("SELECT DISTINCT collection_name FROM document_chunk")
            ).fetchall()
            # Skip NULL names: they cannot be matched to a collection to delete.
            existing_collections = {row[0] for row in rows if row[0] is not None}

            orphaned_collections = existing_collections - expected_collections
            log.debug(f"Found {len(existing_collections)} existing collections, "
                      f"{len(expected_collections)} expected, "
                      f"{len(orphaned_collections)} orphaned")
            return orphaned_collections
        except Exception as e:
            # A failed statement leaves the transaction aborted; reset the
            # shared session so later callers can keep using it.
            self._rollback_quietly()
            log.error(f"Error finding orphaned PGVector collections: {e}")
            return set()

    def _build_expected_collections(self, active_file_ids: Set[str], active_kb_ids: Set[str]) -> Set[str]:
        """Build the set of collection names that should exist.

        File collections use the ``file-{id}`` pattern; knowledge-base
        collections use the knowledge-base ID directly.
        """
        expected_collections = {f"file-{file_id}" for file_id in active_file_ids}
        expected_collections.update(active_kb_ids)
        return expected_collections

    def _vacuum_document_chunk(self) -> None:
        """Run ``VACUUM ANALYZE document_chunk`` on an autocommit connection.

        PostgreSQL refuses to run VACUUM inside a transaction block, and an
        ORM session always executes statements inside a transaction, so a
        dedicated AUTOCOMMIT connection is taken from the session's engine.
        Failures are logged as warnings only; vacuuming is an optimization.
        """
        try:
            bind = self.session.get_bind()
            # Engine and Connection both expose ``.engine``.
            engine = getattr(bind, "engine", bind)
            with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as connection:
                connection.execute(text("VACUUM ANALYZE document_chunk"))
            log.debug("Executed VACUUM ANALYZE on document_chunk table")
        except Exception as e:
            log.warning(f"Failed to VACUUM PGVector table: {e}")
            self._rollback_quietly()

    def _rollback_quietly(self) -> None:
        """Roll back the session, logging (not raising) if the rollback fails."""
        if not self.session:
            return
        try:
            self.session.rollback()
        except Exception as e:
            log.warning(f"Failed to roll back PGVector session: {e}")
class NoOpVectorDatabaseCleaner(VectorDatabaseCleaner): class NoOpVectorDatabaseCleaner(VectorDatabaseCleaner):
""" """
@ -524,7 +651,7 @@ def get_vector_database_cleaner() -> VectorDatabaseCleaner:
log.debug("Using ChromaDB cleaner") log.debug("Using ChromaDB cleaner")
return ChromaDatabaseCleaner() return ChromaDatabaseCleaner()
elif "pgvector" in vector_db_type: elif "pgvector" in vector_db_type:
log.debug("Using PGVector cleaner (placeholder implementation)") log.debug("Using PGVector cleaner")
return PGVectorDatabaseCleaner() return PGVectorDatabaseCleaner()
else: else:
log.debug(f"No specific cleaner for vector database type: {VECTOR_DB}, using no-op cleaner") log.debug(f"No specific cleaner for vector database type: {VECTOR_DB}, using no-op cleaner")
@ -1297,6 +1424,13 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info("Vacuumed ChromaDB database") log.info("Vacuumed ChromaDB database")
except Exception as e: except Exception as e:
log.error(f"Failed to vacuum ChromaDB database: {e}") log.error(f"Failed to vacuum ChromaDB database: {e}")
elif isinstance(vector_cleaner, PGVectorDatabaseCleaner) and vector_cleaner.session:
try:
vector_cleaner.session.execute(text("VACUUM ANALYZE"))
vector_cleaner.session.commit()
log.info("Executed VACUUM ANALYZE on PostgreSQL database")
except Exception as e:
log.error(f"Failed to vacuum PostgreSQL database: {e}")
log.info("Data pruning completed successfully") log.info("Data pruning completed successfully")
return True return True