Update prune.py

Authored by Classic298 on 2025-08-12 13:20:16 +02:00, committed by GitHub
parent 60edac6c3f
commit 709c852917


@@ -38,7 +38,6 @@ class PruneDataForm(BaseModel):
days: Optional[int] = None
exempt_archived_chats: bool = False
exempt_chats_in_folders: bool = False
# Orphaned resource deletion toggles (for deleted users)
delete_orphaned_chats: bool = True
delete_orphaned_tools: bool = False
delete_orphaned_functions: bool = False
@@ -47,19 +46,17 @@ class PruneDataForm(BaseModel):
delete_orphaned_models: bool = True
delete_orphaned_notes: bool = True
delete_orphaned_folders: bool = True
# Audio cache cleanup
audio_cache_max_age_days: Optional[int] = 30
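For orientation, a minimal sketch of how this form might be posted to the prune endpoint. The route path, base URL, and token handling below are assumptions for illustration, not taken from this diff:

# Hypothetical client-side sketch; URL and token are placeholders.
import requests

BASE_URL = "http://localhost:8080/api/v1"  # assumed deployment URL
TOKEN = "..."  # admin API token (the endpoint depends on get_admin_user)

payload = {
    "days": 90,                       # prune chats older than 90 days
    "exempt_archived_chats": True,    # keep archived chats
    "exempt_chats_in_folders": True,  # keep foldered/pinned chats
    "delete_orphaned_chats": True,
    "audio_cache_max_age_days": 30,   # None skips audio cache cleanup
}
requests.post(f"{BASE_URL}/prune", json=payload,
              headers={"Authorization": f"Bearer {TOKEN}"})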
def get_active_file_ids() -> Set[str]:
"""
Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
This is the ground truth for what files should be preserved.
"""
active_file_ids = set()
try:
# 1. Scan knowledge bases for file references
knowledge_bases = Knowledges.get_knowledge_bases()
log.debug(f"Found {len(knowledge_bases)} knowledge bases")
@@ -67,15 +64,12 @@ def get_active_file_ids() -> Set[str]:
if not kb.data:
continue
# Handle different possible data structures for file references
file_ids = []
# Check for file_ids array
if isinstance(kb.data, dict) and "file_ids" in kb.data:
if isinstance(kb.data["file_ids"], list):
file_ids.extend(kb.data["file_ids"])
# Check for files array with id field
if isinstance(kb.data, dict) and "files" in kb.data:
if isinstance(kb.data["files"], list):
for file_ref in kb.data["files"]:
@@ -84,13 +78,11 @@ def get_active_file_ids() -> Set[str]:
elif isinstance(file_ref, str):
file_ids.append(file_ref)
# Add all found file IDs
for file_id in file_ids:
if isinstance(file_id, str) and file_id.strip():
active_file_ids.add(file_id.strip())
log.debug(f"KB {kb.id} references file {file_id}")
# 2. Scan chats for file references (chat JSON)
chats = Chats.get_chats()
log.debug(f"Found {len(chats)} chats to scan for file references")
@@ -99,40 +91,31 @@ def get_active_file_ids() -> Set[str]:
continue
try:
# Convert entire chat JSON to string and extract all file IDs
chat_json_str = json.dumps(chat.chat)
# Extract file IDs using regex patterns
# Pattern 1: "id": "uuid" where uuid looks like a file ID
file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
potential_file_ids = file_id_pattern.findall(chat_json_str)
# Pattern 2: URLs containing /api/v1/files/uuid
url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
url_file_ids = url_pattern.findall(chat_json_str)
# Combine and validate against actual file records
all_potential_ids = set(potential_file_ids + url_file_ids)
for file_id in all_potential_ids:
# Verify this ID exists in the file table to avoid false positives
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
log.debug(f"Chat {chat.id}: Found active file {file_id}")
except Exception as e:
log.debug(f"Error processing chat {chat.id} for file references: {e}")
# 3. Scan folders for file references (folder.items, folder.data, folder.meta)
try:
folders = Folders.get_all_folders()
log.debug(f"Found {len(folders)} folders to scan for file references")
for folder in folders:
# Check folder.items JSON
if folder.items:
try:
items_str = json.dumps(folder.items)
# Look for file ID patterns in the JSON
file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
@@ -140,11 +123,9 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
log.debug(f"Folder {folder.id}: Found file {file_id} in items")
except Exception as e:
log.debug(f"Error processing folder {folder.id} items: {e}")
# Check folder.data JSON
if hasattr(folder, 'data') and folder.data:
try:
data_str = json.dumps(folder.data)
@@ -155,24 +136,20 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
log.debug(f"Folder {folder.id}: Found file {file_id} in data")
except Exception as e:
log.debug(f"Error processing folder {folder.id} data: {e}")
except Exception as e:
log.debug(f"Error scanning folders for file references: {e}")
# 4. Scan standalone messages for file references (message table)
try:
# Query message table directly since we may not have a Messages model
with get_db() as db:
message_results = db.execute(text("SELECT id, data FROM message WHERE data IS NOT NULL")).fetchall()
log.debug(f"Found {len(message_results)} messages with data to scan")
for message_id, message_data_json in message_results:
if message_data_json:
try:
# Convert JSON to string and scan for file patterns
data_str = json.dumps(message_data_json) if isinstance(message_data_json, dict) else str(message_data_json)
file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
@@ -182,7 +159,6 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
log.debug(f"Message {message_id}: Found file {file_id}")
except Exception as e:
log.debug(f"Error processing message {message_id} data: {e}")
except Exception as e:
@@ -190,7 +166,6 @@ def get_active_file_ids() -> Set[str]:
except Exception as e:
log.error(f"Error determining active file IDs: {e}")
# Fail safe: return empty set, which will prevent deletion
return set()
log.info(f"Found {len(active_file_ids)} active file IDs")
@@ -202,19 +177,15 @@ def safe_delete_vector_collection(collection_name: str) -> bool:
Safely delete a vector collection, handling both logical and physical cleanup.
"""
try:
# First, try to delete the collection through the client
try:
VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
log.debug(f"Deleted collection from vector DB: {collection_name}")
except Exception as e:
log.debug(f"Collection {collection_name} may not exist in DB: {e}")
# Then, handle physical cleanup for ChromaDB
if "chroma" in VECTOR_DB.lower():
vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name
if vector_dir.exists() and vector_dir.is_dir():
shutil.rmtree(vector_dir)
log.debug(f"Deleted physical vector directory: {vector_dir}")
return True
return True
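Elsewhere in this file collection names follow two conventions — `file-{file_id}` for per-file collections and the bare knowledge-base ID for KB collections — so typical calls look like:

safe_delete_vector_collection(f"file-{file_id}")  # per-file embeddings
safe_delete_vector_collection(kb_id)              # knowledge-base collection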
@@ -229,19 +200,14 @@ def safe_delete_file_by_id(file_id: str) -> bool:
Safely delete a file record and its associated vector collection.
"""
try:
# Get file info before deletion
file_record = Files.get_file_by_id(file_id)
if not file_record:
log.debug(f"File {file_id} not found in database")
return True  # Already gone
# Delete vector collection first
collection_name = f"file-{file_id}"
safe_delete_vector_collection(collection_name)
# Delete database record
Files.delete_file_by_id(file_id)
log.debug(f"Deleted file record: {file_id}")
return True
@@ -256,7 +222,6 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
"""
upload_dir = Path(CACHE_DIR).parent / "uploads"
if not upload_dir.exists():
log.debug("Uploads directory does not exist")
return
deleted_count = 0
@@ -267,33 +232,27 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
continue
filename = file_path.name
# Extract file ID from filename (common patterns)
file_id = None
# Pattern 1: UUID_filename or UUID-filename
if len(filename) > 36:
potential_id = filename[:36]
if potential_id.count('-') == 4:  # UUID format
file_id = potential_id
# Pattern 2: filename might be the file ID itself
if not file_id and filename.count('-') == 4 and len(filename) == 36:
file_id = filename
# Pattern 3: Check if any part of filename matches active IDs
if not file_id:
for active_id in active_file_ids:
if active_id in filename:
file_id = active_id
break
# If we found a potential file ID and it's not active, delete it
if file_id and file_id not in active_file_ids:
try:
file_path.unlink()
deleted_count += 1
log.debug(f"Deleted orphaned upload file: {filename}")
except Exception as e:
log.error(f"Failed to delete upload file {filename}: {e}")
@@ -313,73 +272,46 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
vector_dir = Path(CACHE_DIR).parent / "vector_db"
if not vector_dir.exists():
log.debug("Vector DB directory does not exist")
return
chroma_db_path = vector_dir / "chroma.sqlite3"
if not chroma_db_path.exists():
log.debug("ChromaDB metadata file does not exist")
return
# Build expected collection names
expected_collections = set()
# File collections: file-{file_id}
for file_id in active_file_ids:
expected_collections.add(f"file-{file_id}")
# Knowledge base collections: {kb_id}
for kb_id in active_kb_ids:
expected_collections.add(kb_id)
log.debug(f"Expected collections to preserve: {expected_collections}")
# Query ChromaDB metadata to get the complete mapping chain:
# Directory UUID -> Collection ID -> Collection Name
uuid_to_collection = {}
try:
import sqlite3
log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}")
with sqlite3.connect(str(chroma_db_path)) as conn:
# First, check what tables exist
tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
log.debug(f"ChromaDB tables: {tables}")
# Check the schema of collections table
schema = conn.execute("PRAGMA table_info(collections)").fetchall()
log.debug(f"Collections table schema: {schema}")
# Get Collection ID -> Collection Name mapping
collection_id_to_name = {}
cursor = conn.execute("SELECT id, name FROM collections")
rows = cursor.fetchall()
log.debug(f"Raw ChromaDB collections query results: {rows}")
for row in rows:
collection_id, collection_name = row
collection_id_to_name[collection_id] = collection_name
log.debug(f"Mapped collection ID {collection_id} -> name {collection_name}")
# Get Directory UUID -> Collection ID mapping from segments table
# Only interested in VECTOR segments as those are the actual data directories
cursor = conn.execute("SELECT id, collection FROM segments WHERE scope = 'VECTOR'")
segment_rows = cursor.fetchall()
log.debug(f"Raw ChromaDB segments query results: {segment_rows}")
for row in segment_rows:
segment_id, collection_id = row
if collection_id in collection_id_to_name:
collection_name = collection_id_to_name[collection_id]
uuid_to_collection[segment_id] = collection_name
log.debug(f"Mapped directory UUID {segment_id} -> collection {collection_name}")
log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}")
log.info(f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata")
except Exception as e:
log.error(f"Error reading ChromaDB metadata: {e}")
# Fail safe: don't delete anything if we can't read metadata
return
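The mapping chain — on-disk directory UUID → segment row → collection row — can also be expressed as a single join. A standalone sketch against ChromaDB's internal SQLite schema (an implementation detail that can change between Chroma versions):

import sqlite3

def vector_dir_to_collection(chroma_db_path: str) -> dict:
    """Map each VECTOR segment's directory UUID to its collection name."""
    with sqlite3.connect(chroma_db_path) as conn:
        rows = conn.execute(
            "SELECT s.id, c.name "
            "FROM segments s JOIN collections c ON s.collection = c.id "
            "WHERE s.scope = 'VECTOR'"
        ).fetchall()
    return dict(rows)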
deleted_count = 0
@@ -391,16 +323,12 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
dir_uuid = collection_dir.name
# Skip system/metadata files
if dir_uuid.startswith('.'):
continue
# Get the actual collection name from metadata
collection_name = uuid_to_collection.get(dir_uuid)
if collection_name is None:
# Directory exists but no metadata entry - it's orphaned
log.debug(f"Directory {dir_uuid} has no metadata entry, deleting")
try:
shutil.rmtree(collection_dir)
deleted_count += 1
@@ -408,18 +336,12 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}")
elif collection_name not in expected_collections:
# Collection exists but should be deleted
log.debug(f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting")
try:
shutil.rmtree(collection_dir)
deleted_count += 1
except Exception as e:
log.error(f"Failed to delete collection directory {dir_uuid}: {e}")
else:
# Collection should be preserved
log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})")
except Exception as e:
log.error(f"Error cleaning vector collections: {e}")
@@ -430,9 +352,6 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
"""
Clean up audio cache files older than specified days.
Args:
max_age_days: Delete audio files older than this many days. If None, skip audio cleanup.
"""
if max_age_days is None:
log.info("Skipping audio cache cleanup (max_age_days is None)")
@@ -442,7 +361,6 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
deleted_count = 0
total_size_deleted = 0
# Audio cache directories
audio_dirs = [
Path(CACHE_DIR) / "audio" / "speech",
Path(CACHE_DIR) / "audio" / "transcriptions"
@@ -450,7 +368,6 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
for audio_dir in audio_dirs:
if not audio_dir.exists():
log.debug(f"Audio directory does not exist: {audio_dir}")
continue
try:
@@ -458,7 +375,6 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
if not file_path.is_file():
continue
# Check file age
file_mtime = file_path.stat().st_mtime
if file_mtime < cutoff_time:
try:
@@ -466,7 +382,6 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
file_path.unlink()
deleted_count += 1
total_size_deleted += file_size
log.debug(f"Deleted old audio file: {file_path}")
except Exception as e:
log.error(f"Failed to delete audio file {file_path}: {e}")
@@ -516,23 +431,17 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
try:
log.info("Starting data pruning process")
# Stage 1: Delete old chats based on user criteria (optional)
if form_data.days is not None:
cutoff_time = int(time.time()) - (form_data.days * 86400)
chats_to_delete = []
for chat in Chats.get_chats():
if chat.updated_at < cutoff_time:
# Check exemption conditions
if form_data.exempt_archived_chats and chat.archived:
log.debug(f"Exempting archived chat: {chat.id}")
continue
if form_data.exempt_chats_in_folders and (getattr(chat, 'folder_id', None) is not None or getattr(chat, 'pinned', False)):
folder_status = f"folder_id: {getattr(chat, 'folder_id', None)}" if getattr(chat, 'folder_id', None) else "not in folder"
pinned_status = f"pinned: {getattr(chat, 'pinned', False)}"
log.debug(f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})")
continue
log.debug(f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}")
chats_to_delete.append(chat)
if chats_to_delete:
@@ -544,14 +453,12 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping chat deletion (days parameter is None)")
# Stage 2: Build ground truth of what should be preserved
log.info("Building preservation set")
# Get all active users
active_user_ids = {user.id for user in Users.get_users()["users"]}
log.info(f"Found {len(active_user_ids)} active users")
# Get all active knowledge bases and their file references
active_kb_ids = set()
knowledge_bases = Knowledges.get_knowledge_bases()
@@ -561,13 +468,11 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Found {len(active_kb_ids)} active knowledge bases")
# Get all files that should be preserved (NOW COMPREHENSIVE!)
active_file_ids = get_active_file_ids()
# Stage 3: Delete orphaned database records
log.info("Deleting orphaned database records")
# Delete files not referenced by any knowledge base or belonging to deleted users
deleted_files = 0
for file_record in Files.get_files():
should_delete = (
@@ -582,7 +487,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
if deleted_files > 0:
log.info(f"Deleted {deleted_files} orphaned files")
# Delete knowledge bases from deleted users (if enabled)
deleted_kbs = 0
if form_data.delete_orphaned_knowledge_bases:
for kb in knowledge_bases:
@@ -596,10 +500,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping knowledge base deletion (disabled)")
# Delete other user-owned resources from deleted users (conditional)
deleted_others = 0
# Delete orphaned chats of deleted users (conditional)
if form_data.delete_orphaned_chats:
chats_deleted = 0
for chat in Chats.get_chats():
@@ -612,7 +514,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping orphaned chat deletion (disabled)")
# Delete orphaned tools of deleted users (conditional)
if form_data.delete_orphaned_tools:
tools_deleted = 0
for tool in Tools.get_tools():
@@ -625,7 +526,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping tool deletion (disabled)")
# Delete orphaned functions of deleted users (conditional)
if form_data.delete_orphaned_functions:
functions_deleted = 0
for function in Functions.get_functions():
@@ -638,7 +538,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping function deletion (disabled)")
# Delete orphaned notes of deleted users (conditional)
if form_data.delete_orphaned_notes:
notes_deleted = 0
for note in Notes.get_notes():
@@ -651,7 +550,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping note deletion (disabled)")
# Delete orphaned prompts of deleted users (conditional)
if form_data.delete_orphaned_prompts:
prompts_deleted = 0
for prompt in Prompts.get_prompts():
@@ -664,7 +562,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping prompt deletion (disabled)")
# Delete orphaned models of deleted users (conditional)
if form_data.delete_orphaned_models:
models_deleted = 0
for model in Models.get_all_models():
@@ -677,7 +574,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping model deletion (disabled)")
# Delete orphaned folders of deleted users (conditional)
if form_data.delete_orphaned_folders:
folders_deleted = 0
for folder in Folders.get_all_folders():
@@ -696,14 +592,10 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
# Stage 4: Clean up orphaned physical files
log.info("Cleaning up orphaned physical files")
# Rebuild active sets after database cleanup
final_active_file_ids = get_active_file_ids()
final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()}
# Clean uploads directory
cleanup_orphaned_uploads(final_active_file_ids)
# Clean vector collections
cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)
# Stage 5: Audio cache cleanup
@@ -713,15 +605,12 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
# Stage 6: Database optimization
log.info("Optimizing database")
# Vacuum main database
try:
with get_db() as db:
db.execute(text("VACUUM"))
log.debug("Vacuumed main database")
except Exception as e:
log.error(f"Failed to vacuum main database: {e}")
# Vacuum ChromaDB database if it exists
if "chroma" in VECTOR_DB.lower():
chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
if chroma_db_path.exists():
@@ -729,7 +618,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
import sqlite3
with sqlite3.connect(str(chroma_db_path)) as conn:
conn.execute("VACUUM")
log.debug("Vacuumed ChromaDB database")
except Exception as e:
log.error(f"Failed to vacuum ChromaDB database: {e}")