From 34c9a8825cf3802318c73829a569eb57780ab352 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 12 Aug 2025 14:54:54 +0200 Subject: [PATCH] Update prune.py --- backend/open_webui/routers/prune.py | 479 +++++++++++++++++----------- 1 file changed, 299 insertions(+), 180 deletions(-) diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py index ca38951832..427c9586bd 100644 --- a/backend/open_webui/routers/prune.py +++ b/backend/open_webui/routers/prune.py @@ -38,6 +38,7 @@ class PruneDataForm(BaseModel): days: Optional[int] = None exempt_archived_chats: bool = False exempt_chats_in_folders: bool = False + # Orphaned resource deletion toggles (for deleted users) delete_orphaned_chats: bool = True delete_orphaned_tools: bool = False delete_orphaned_functions: bool = False @@ -46,30 +47,33 @@ class PruneDataForm(BaseModel): delete_orphaned_models: bool = True delete_orphaned_notes: bool = True delete_orphaned_folders: bool = True - audio_cache_max_age_days: Optional[int] = 30 def get_active_file_ids() -> Set[str]: """ Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages. + This is the ground truth for what files should be preserved. """ active_file_ids = set() - + try: - # Scan knowledge bases for file references + # 1. Get files referenced by knowledge bases (original logic) knowledge_bases = Knowledges.get_knowledge_bases() log.debug(f"Found {len(knowledge_bases)} knowledge bases") - + for kb in knowledge_bases: if not kb.data: continue - + + # Handle different possible data structures for file references file_ids = [] - + + # Check for file_ids array if isinstance(kb.data, dict) and "file_ids" in kb.data: if isinstance(kb.data["file_ids"], list): file_ids.extend(kb.data["file_ids"]) - + + # Check for files array with id field if isinstance(kb.data, dict) and "files" in kb.data: if isinstance(kb.data["files"], list): for file_ref in kb.data["files"]: @@ -77,97 +81,152 @@ def get_active_file_ids() -> Set[str]: file_ids.append(file_ref["id"]) elif isinstance(file_ref, str): file_ids.append(file_ref) - + + # Add all found file IDs for file_id in file_ids: if isinstance(file_id, str) and file_id.strip(): active_file_ids.add(file_id.strip()) + log.debug(f"KB {kb.id} references file {file_id}") - # Scan chats for file references + # 2. 
Get files referenced in chats (NEW: scan chat JSON for file references) chats = Chats.get_chats() log.debug(f"Found {len(chats)} chats to scan for file references") - + for chat in chats: if not chat.chat or not isinstance(chat.chat, dict): continue - + try: + # Convert entire chat JSON to string and extract all file IDs chat_json_str = json.dumps(chat.chat) - - # Extract file IDs using regex patterns - file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"') - url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})') - + + # Find all file ID patterns in the JSON + # Pattern 1: "id": "uuid" where uuid looks like a file ID + file_id_pattern = re.compile( + r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"' + ) potential_file_ids = file_id_pattern.findall(chat_json_str) + + # Pattern 2: URLs containing /api/v1/files/uuid + url_pattern = re.compile( + r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})" + ) url_file_ids = url_pattern.findall(chat_json_str) - + + # Combine and validate against actual file records all_potential_ids = set(potential_file_ids + url_file_ids) for file_id in all_potential_ids: + # Verify this ID exists in the file table to avoid false positives if Files.get_file_by_id(file_id): active_file_ids.add(file_id) - + log.debug(f"Chat {chat.id}: Found active file {file_id}") + except Exception as e: log.debug(f"Error processing chat {chat.id} for file references: {e}") - # Scan folders for file references + # 3. Get files referenced in folders (scan folder.items, folder.data, folder.meta) try: folders = Folders.get_all_folders() - + log.debug(f"Found {len(folders)} folders to scan for file references") + for folder in folders: + # Check folder.items JSON if folder.items: try: items_str = json.dumps(folder.items) - file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"') - url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})') - - potential_ids = file_id_pattern.findall(items_str) + url_pattern.findall(items_str) + # Look for file ID patterns in the JSON + file_id_pattern = re.compile( + r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"' + ) + url_pattern = re.compile( + r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})" + ) + + potential_ids = file_id_pattern.findall( + items_str + ) + url_pattern.findall(items_str) for file_id in potential_ids: if Files.get_file_by_id(file_id): active_file_ids.add(file_id) + log.debug( + f"Folder {folder.id}: Found file {file_id} in items" + ) except Exception as e: log.debug(f"Error processing folder {folder.id} items: {e}") - - if hasattr(folder, 'data') and folder.data: + + # Check folder.data JSON + if hasattr(folder, "data") and folder.data: try: data_str = json.dumps(folder.data) - file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"') - url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})') - - potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str) + file_id_pattern = re.compile( + 
r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"' + ) + url_pattern = re.compile( + r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})" + ) + + potential_ids = file_id_pattern.findall( + data_str + ) + url_pattern.findall(data_str) for file_id in potential_ids: if Files.get_file_by_id(file_id): active_file_ids.add(file_id) + log.debug( + f"Folder {folder.id}: Found file {file_id} in data" + ) except Exception as e: log.debug(f"Error processing folder {folder.id} data: {e}") - + except Exception as e: log.debug(f"Error scanning folders for file references: {e}") - # Scan standalone messages for file references + # 4. Get files referenced in standalone messages (message table) try: + # Query message table directly since we may not have a Messages model with get_db() as db: - message_results = db.execute(text("SELECT id, data FROM message WHERE data IS NOT NULL")).fetchall() - + message_results = db.execute( + text("SELECT id, data FROM message WHERE data IS NOT NULL") + ).fetchall() + log.debug(f"Found {len(message_results)} messages with data to scan") + for message_id, message_data_json in message_results: if message_data_json: try: - data_str = json.dumps(message_data_json) if isinstance(message_data_json, dict) else str(message_data_json) - - file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"') - url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})') - - potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str) + # Convert JSON to string and scan for file patterns + data_str = ( + json.dumps(message_data_json) + if isinstance(message_data_json, dict) + else str(message_data_json) + ) + + file_id_pattern = re.compile( + r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"' + ) + url_pattern = re.compile( + r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})" + ) + + potential_ids = file_id_pattern.findall( + data_str + ) + url_pattern.findall(data_str) for file_id in potential_ids: if Files.get_file_by_id(file_id): active_file_ids.add(file_id) + log.debug( + f"Message {message_id}: Found file {file_id}" + ) except Exception as e: - log.debug(f"Error processing message {message_id} data: {e}") + log.debug( + f"Error processing message {message_id} data: {e}" + ) except Exception as e: log.debug(f"Error scanning messages for file references: {e}") - + except Exception as e: log.error(f"Error determining active file IDs: {e}") + # Fail safe: return empty set, which will prevent deletion return set() - + log.info(f"Found {len(active_file_ids)} active file IDs") return active_file_ids @@ -177,19 +236,23 @@ def safe_delete_vector_collection(collection_name: str) -> bool: Safely delete a vector collection, handling both logical and physical cleanup. 
""" try: + # First, try to delete the collection through the client try: VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name) + log.debug(f"Deleted collection from vector DB: {collection_name}") except Exception as e: log.debug(f"Collection {collection_name} may not exist in DB: {e}") - + + # Then, handle physical cleanup for ChromaDB if "chroma" in VECTOR_DB.lower(): vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name if vector_dir.exists() and vector_dir.is_dir(): shutil.rmtree(vector_dir) + log.debug(f"Deleted physical vector directory: {vector_dir}") return True - + return True - + except Exception as e: log.error(f"Error deleting vector collection {collection_name}: {e}") return False @@ -200,17 +263,22 @@ def safe_delete_file_by_id(file_id: str) -> bool: Safely delete a file record and its associated vector collection. """ try: + # Get file info before deletion file_record = Files.get_file_by_id(file_id) if not file_record: - return True - + log.debug(f"File {file_id} not found in database") + return True # Already gone + + # Delete vector collection first collection_name = f"file-{file_id}" safe_delete_vector_collection(collection_name) - + + # Delete database record Files.delete_file_by_id(file_id) - + log.debug(f"Deleted file record: {file_id}") + return True - + except Exception as e: log.error(f"Error deleting file {file_id}: {e}") return False @@ -222,182 +290,197 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None: """ upload_dir = Path(CACHE_DIR).parent / "uploads" if not upload_dir.exists(): + log.debug("Uploads directory does not exist") return - + deleted_count = 0 - + try: for file_path in upload_dir.iterdir(): if not file_path.is_file(): continue - + filename = file_path.name + + # Extract file ID from filename (common patterns) file_id = None - - # Extract file ID from filename patterns + + # Pattern 1: UUID_filename or UUID-filename if len(filename) > 36: potential_id = filename[:36] - if potential_id.count('-') == 4: + if potential_id.count("-") == 4: # UUID format file_id = potential_id - - if not file_id and filename.count('-') == 4 and len(filename) == 36: + + # Pattern 2: filename might be the file ID itself + if not file_id and filename.count("-") == 4 and len(filename) == 36: file_id = filename - + + # Pattern 3: Check if any part of filename matches active IDs if not file_id: for active_id in active_file_ids: if active_id in filename: file_id = active_id break - + + # If we found a potential file ID and it's not active, delete it if file_id and file_id not in active_file_ids: try: file_path.unlink() deleted_count += 1 + log.debug(f"Deleted orphaned upload file: {filename}") except Exception as e: log.error(f"Failed to delete upload file {filename}: {e}") - + except Exception as e: log.error(f"Error cleaning uploads directory: {e}") - + if deleted_count > 0: log.info(f"Deleted {deleted_count} orphaned upload files") -def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids: Set[str]) -> None: +def cleanup_orphaned_vector_collections( + active_file_ids: Set[str], active_kb_ids: Set[str] +) -> None: """ Clean up orphaned vector collections by querying ChromaDB metadata. 
""" if "chroma" not in VECTOR_DB.lower(): return - + vector_dir = Path(CACHE_DIR).parent / "vector_db" if not vector_dir.exists(): + log.debug("Vector DB directory does not exist") return - + chroma_db_path = vector_dir / "chroma.sqlite3" if not chroma_db_path.exists(): + log.debug("ChromaDB metadata file does not exist") return - + + # Build expected collection names expected_collections = set() - + + # File collections: file-{file_id} for file_id in active_file_ids: expected_collections.add(f"file-{file_id}") - + + # Knowledge base collections: {kb_id} for kb_id in active_kb_ids: expected_collections.add(kb_id) - + + log.debug(f"Expected collections to preserve: {expected_collections}") + + # Query ChromaDB metadata to get the complete mapping chain: + # Directory UUID -> Collection ID -> Collection Name uuid_to_collection = {} try: import sqlite3 - + + log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}") + with sqlite3.connect(str(chroma_db_path)) as conn: + # First, check what tables exist + tables = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + ).fetchall() + log.debug(f"ChromaDB tables: {tables}") + + # Check the schema of collections table + schema = conn.execute("PRAGMA table_info(collections)").fetchall() + log.debug(f"Collections table schema: {schema}") + + # Get Collection ID -> Collection Name mapping collection_id_to_name = {} cursor = conn.execute("SELECT id, name FROM collections") rows = cursor.fetchall() - + log.debug(f"Raw ChromaDB collections query results: {rows}") + for row in rows: collection_id, collection_name = row collection_id_to_name[collection_id] = collection_name - - cursor = conn.execute("SELECT id, collection FROM segments WHERE scope = 'VECTOR'") + log.debug( + f"Mapped collection ID {collection_id} -> name {collection_name}" + ) + + # Get Directory UUID -> Collection ID mapping from segments table + # Only interested in VECTOR segments as those are the actual data directories + cursor = conn.execute( + "SELECT id, collection FROM segments WHERE scope = 'VECTOR'" + ) segment_rows = cursor.fetchall() - + log.debug(f"Raw ChromaDB segments query results: {segment_rows}") + for row in segment_rows: segment_id, collection_id = row if collection_id in collection_id_to_name: collection_name = collection_id_to_name[collection_id] uuid_to_collection[segment_id] = collection_name - - log.info(f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata") - + log.debug( + f"Mapped directory UUID {segment_id} -> collection {collection_name}" + ) + + log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}") + log.info( + f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata" + ) + except Exception as e: log.error(f"Error reading ChromaDB metadata: {e}") + # Fail safe: don't delete anything if we can't read metadata return - + deleted_count = 0 - + try: for collection_dir in vector_dir.iterdir(): if not collection_dir.is_dir(): continue - + dir_uuid = collection_dir.name - - if dir_uuid.startswith('.'): + + # Skip system/metadata files + if dir_uuid.startswith("."): continue - + + # Get the actual collection name from metadata collection_name = uuid_to_collection.get(dir_uuid) - + if collection_name is None: + # Directory exists but no metadata entry - it's orphaned + log.debug(f"Directory {dir_uuid} has no metadata entry, deleting") try: shutil.rmtree(collection_dir) deleted_count += 1 except Exception as e: log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}") - + elif 
collection_name not in expected_collections: + # Collection exists but should be deleted + log.debug( + f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting" + ) try: shutil.rmtree(collection_dir) deleted_count += 1 except Exception as e: log.error(f"Failed to delete collection directory {dir_uuid}: {e}") - + + else: + # Collection should be preserved + log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})") + except Exception as e: log.error(f"Error cleaning vector collections: {e}") - + if deleted_count > 0: log.info(f"Deleted {deleted_count} orphaned vector collections") -def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None: - """ - Clean up audio cache files older than specified days. - """ - if max_age_days is None: - log.info("Skipping audio cache cleanup (max_age_days is None)") - return - - cutoff_time = time.time() - (max_age_days * 86400) - deleted_count = 0 - total_size_deleted = 0 - - audio_dirs = [ - Path(CACHE_DIR) / "audio" / "speech", - Path(CACHE_DIR) / "audio" / "transcriptions" - ] - - for audio_dir in audio_dirs: - if not audio_dir.exists(): - continue - - try: - for file_path in audio_dir.iterdir(): - if not file_path.is_file(): - continue - - file_mtime = file_path.stat().st_mtime - if file_mtime < cutoff_time: - try: - file_size = file_path.stat().st_size - file_path.unlink() - deleted_count += 1 - total_size_deleted += file_size - except Exception as e: - log.error(f"Failed to delete audio file {file_path}: {e}") - - except Exception as e: - log.error(f"Error cleaning audio directory {audio_dir}: {e}") - - if deleted_count > 0: - size_mb = total_size_deleted / (1024 * 1024) - log.info(f"Deleted {deleted_count} audio cache files ({size_mb:.1f} MB), older than {max_age_days} days") - - @router.post("/", response_model=bool) async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): """ Prunes old and orphaned data using a safe, multi-stage process. 
- + Parameters: - days: Optional[int] = None - If None: Skip chat deletion entirely @@ -424,69 +507,90 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): - If True: Delete notes from deleted users - delete_orphaned_folders: bool = True - If True: Delete folders from deleted users - - audio_cache_max_age_days: Optional[int] = 30 - - If None: Skip audio cache cleanup - - If >= 0: Delete audio cache files (TTS, STT) older than specified days """ try: log.info("Starting data pruning process") - - # Stage 1: Delete old chats based on user criteria + + # Stage 1: Delete old chats based on user criteria (optional) if form_data.days is not None: cutoff_time = int(time.time()) - (form_data.days * 86400) chats_to_delete = [] - + for chat in Chats.get_chats(): if chat.updated_at < cutoff_time: + # Check exemption conditions if form_data.exempt_archived_chats and chat.archived: + log.debug(f"Exempting archived chat: {chat.id}") continue - if form_data.exempt_chats_in_folders and (getattr(chat, 'folder_id', None) is not None or getattr(chat, 'pinned', False)): + if form_data.exempt_chats_in_folders and ( + getattr(chat, "folder_id", None) is not None + or getattr(chat, "pinned", False) + ): + folder_status = ( + f"folder_id: {getattr(chat, 'folder_id', None)}" + if getattr(chat, "folder_id", None) + else "not in folder" + ) + pinned_status = f"pinned: {getattr(chat, 'pinned', False)}" + log.debug( + f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})" + ) continue + log.debug( + f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}" + ) chats_to_delete.append(chat) - + if chats_to_delete: - log.info(f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)") + log.info( + f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)" + ) for chat in chats_to_delete: Chats.delete_chat_by_id(chat.id) else: log.info(f"No chats found older than {form_data.days} days") else: log.info("Skipping chat deletion (days parameter is None)") - - # Stage 2: Build preservation set + + # Stage 2: Build ground truth of what should be preserved log.info("Building preservation set") - + + # Get all active users active_user_ids = {user.id for user in Users.get_users()["users"]} log.info(f"Found {len(active_user_ids)} active users") - + + # Get all active knowledge bases and their file references active_kb_ids = set() knowledge_bases = Knowledges.get_knowledge_bases() - + for kb in knowledge_bases: if kb.user_id in active_user_ids: active_kb_ids.add(kb.id) - + log.info(f"Found {len(active_kb_ids)} active knowledge bases") - + + # Get all files that should be preserved (NOW COMPREHENSIVE!) 
active_file_ids = get_active_file_ids() - + # Stage 3: Delete orphaned database records log.info("Deleting orphaned database records") - + + # Delete files not referenced by any knowledge base or belonging to deleted users deleted_files = 0 for file_record in Files.get_files(): should_delete = ( - file_record.id not in active_file_ids or - file_record.user_id not in active_user_ids + file_record.id not in active_file_ids + or file_record.user_id not in active_user_ids ) - + if should_delete: if safe_delete_file_by_id(file_record.id): deleted_files += 1 - + if deleted_files > 0: log.info(f"Deleted {deleted_files} orphaned files") - + + # Delete knowledge bases from deleted users (if enabled) deleted_kbs = 0 if form_data.delete_orphaned_knowledge_bases: for kb in knowledge_bases: @@ -494,14 +598,16 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): if safe_delete_vector_collection(kb.id): Knowledges.delete_knowledge_by_id(kb.id) deleted_kbs += 1 - + if deleted_kbs > 0: log.info(f"Deleted {deleted_kbs} orphaned knowledge bases") else: log.info("Skipping knowledge base deletion (disabled)") - + + # Delete other user-owned resources from deleted users (conditional) deleted_others = 0 - + + # Delete orphaned chats of deleted users (conditional) if form_data.delete_orphaned_chats: chats_deleted = 0 for chat in Chats.get_chats(): @@ -513,7 +619,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): log.info(f"Deleted {chats_deleted} orphaned chats") else: log.info("Skipping orphaned chat deletion (disabled)") - + + # Delete orphaned tools of deleted users (conditional) if form_data.delete_orphaned_tools: tools_deleted = 0 for tool in Tools.get_tools(): @@ -525,7 +632,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): log.info(f"Deleted {tools_deleted} orphaned tools") else: log.info("Skipping tool deletion (disabled)") - + + # Delete orphaned functions of deleted users (conditional) if form_data.delete_orphaned_functions: functions_deleted = 0 for function in Functions.get_functions(): @@ -537,7 +645,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): log.info(f"Deleted {functions_deleted} orphaned functions") else: log.info("Skipping function deletion (disabled)") - + + # Delete orphaned notes of deleted users (conditional) if form_data.delete_orphaned_notes: notes_deleted = 0 for note in Notes.get_notes(): @@ -549,7 +658,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): log.info(f"Deleted {notes_deleted} orphaned notes") else: log.info("Skipping note deletion (disabled)") - + + # Delete orphaned prompts of deleted users (conditional) if form_data.delete_orphaned_prompts: prompts_deleted = 0 for prompt in Prompts.get_prompts(): @@ -561,7 +671,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): log.info(f"Deleted {prompts_deleted} orphaned prompts") else: log.info("Skipping prompt deletion (disabled)") - + + # Delete orphaned models of deleted users (conditional) if form_data.delete_orphaned_models: models_deleted = 0 for model in Models.get_all_models(): @@ -573,57 +684,65 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): log.info(f"Deleted {models_deleted} orphaned models") else: log.info("Skipping model deletion (disabled)") - + + # Delete orphaned folders of deleted users (conditional) if form_data.delete_orphaned_folders: folders_deleted = 0 for folder in 
Folders.get_all_folders(): if folder.user_id not in active_user_ids: - Folders.delete_folder_by_id_and_user_id(folder.id, folder.user_id, delete_chats=False) + Folders.delete_folder_by_id_and_user_id( + folder.id, folder.user_id, delete_chats=False + ) folders_deleted += 1 deleted_others += 1 if folders_deleted > 0: log.info(f"Deleted {folders_deleted} orphaned folders") else: log.info("Skipping folder deletion (disabled)") - + if deleted_others > 0: log.info(f"Total other orphaned records deleted: {deleted_others}") - + # Stage 4: Clean up orphaned physical files log.info("Cleaning up orphaned physical files") - + + # Rebuild active sets after database cleanup final_active_file_ids = get_active_file_ids() final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()} - + + # Clean uploads directory cleanup_orphaned_uploads(final_active_file_ids) + + # Clean vector collections cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids) - - # Stage 5: Audio cache cleanup - log.info("Cleaning audio cache") - cleanup_audio_cache(form_data.audio_cache_max_age_days) - - # Stage 6: Database optimization + + # Stage 5: Database optimization log.info("Optimizing database") - + + # Vacuum main database try: with get_db() as db: db.execute(text("VACUUM")) + log.debug("Vacuumed main database") except Exception as e: log.error(f"Failed to vacuum main database: {e}") - + + # Vacuum ChromaDB database if it exists if "chroma" in VECTOR_DB.lower(): chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3" if chroma_db_path.exists(): try: import sqlite3 + with sqlite3.connect(str(chroma_db_path)) as conn: conn.execute("VACUUM") + log.debug("Vacuumed ChromaDB database") except Exception as e: log.error(f"Failed to vacuum ChromaDB database: {e}") - + log.info("Data pruning completed successfully") return True - + except Exception as e: log.exception(f"Error during data pruning: {e}") raise HTTPException(
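
Example (illustrative only): a minimal sketch of invoking the updated prune endpoint with the fields defined by PruneDataForm in this patch. The base URL, route prefix, and ADMIN_TOKEN below are assumptions and not part of the patch; the payload keys and their defaults are taken from the model itself.

import requests

ADMIN_TOKEN = "..."  # hypothetical admin API token; supply a real one
BASE_URL = "http://localhost:8080/api/v1/prune/"  # assumed mount point for this router

payload = {
    "days": 90,                       # prune chats older than 90 days; None skips chat deletion
    "exempt_archived_chats": True,    # keep archived chats
    "exempt_chats_in_folders": True,  # keep chats in folders and pinned chats
    "delete_orphaned_chats": True,
    "delete_orphaned_tools": False,
    "delete_orphaned_functions": False,
    "delete_orphaned_prompts": False,
    "delete_orphaned_knowledge_bases": True,
    "delete_orphaned_models": True,
    "delete_orphaned_notes": True,
    "delete_orphaned_folders": True,
}

response = requests.post(
    BASE_URL,
    json=payload,
    headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
)
print(response.status_code, response.json())  # expect 200 and a bare `true` on success

Note that the route is declared with user=Depends(get_admin_user), so the request must be authenticated as an admin for the prune to run.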