Update prune.py

Classic298 2025-08-12 14:54:54 +02:00 committed by GitHub
parent 709c852917
commit 34c9a8825c


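For orientation, this is roughly how an admin client would call the prune endpoint this file implements. The mount path, base URL, and bearer token below are assumptions for illustration only; the field names come from PruneDataForm in the diff that follows.

    # Hypothetical request against the prune router; the /api/v1/prune/ path is an assumption.
    import httpx

    payload = {
        "days": 90,                       # delete chats not updated in the last 90 days
        "exempt_archived_chats": True,    # keep archived chats even if old
        "exempt_chats_in_folders": True,  # keep chats that are in folders or pinned
        "delete_orphaned_chats": True,    # also drop chats owned by deleted users
    }
    response = httpx.post(
        "http://localhost:8080/api/v1/prune/",              # assumed base URL and mount point
        json=payload,
        headers={"Authorization": "Bearer <admin-token>"},   # the endpoint requires an admin user
    )
    print(response.json())  # the endpoint returns a plain boolean
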
@@ -38,6 +38,7 @@ class PruneDataForm(BaseModel):
     days: Optional[int] = None
     exempt_archived_chats: bool = False
     exempt_chats_in_folders: bool = False
+    # Orphaned resource deletion toggles (for deleted users)
     delete_orphaned_chats: bool = True
     delete_orphaned_tools: bool = False
     delete_orphaned_functions: bool = False
@@ -46,30 +47,33 @@ class PruneDataForm(BaseModel):
     delete_orphaned_models: bool = True
     delete_orphaned_notes: bool = True
     delete_orphaned_folders: bool = True
-    audio_cache_max_age_days: Optional[int] = 30
 def get_active_file_ids() -> Set[str]:
     """
     Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
+    This is the ground truth for what files should be preserved.
     """
     active_file_ids = set()
     try:
-        # Scan knowledge bases for file references
+        # 1. Get files referenced by knowledge bases (original logic)
         knowledge_bases = Knowledges.get_knowledge_bases()
         log.debug(f"Found {len(knowledge_bases)} knowledge bases")
         for kb in knowledge_bases:
             if not kb.data:
                 continue
+            # Handle different possible data structures for file references
             file_ids = []
+            # Check for file_ids array
             if isinstance(kb.data, dict) and "file_ids" in kb.data:
                 if isinstance(kb.data["file_ids"], list):
                     file_ids.extend(kb.data["file_ids"])
+            # Check for files array with id field
             if isinstance(kb.data, dict) and "files" in kb.data:
                 if isinstance(kb.data["files"], list):
                     for file_ref in kb.data["files"]:
@@ -77,97 +81,152 @@ def get_active_file_ids() -> Set[str]:
                             file_ids.append(file_ref["id"])
                         elif isinstance(file_ref, str):
                             file_ids.append(file_ref)
+            # Add all found file IDs
             for file_id in file_ids:
                 if isinstance(file_id, str) and file_id.strip():
                     active_file_ids.add(file_id.strip())
+                    log.debug(f"KB {kb.id} references file {file_id}")
-        # Scan chats for file references
+        # 2. Get files referenced in chats (NEW: scan chat JSON for file references)
         chats = Chats.get_chats()
         log.debug(f"Found {len(chats)} chats to scan for file references")
         for chat in chats:
             if not chat.chat or not isinstance(chat.chat, dict):
                 continue
             try:
+                # Convert entire chat JSON to string and extract all file IDs
                 chat_json_str = json.dumps(chat.chat)
-                # Extract file IDs using regex patterns
-                file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
-                url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+                # Find all file ID patterns in the JSON
+                # Pattern 1: "id": "uuid" where uuid looks like a file ID
+                file_id_pattern = re.compile(
+                    r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+                )
                 potential_file_ids = file_id_pattern.findall(chat_json_str)
+                # Pattern 2: URLs containing /api/v1/files/uuid
+                url_pattern = re.compile(
+                    r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+                )
                 url_file_ids = url_pattern.findall(chat_json_str)
+                # Combine and validate against actual file records
                 all_potential_ids = set(potential_file_ids + url_file_ids)
                 for file_id in all_potential_ids:
+                    # Verify this ID exists in the file table to avoid false positives
                     if Files.get_file_by_id(file_id):
                         active_file_ids.add(file_id)
+                        log.debug(f"Chat {chat.id}: Found active file {file_id}")
             except Exception as e:
                 log.debug(f"Error processing chat {chat.id} for file references: {e}")
-        # Scan folders for file references
+        # 3. Get files referenced in folders (scan folder.items, folder.data, folder.meta)
         try:
             folders = Folders.get_all_folders()
+            log.debug(f"Found {len(folders)} folders to scan for file references")
             for folder in folders:
+                # Check folder.items JSON
                 if folder.items:
                     try:
                         items_str = json.dumps(folder.items)
-                        file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
-                        url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
-                        potential_ids = file_id_pattern.findall(items_str) + url_pattern.findall(items_str)
+                        # Look for file ID patterns in the JSON
+                        file_id_pattern = re.compile(
+                            r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+                        )
+                        url_pattern = re.compile(
+                            r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+                        )
+                        potential_ids = file_id_pattern.findall(
+                            items_str
+                        ) + url_pattern.findall(items_str)
                         for file_id in potential_ids:
                             if Files.get_file_by_id(file_id):
                                 active_file_ids.add(file_id)
+                                log.debug(
+                                    f"Folder {folder.id}: Found file {file_id} in items"
+                                )
                     except Exception as e:
                         log.debug(f"Error processing folder {folder.id} items: {e}")
-                if hasattr(folder, 'data') and folder.data:
+                # Check folder.data JSON
+                if hasattr(folder, "data") and folder.data:
                     try:
                         data_str = json.dumps(folder.data)
-                        file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
-                        url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
-                        potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str)
+                        file_id_pattern = re.compile(
+                            r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+                        )
+                        url_pattern = re.compile(
+                            r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+                        )
+                        potential_ids = file_id_pattern.findall(
+                            data_str
+                        ) + url_pattern.findall(data_str)
                         for file_id in potential_ids:
                             if Files.get_file_by_id(file_id):
                                 active_file_ids.add(file_id)
+                                log.debug(
+                                    f"Folder {folder.id}: Found file {file_id} in data"
+                                )
                     except Exception as e:
                         log.debug(f"Error processing folder {folder.id} data: {e}")
         except Exception as e:
             log.debug(f"Error scanning folders for file references: {e}")
-        # Scan standalone messages for file references
+        # 4. Get files referenced in standalone messages (message table)
         try:
+            # Query message table directly since we may not have a Messages model
             with get_db() as db:
-                message_results = db.execute(text("SELECT id, data FROM message WHERE data IS NOT NULL")).fetchall()
+                message_results = db.execute(
+                    text("SELECT id, data FROM message WHERE data IS NOT NULL")
+                ).fetchall()
+                log.debug(f"Found {len(message_results)} messages with data to scan")
                 for message_id, message_data_json in message_results:
                     if message_data_json:
                         try:
-                            data_str = json.dumps(message_data_json) if isinstance(message_data_json, dict) else str(message_data_json)
-                            file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
-                            url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
-                            potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str)
+                            # Convert JSON to string and scan for file patterns
+                            data_str = (
+                                json.dumps(message_data_json)
+                                if isinstance(message_data_json, dict)
+                                else str(message_data_json)
+                            )
+                            file_id_pattern = re.compile(
+                                r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+                            )
+                            url_pattern = re.compile(
+                                r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+                            )
+                            potential_ids = file_id_pattern.findall(
+                                data_str
+                            ) + url_pattern.findall(data_str)
                             for file_id in potential_ids:
                                 if Files.get_file_by_id(file_id):
                                     active_file_ids.add(file_id)
+                                    log.debug(
+                                        f"Message {message_id}: Found file {file_id}"
+                                    )
                         except Exception as e:
-                            log.debug(f"Error processing message {message_id} data: {e}")
+                            log.debug(
+                                f"Error processing message {message_id} data: {e}"
+                            )
         except Exception as e:
             log.debug(f"Error scanning messages for file references: {e}")
     except Exception as e:
         log.error(f"Error determining active file IDs: {e}")
+        # Fail safe: return empty set, which will prevent deletion
         return set()
     log.info(f"Found {len(active_file_ids)} active file IDs")
     return active_file_ids
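As a side note, the ID-extraction idea used above can be exercised on its own; the following minimal sketch reuses the same two regexes from get_active_file_ids, while the sample chat payload is invented purely for illustration:

    import json
    import re

    UUID = r"[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}"
    id_pattern = re.compile(r'"id":\s*"(' + UUID + r')"')
    url_pattern = re.compile(r"/api/v1/files/(" + UUID + r")")

    # Made-up chat record containing one file reference in two different shapes
    chat = {
        "messages": [
            {"files": [{"id": "123e4567-e89b-12d3-a456-426614174000"}]},
            {"content": "![img](/api/v1/files/123e4567-e89b-12d3-a456-426614174000/content)"},
        ]
    }
    chat_json = json.dumps(chat)
    found = set(id_pattern.findall(chat_json)) | set(url_pattern.findall(chat_json))
    print(found)  # {'123e4567-e89b-12d3-a456-426614174000'}

In the real function each candidate is additionally checked with Files.get_file_by_id(), so other IDs that happen to have the UUID shape are not mistaken for file IDs.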
@@ -177,19 +236,23 @@ def safe_delete_vector_collection(collection_name: str) -> bool:
     Safely delete a vector collection, handling both logical and physical cleanup.
     """
     try:
+        # First, try to delete the collection through the client
         try:
             VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
+            log.debug(f"Deleted collection from vector DB: {collection_name}")
         except Exception as e:
             log.debug(f"Collection {collection_name} may not exist in DB: {e}")
+        # Then, handle physical cleanup for ChromaDB
         if "chroma" in VECTOR_DB.lower():
             vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name
             if vector_dir.exists() and vector_dir.is_dir():
                 shutil.rmtree(vector_dir)
+                log.debug(f"Deleted physical vector directory: {vector_dir}")
                 return True
         return True
     except Exception as e:
         log.error(f"Error deleting vector collection {collection_name}: {e}")
         return False
@@ -200,17 +263,22 @@ def safe_delete_file_by_id(file_id: str) -> bool:
     Safely delete a file record and its associated vector collection.
     """
     try:
+        # Get file info before deletion
         file_record = Files.get_file_by_id(file_id)
         if not file_record:
-            return True
+            log.debug(f"File {file_id} not found in database")
+            return True  # Already gone
+        # Delete vector collection first
         collection_name = f"file-{file_id}"
         safe_delete_vector_collection(collection_name)
+        # Delete database record
         Files.delete_file_by_id(file_id)
+        log.debug(f"Deleted file record: {file_id}")
         return True
     except Exception as e:
         log.error(f"Error deleting file {file_id}: {e}")
         return False
@@ -222,182 +290,197 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
     """
     upload_dir = Path(CACHE_DIR).parent / "uploads"
     if not upload_dir.exists():
+        log.debug("Uploads directory does not exist")
         return
     deleted_count = 0
     try:
         for file_path in upload_dir.iterdir():
             if not file_path.is_file():
                 continue
             filename = file_path.name
+            # Extract file ID from filename (common patterns)
             file_id = None
-            # Extract file ID from filename patterns
+            # Pattern 1: UUID_filename or UUID-filename
             if len(filename) > 36:
                 potential_id = filename[:36]
-                if potential_id.count('-') == 4:
+                if potential_id.count("-") == 4:  # UUID format
                     file_id = potential_id
-            if not file_id and filename.count('-') == 4 and len(filename) == 36:
+            # Pattern 2: filename might be the file ID itself
+            if not file_id and filename.count("-") == 4 and len(filename) == 36:
                 file_id = filename
+            # Pattern 3: Check if any part of filename matches active IDs
             if not file_id:
                 for active_id in active_file_ids:
                     if active_id in filename:
                         file_id = active_id
                         break
+            # If we found a potential file ID and it's not active, delete it
             if file_id and file_id not in active_file_ids:
                 try:
                     file_path.unlink()
                     deleted_count += 1
+                    log.debug(f"Deleted orphaned upload file: {filename}")
                 except Exception as e:
                     log.error(f"Failed to delete upload file {filename}: {e}")
     except Exception as e:
         log.error(f"Error cleaning uploads directory: {e}")
     if deleted_count > 0:
         log.info(f"Deleted {deleted_count} orphaned upload files")
-def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids: Set[str]) -> None:
+def cleanup_orphaned_vector_collections(
+    active_file_ids: Set[str], active_kb_ids: Set[str]
+) -> None:
     """
     Clean up orphaned vector collections by querying ChromaDB metadata.
     """
     if "chroma" not in VECTOR_DB.lower():
         return
     vector_dir = Path(CACHE_DIR).parent / "vector_db"
     if not vector_dir.exists():
+        log.debug("Vector DB directory does not exist")
         return
     chroma_db_path = vector_dir / "chroma.sqlite3"
     if not chroma_db_path.exists():
+        log.debug("ChromaDB metadata file does not exist")
         return
+    # Build expected collection names
     expected_collections = set()
+    # File collections: file-{file_id}
     for file_id in active_file_ids:
         expected_collections.add(f"file-{file_id}")
+    # Knowledge base collections: {kb_id}
     for kb_id in active_kb_ids:
         expected_collections.add(kb_id)
+    log.debug(f"Expected collections to preserve: {expected_collections}")
+    # Query ChromaDB metadata to get the complete mapping chain:
+    # Directory UUID -> Collection ID -> Collection Name
     uuid_to_collection = {}
     try:
         import sqlite3
+        log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}")
         with sqlite3.connect(str(chroma_db_path)) as conn:
+            # First, check what tables exist
+            tables = conn.execute(
+                "SELECT name FROM sqlite_master WHERE type='table'"
+            ).fetchall()
+            log.debug(f"ChromaDB tables: {tables}")
+            # Check the schema of collections table
+            schema = conn.execute("PRAGMA table_info(collections)").fetchall()
+            log.debug(f"Collections table schema: {schema}")
+            # Get Collection ID -> Collection Name mapping
             collection_id_to_name = {}
             cursor = conn.execute("SELECT id, name FROM collections")
             rows = cursor.fetchall()
+            log.debug(f"Raw ChromaDB collections query results: {rows}")
             for row in rows:
                 collection_id, collection_name = row
                 collection_id_to_name[collection_id] = collection_name
+                log.debug(
+                    f"Mapped collection ID {collection_id} -> name {collection_name}"
+                )
-            cursor = conn.execute("SELECT id, collection FROM segments WHERE scope = 'VECTOR'")
+            # Get Directory UUID -> Collection ID mapping from segments table
+            # Only interested in VECTOR segments as those are the actual data directories
+            cursor = conn.execute(
+                "SELECT id, collection FROM segments WHERE scope = 'VECTOR'"
+            )
             segment_rows = cursor.fetchall()
+            log.debug(f"Raw ChromaDB segments query results: {segment_rows}")
             for row in segment_rows:
                 segment_id, collection_id = row
                 if collection_id in collection_id_to_name:
                     collection_name = collection_id_to_name[collection_id]
                     uuid_to_collection[segment_id] = collection_name
+                    log.debug(
+                        f"Mapped directory UUID {segment_id} -> collection {collection_name}"
+                    )
-        log.info(f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata")
+        log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}")
+        log.info(
+            f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata"
+        )
     except Exception as e:
         log.error(f"Error reading ChromaDB metadata: {e}")
+        # Fail safe: don't delete anything if we can't read metadata
         return
     deleted_count = 0
     try:
         for collection_dir in vector_dir.iterdir():
             if not collection_dir.is_dir():
                 continue
             dir_uuid = collection_dir.name
-            if dir_uuid.startswith('.'):
+            # Skip system/metadata files
+            if dir_uuid.startswith("."):
                 continue
+            # Get the actual collection name from metadata
             collection_name = uuid_to_collection.get(dir_uuid)
             if collection_name is None:
+                # Directory exists but no metadata entry - it's orphaned
+                log.debug(f"Directory {dir_uuid} has no metadata entry, deleting")
                 try:
                     shutil.rmtree(collection_dir)
                     deleted_count += 1
                 except Exception as e:
                     log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}")
             elif collection_name not in expected_collections:
+                # Collection exists but should be deleted
+                log.debug(
+                    f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting"
+                )
                 try:
                     shutil.rmtree(collection_dir)
                     deleted_count += 1
                 except Exception as e:
                     log.error(f"Failed to delete collection directory {dir_uuid}: {e}")
+            else:
+                # Collection should be preserved
+                log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})")
     except Exception as e:
         log.error(f"Error cleaning vector collections: {e}")
     if deleted_count > 0:
         log.info(f"Deleted {deleted_count} orphaned vector collections")
-def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
-    """
-    Clean up audio cache files older than specified days.
-    """
-    if max_age_days is None:
-        log.info("Skipping audio cache cleanup (max_age_days is None)")
-        return
-    cutoff_time = time.time() - (max_age_days * 86400)
-    deleted_count = 0
-    total_size_deleted = 0
-    audio_dirs = [
-        Path(CACHE_DIR) / "audio" / "speech",
-        Path(CACHE_DIR) / "audio" / "transcriptions"
-    ]
-    for audio_dir in audio_dirs:
-        if not audio_dir.exists():
-            continue
-        try:
-            for file_path in audio_dir.iterdir():
-                if not file_path.is_file():
-                    continue
-                file_mtime = file_path.stat().st_mtime
-                if file_mtime < cutoff_time:
-                    try:
-                        file_size = file_path.stat().st_size
-                        file_path.unlink()
-                        deleted_count += 1
-                        total_size_deleted += file_size
-                    except Exception as e:
-                        log.error(f"Failed to delete audio file {file_path}: {e}")
-        except Exception as e:
-            log.error(f"Error cleaning audio directory {audio_dir}: {e}")
-    if deleted_count > 0:
-        size_mb = total_size_deleted / (1024 * 1024)
-        log.info(f"Deleted {deleted_count} audio cache files ({size_mb:.1f} MB), older than {max_age_days} days")
 @router.post("/", response_model=bool)
 async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
     """
     Prunes old and orphaned data using a safe, multi-stage process.
     Parameters:
     - days: Optional[int] = None
         - If None: Skip chat deletion entirely
@@ -424,69 +507,90 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         - If True: Delete notes from deleted users
     - delete_orphaned_folders: bool = True
         - If True: Delete folders from deleted users
-    - audio_cache_max_age_days: Optional[int] = 30
-        - If None: Skip audio cache cleanup
-        - If >= 0: Delete audio cache files (TTS, STT) older than specified days
     """
     try:
         log.info("Starting data pruning process")
-        # Stage 1: Delete old chats based on user criteria
+        # Stage 1: Delete old chats based on user criteria (optional)
         if form_data.days is not None:
             cutoff_time = int(time.time()) - (form_data.days * 86400)
             chats_to_delete = []
             for chat in Chats.get_chats():
                 if chat.updated_at < cutoff_time:
+                    # Check exemption conditions
                     if form_data.exempt_archived_chats and chat.archived:
+                        log.debug(f"Exempting archived chat: {chat.id}")
                         continue
-                    if form_data.exempt_chats_in_folders and (getattr(chat, 'folder_id', None) is not None or getattr(chat, 'pinned', False)):
+                    if form_data.exempt_chats_in_folders and (
+                        getattr(chat, "folder_id", None) is not None
+                        or getattr(chat, "pinned", False)
+                    ):
+                        folder_status = (
+                            f"folder_id: {getattr(chat, 'folder_id', None)}"
+                            if getattr(chat, "folder_id", None)
+                            else "not in folder"
+                        )
+                        pinned_status = f"pinned: {getattr(chat, 'pinned', False)}"
+                        log.debug(
+                            f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})"
+                        )
                         continue
+                    log.debug(
+                        f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}"
+                    )
                     chats_to_delete.append(chat)
             if chats_to_delete:
-                log.info(f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)")
+                log.info(
+                    f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)"
+                )
                 for chat in chats_to_delete:
                     Chats.delete_chat_by_id(chat.id)
             else:
                 log.info(f"No chats found older than {form_data.days} days")
         else:
             log.info("Skipping chat deletion (days parameter is None)")
-        # Stage 2: Build preservation set
+        # Stage 2: Build ground truth of what should be preserved
         log.info("Building preservation set")
+        # Get all active users
         active_user_ids = {user.id for user in Users.get_users()["users"]}
         log.info(f"Found {len(active_user_ids)} active users")
+        # Get all active knowledge bases and their file references
         active_kb_ids = set()
         knowledge_bases = Knowledges.get_knowledge_bases()
         for kb in knowledge_bases:
             if kb.user_id in active_user_ids:
                 active_kb_ids.add(kb.id)
         log.info(f"Found {len(active_kb_ids)} active knowledge bases")
+        # Get all files that should be preserved (NOW COMPREHENSIVE!)
         active_file_ids = get_active_file_ids()
         # Stage 3: Delete orphaned database records
         log.info("Deleting orphaned database records")
+        # Delete files not referenced by any knowledge base or belonging to deleted users
         deleted_files = 0
         for file_record in Files.get_files():
             should_delete = (
-                file_record.id not in active_file_ids or
-                file_record.user_id not in active_user_ids
+                file_record.id not in active_file_ids
+                or file_record.user_id not in active_user_ids
             )
             if should_delete:
                 if safe_delete_file_by_id(file_record.id):
                     deleted_files += 1
         if deleted_files > 0:
             log.info(f"Deleted {deleted_files} orphaned files")
+        # Delete knowledge bases from deleted users (if enabled)
         deleted_kbs = 0
         if form_data.delete_orphaned_knowledge_bases:
             for kb in knowledge_bases:
@@ -494,14 +598,16 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
                     if safe_delete_vector_collection(kb.id):
                         Knowledges.delete_knowledge_by_id(kb.id)
                         deleted_kbs += 1
             if deleted_kbs > 0:
                 log.info(f"Deleted {deleted_kbs} orphaned knowledge bases")
         else:
             log.info("Skipping knowledge base deletion (disabled)")
+        # Delete other user-owned resources from deleted users (conditional)
         deleted_others = 0
+        # Delete orphaned chats of deleted users (conditional)
         if form_data.delete_orphaned_chats:
             chats_deleted = 0
             for chat in Chats.get_chats():
@@ -513,7 +619,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
                 log.info(f"Deleted {chats_deleted} orphaned chats")
         else:
             log.info("Skipping orphaned chat deletion (disabled)")
+        # Delete orphaned tools of deleted users (conditional)
         if form_data.delete_orphaned_tools:
             tools_deleted = 0
             for tool in Tools.get_tools():
@@ -525,7 +632,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
                 log.info(f"Deleted {tools_deleted} orphaned tools")
         else:
             log.info("Skipping tool deletion (disabled)")
+        # Delete orphaned functions of deleted users (conditional)
         if form_data.delete_orphaned_functions:
             functions_deleted = 0
             for function in Functions.get_functions():
@@ -537,7 +645,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
                 log.info(f"Deleted {functions_deleted} orphaned functions")
         else:
             log.info("Skipping function deletion (disabled)")
+        # Delete orphaned notes of deleted users (conditional)
         if form_data.delete_orphaned_notes:
             notes_deleted = 0
             for note in Notes.get_notes():
@@ -549,7 +658,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
                 log.info(f"Deleted {notes_deleted} orphaned notes")
         else:
             log.info("Skipping note deletion (disabled)")
+        # Delete orphaned prompts of deleted users (conditional)
         if form_data.delete_orphaned_prompts:
             prompts_deleted = 0
             for prompt in Prompts.get_prompts():
@@ -561,7 +671,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
                 log.info(f"Deleted {prompts_deleted} orphaned prompts")
         else:
             log.info("Skipping prompt deletion (disabled)")
+        # Delete orphaned models of deleted users (conditional)
         if form_data.delete_orphaned_models:
             models_deleted = 0
             for model in Models.get_all_models():
@@ -573,57 +684,65 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
                 log.info(f"Deleted {models_deleted} orphaned models")
         else:
             log.info("Skipping model deletion (disabled)")
+        # Delete orphaned folders of deleted users (conditional)
         if form_data.delete_orphaned_folders:
             folders_deleted = 0
             for folder in Folders.get_all_folders():
                 if folder.user_id not in active_user_ids:
-                    Folders.delete_folder_by_id_and_user_id(folder.id, folder.user_id, delete_chats=False)
+                    Folders.delete_folder_by_id_and_user_id(
+                        folder.id, folder.user_id, delete_chats=False
+                    )
                     folders_deleted += 1
                     deleted_others += 1
             if folders_deleted > 0:
                 log.info(f"Deleted {folders_deleted} orphaned folders")
         else:
             log.info("Skipping folder deletion (disabled)")
         if deleted_others > 0:
             log.info(f"Total other orphaned records deleted: {deleted_others}")
         # Stage 4: Clean up orphaned physical files
         log.info("Cleaning up orphaned physical files")
+        # Rebuild active sets after database cleanup
         final_active_file_ids = get_active_file_ids()
         final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()}
+        # Clean uploads directory
         cleanup_orphaned_uploads(final_active_file_ids)
+        # Clean vector collections
         cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)
-        # Stage 5: Audio cache cleanup
-        log.info("Cleaning audio cache")
-        cleanup_audio_cache(form_data.audio_cache_max_age_days)
-        # Stage 6: Database optimization
+        # Stage 5: Database optimization
         log.info("Optimizing database")
+        # Vacuum main database
         try:
             with get_db() as db:
                 db.execute(text("VACUUM"))
+                log.debug("Vacuumed main database")
         except Exception as e:
             log.error(f"Failed to vacuum main database: {e}")
+        # Vacuum ChromaDB database if it exists
         if "chroma" in VECTOR_DB.lower():
             chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
             if chroma_db_path.exists():
                 try:
                     import sqlite3
                     with sqlite3.connect(str(chroma_db_path)) as conn:
                         conn.execute("VACUUM")
+                        log.debug("Vacuumed ChromaDB database")
                 except Exception as e:
                     log.error(f"Failed to vacuum ChromaDB database: {e}")
         log.info("Data pruning completed successfully")
         return True
     except Exception as e:
         log.exception(f"Error during data pruning: {e}")
         raise HTTPException(