From 482030ff6970ed344690b79df884be1e09ba7d2a Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 14:56:44 +0200
Subject: [PATCH] Update prune.py

---
 backend/open_webui/routers/prune.py | 207 +++++++++-------
 1 file changed, 67 insertions(+), 140 deletions(-)

diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index 427c9586bd..da08037046 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -38,7 +38,6 @@ class PruneDataForm(BaseModel):
     days: Optional[int] = None
     exempt_archived_chats: bool = False
     exempt_chats_in_folders: bool = False
-    # Orphaned resource deletion toggles (for deleted users)
     delete_orphaned_chats: bool = True
     delete_orphaned_tools: bool = False
     delete_orphaned_functions: bool = False
@@ -47,17 +46,17 @@
     delete_orphaned_models: bool = True
     delete_orphaned_notes: bool = True
     delete_orphaned_folders: bool = True
+    audio_cache_max_age_days: Optional[int] = 30


 def get_active_file_ids() -> Set[str]:
     """
     Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
-    This is the ground truth for what files should be preserved.
     """
     active_file_ids = set()

     try:
-        # 1. Get files referenced by knowledge bases (original logic)
+        # Scan knowledge bases for file references
         knowledge_bases = Knowledges.get_knowledge_bases()
         log.debug(f"Found {len(knowledge_bases)} knowledge bases")

@@ -65,15 +64,12 @@ def get_active_file_ids() -> Set[str]:
             if not kb.data:
                 continue

-            # Handle different possible data structures for file references
             file_ids = []

-            # Check for file_ids array
             if isinstance(kb.data, dict) and "file_ids" in kb.data:
                 if isinstance(kb.data["file_ids"], list):
                     file_ids.extend(kb.data["file_ids"])

-            # Check for files array with id field
             if isinstance(kb.data, dict) and "files" in kb.data:
                 if isinstance(kb.data["files"], list):
                     for file_ref in kb.data["files"]:
@@ -82,13 +78,11 @@ def get_active_file_ids() -> Set[str]:
                         elif isinstance(file_ref, str):
                             file_ids.append(file_ref)

-            # Add all found file IDs
             for file_id in file_ids:
                 if isinstance(file_id, str) and file_id.strip():
                     active_file_ids.add(file_id.strip())
-                    log.debug(f"KB {kb.id} references file {file_id}")

-        # 2. Get files referenced in chats (NEW: scan chat JSON for file references)
+        # Scan chats for file references
         chats = Chats.get_chats()
         log.debug(f"Found {len(chats)} chats to scan for file references")

@@ -97,44 +91,35 @@ def get_active_file_ids() -> Set[str]:
                 continue

             try:
-                # Convert entire chat JSON to string and extract all file IDs
                 chat_json_str = json.dumps(chat.chat)

-                # Find all file ID patterns in the JSON
-                # Pattern 1: "id": "uuid" where uuid looks like a file ID
+                # Extract file IDs using regex patterns
                 file_id_pattern = re.compile(
                     r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
                 )
-                potential_file_ids = file_id_pattern.findall(chat_json_str)
-
-                # Pattern 2: URLs containing /api/v1/files/uuid
                 url_pattern = re.compile(
                     r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
                 )
+
+                potential_file_ids = file_id_pattern.findall(chat_json_str)
                 url_file_ids = url_pattern.findall(chat_json_str)

-                # Combine and validate against actual file records
                 all_potential_ids = set(potential_file_ids + url_file_ids)

                 for file_id in all_potential_ids:
-                    # Verify this ID exists in the file table to avoid false positives
                     if Files.get_file_by_id(file_id):
                         active_file_ids.add(file_id)
-                        log.debug(f"Chat {chat.id}: Found active file {file_id}")

             except Exception as e:
                 log.debug(f"Error processing chat {chat.id} for file references: {e}")

-        # 3. Get files referenced in folders (scan folder.items, folder.data, folder.meta)
+        # Scan folders for file references
         try:
             folders = Folders.get_all_folders()
-            log.debug(f"Found {len(folders)} folders to scan for file references")

             for folder in folders:
-                # Check folder.items JSON
                 if folder.items:
                     try:
                         items_str = json.dumps(folder.items)
-                        # Look for file ID patterns in the JSON
                         file_id_pattern = re.compile(
                             r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
                         )
@@ -148,13 +133,9 @@ def get_active_file_ids() -> Set[str]:
                         for file_id in potential_ids:
                             if Files.get_file_by_id(file_id):
                                 active_file_ids.add(file_id)
-                                log.debug(
-                                    f"Folder {folder.id}: Found file {file_id} in items"
-                                )
                     except Exception as e:
                         log.debug(f"Error processing folder {folder.id} items: {e}")

-                # Check folder.data JSON
                 if hasattr(folder, "data") and folder.data:
                     try:
                         data_str = json.dumps(folder.data)
@@ -171,28 +152,22 @@ def get_active_file_ids() -> Set[str]:
                         for file_id in potential_ids:
                             if Files.get_file_by_id(file_id):
                                 active_file_ids.add(file_id)
-                                log.debug(
-                                    f"Folder {folder.id}: Found file {file_id} in data"
-                                )
                     except Exception as e:
                         log.debug(f"Error processing folder {folder.id} data: {e}")

         except Exception as e:
             log.debug(f"Error scanning folders for file references: {e}")

-        # 4. Get files referenced in standalone messages (message table)
+        # Scan standalone messages for file references
         try:
-            # Query message table directly since we may not have a Messages model
             with get_db() as db:
                 message_results = db.execute(
                     text("SELECT id, data FROM message WHERE data IS NOT NULL")
                 ).fetchall()
-                log.debug(f"Found {len(message_results)} messages with data to scan")

                 for message_id, message_data_json in message_results:
                     if message_data_json:
                         try:
-                            # Convert JSON to string and scan for file patterns
                             data_str = (
                                 json.dumps(message_data_json)
                                 if isinstance(message_data_json, dict)
@@ -212,9 +187,6 @@ def get_active_file_ids() -> Set[str]:
                             for file_id in potential_ids:
                                 if Files.get_file_by_id(file_id):
                                     active_file_ids.add(file_id)
-                                    log.debug(
-                                        f"Message {message_id}: Found file {file_id}"
-                                    )
                         except Exception as e:
                             log.debug(
                                 f"Error processing message {message_id} data: {e}"
                             )
@@ -224,7 +196,6 @@ def get_active_file_ids() -> Set[str]:
     except Exception as e:
         log.error(f"Error determining active file IDs: {e}")
-        # Fail safe: return empty set, which will prevent deletion
         return set()

     log.info(f"Found {len(active_file_ids)} active file IDs")
@@ -236,19 +207,15 @@ def safe_delete_vector_collection(collection_name: str) -> bool:
     """
     Safely delete a vector collection, handling both logical and physical cleanup.
     """
     try:
-        # First, try to delete the collection through the client
         try:
             VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
-            log.debug(f"Deleted collection from vector DB: {collection_name}")
         except Exception as e:
             log.debug(f"Collection {collection_name} may not exist in DB: {e}")

-        # Then, handle physical cleanup for ChromaDB
         if "chroma" in VECTOR_DB.lower():
             vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name
             if vector_dir.exists() and vector_dir.is_dir():
                 shutil.rmtree(vector_dir)
-                log.debug(f"Deleted physical vector directory: {vector_dir}")
             return True

         return True
@@ -263,19 +230,14 @@ def safe_delete_file_by_id(file_id: str) -> bool:
     """
     Safely delete a file record and its associated vector collection.
     """
     try:
-        # Get file info before deletion
         file_record = Files.get_file_by_id(file_id)
         if not file_record:
-            log.debug(f"File {file_id} not found in database")
-            return True  # Already gone
+            return True

-        # Delete vector collection first
         collection_name = f"file-{file_id}"
         safe_delete_vector_collection(collection_name)

-        # Delete database record
         Files.delete_file_by_id(file_id)
-        log.debug(f"Deleted file record: {file_id}")

         return True
@@ -290,7 +252,6 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
     """
     upload_dir = Path(CACHE_DIR).parent / "uploads"
     if not upload_dir.exists():
-        log.debug("Uploads directory does not exist")
         return

     deleted_count = 0
@@ -301,33 +262,27 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
             continue

         filename = file_path.name
-
-        # Extract file ID from filename (common patterns)
         file_id = None

-        # Pattern 1: UUID_filename or UUID-filename
+        # Extract file ID from filename patterns
         if len(filename) > 36:
             potential_id = filename[:36]
             if potential_id.count("-") == 4:  # UUID format
                 file_id = potential_id

-        # Pattern 2: filename might be the file ID itself
         if not file_id and filename.count("-") == 4 and len(filename) == 36:
             file_id = filename

-        # Pattern 3: Check if any part of filename matches active IDs
         if not file_id:
             for active_id in active_file_ids:
                 if active_id in filename:
                     file_id = active_id
                     break

-        # If we found a potential file ID and it's not active, delete it
         if file_id and file_id not in active_file_ids:
             try:
                 file_path.unlink()
                 deleted_count += 1
-                log.debug(f"Deleted orphaned upload file: {filename}")
             except Exception as e:
                 log.error(f"Failed to delete upload file {filename}: {e}")
@@ -349,84 +304,50 @@ def cleanup_orphaned_vector_collections(
     vector_dir = Path(CACHE_DIR).parent / "vector_db"
     if not vector_dir.exists():
-        log.debug("Vector DB directory does not exist")
         return

     chroma_db_path = vector_dir / "chroma.sqlite3"
     if not chroma_db_path.exists():
-        log.debug("ChromaDB metadata file does not exist")
         return

-    # Build expected collection names
     expected_collections = set()

-    # File collections: file-{file_id}
     for file_id in active_file_ids:
         expected_collections.add(f"file-{file_id}")

-    # Knowledge base collections: {kb_id}
     for kb_id in active_kb_ids:
         expected_collections.add(kb_id)

-    log.debug(f"Expected collections to preserve: {expected_collections}")
-
-    # Query ChromaDB metadata to get the complete mapping chain:
-    # Directory UUID -> Collection ID -> Collection Name
     uuid_to_collection = {}
     try:
         import sqlite3

-        log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}")
-
         with sqlite3.connect(str(chroma_db_path)) as conn:
-            # First, check what tables exist
-            tables = conn.execute(
-                "SELECT name FROM sqlite_master WHERE type='table'"
-            ).fetchall()
-            log.debug(f"ChromaDB tables: {tables}")
-
-            # Check the schema of collections table
-            schema = conn.execute("PRAGMA table_info(collections)").fetchall()
-            log.debug(f"Collections table schema: {schema}")
-
-            # Get Collection ID -> Collection Name mapping
             collection_id_to_name = {}
             cursor = conn.execute("SELECT id, name FROM collections")
             rows = cursor.fetchall()
-            log.debug(f"Raw ChromaDB collections query results: {rows}")

             for row in rows:
                 collection_id, collection_name = row
                 collection_id_to_name[collection_id] = collection_name
-                log.debug(
-                    f"Mapped collection ID {collection_id} -> name {collection_name}"
-                )

-            # Get Directory UUID -> Collection ID mapping from segments table
-            # Only interested in VECTOR segments as those are the actual data directories
             cursor = conn.execute(
                 "SELECT id, collection FROM segments WHERE scope = 'VECTOR'"
             )
             segment_rows = cursor.fetchall()
-            log.debug(f"Raw ChromaDB segments query results: {segment_rows}")

             for row in segment_rows:
                 segment_id, collection_id = row
                 if collection_id in collection_id_to_name:
                     collection_name = collection_id_to_name[collection_id]
                     uuid_to_collection[segment_id] = collection_name
-                    log.debug(
-                        f"Mapped directory UUID {segment_id} -> collection {collection_name}"
-                    )

-        log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}")
         log.info(
             f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata"
         )

     except Exception as e:
         log.error(f"Error reading ChromaDB metadata: {e}")
-        # Fail safe: don't delete anything if we can't read metadata
         return

     deleted_count = 0
@@ -438,16 +359,12 @@ def cleanup_orphaned_vector_collections(
             dir_uuid = collection_dir.name

-            # Skip system/metadata files
             if dir_uuid.startswith("."):
                 continue

-            # Get the actual collection name from metadata
             collection_name = uuid_to_collection.get(dir_uuid)

             if collection_name is None:
-                # Directory exists but no metadata entry - it's orphaned
-                log.debug(f"Directory {dir_uuid} has no metadata entry, deleting")
                 try:
                     shutil.rmtree(collection_dir)
                     deleted_count += 1
@@ -455,20 +372,12 @@ def cleanup_orphaned_vector_collections(
                     log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}")

             elif collection_name not in expected_collections:
-                # Collection exists but should be deleted
-                log.debug(
-                    f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting"
-                )
                 try:
                     shutil.rmtree(collection_dir)
                     deleted_count += 1
                 except Exception as e:
                     log.error(f"Failed to delete collection directory {dir_uuid}: {e}")

-            else:
-                # Collection should be preserved
-                log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})")
-
     except Exception as e:
         log.error(f"Error cleaning vector collections: {e}")

@@ -476,6 +385,52 @@ def cleanup_orphaned_vector_collections(
         log.info(f"Deleted {deleted_count} orphaned vector collections")


+def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
+    """
+    Clean up audio cache files older than the specified number of days.
+    """
+    if max_age_days is None:
+        log.info("Skipping audio cache cleanup (max_age_days is None)")
+        return
+
+    cutoff_time = time.time() - (max_age_days * 86400)
+    deleted_count = 0
+    total_size_deleted = 0
+
+    audio_dirs = [
+        Path(CACHE_DIR) / "audio" / "speech",
+        Path(CACHE_DIR) / "audio" / "transcriptions",
+    ]
+
+    for audio_dir in audio_dirs:
+        if not audio_dir.exists():
+            continue
+
+        try:
+            for file_path in audio_dir.iterdir():
+                if not file_path.is_file():
+                    continue
+
+                file_mtime = file_path.stat().st_mtime
+                if file_mtime < cutoff_time:
+                    try:
+                        file_size = file_path.stat().st_size
+                        file_path.unlink()
+                        deleted_count += 1
+                        total_size_deleted += file_size
+                    except Exception as e:
+                        log.error(f"Failed to delete audio file {file_path}: {e}")
+
+        except Exception as e:
+            log.error(f"Error cleaning audio directory {audio_dir}: {e}")
+
+    if deleted_count > 0:
+        size_mb = total_size_deleted / (1024 * 1024)
+        log.info(
+            f"Deleted {deleted_count} audio cache files ({size_mb:.1f} MB) older than {max_age_days} days"
+        )
+
+
 @router.post("/", response_model=bool)
 async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
     """
@@ -507,38 +462,27 @@
         - If True: Delete notes from deleted users
     - delete_orphaned_folders: bool = True
         - If True: Delete folders from deleted users
+    - audio_cache_max_age_days: Optional[int] = 30
+        - If None: Skip audio cache cleanup
+        - If >= 0: Delete audio cache files (TTS, STT) older than the specified number of days
     """
     try:
         log.info("Starting data pruning process")

-        # Stage 1: Delete old chats based on user criteria (optional)
+        # Stage 1: Delete old chats based on user criteria
         if form_data.days is not None:
             cutoff_time = int(time.time()) - (form_data.days * 86400)
             chats_to_delete = []

             for chat in Chats.get_chats():
                 if chat.updated_at < cutoff_time:
-                    # Check exemption conditions
                     if form_data.exempt_archived_chats and chat.archived:
-                        log.debug(f"Exempting archived chat: {chat.id}")
                         continue
                     if form_data.exempt_chats_in_folders and (
                         getattr(chat, "folder_id", None) is not None
                         or getattr(chat, "pinned", False)
                     ):
-                        folder_status = (
-                            f"folder_id: {getattr(chat, 'folder_id', None)}"
-                            if getattr(chat, "folder_id", None)
-                            else "not in folder"
-                        )
-                        pinned_status = f"pinned: {getattr(chat, 'pinned', False)}"
-                        log.debug(
-                            f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})"
-                        )
                         continue
-                    log.debug(
-                        f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}"
-                    )
                     chats_to_delete.append(chat)

             if chats_to_delete:
@@ -552,14 +496,12 @@
         else:
             log.info("Skipping chat deletion (days parameter is None)")

-        # Stage 2: Build ground truth of what should be preserved
+        # Stage 2: Build preservation set
         log.info("Building preservation set")

-        # Get all active users
         active_user_ids = {user.id for user in Users.get_users()["users"]}
         log.info(f"Found {len(active_user_ids)} active users")

-        # Get all active knowledge bases and their file references
         active_kb_ids = set()
         knowledge_bases = Knowledges.get_knowledge_bases()
@@ -569,13 +511,11 @@
         log.info(f"Found {len(active_kb_ids)} active knowledge bases")

-        # Get all files that should be preserved (NOW COMPREHENSIVE!)
         active_file_ids = get_active_file_ids()

         # Stage 3: Delete orphaned database records
         log.info("Deleting orphaned database records")

-        # Delete files not referenced by any knowledge base or belonging to deleted users
         deleted_files = 0
         for file_record in Files.get_files():
             should_delete = (
@@ -590,7 +530,6 @@
         if deleted_files > 0:
             log.info(f"Deleted {deleted_files} orphaned files")

-        # Delete knowledge bases from deleted users (if enabled)
         deleted_kbs = 0
         if form_data.delete_orphaned_knowledge_bases:
             for kb in knowledge_bases:
@@ -604,10 +543,8 @@
         else:
             log.info("Skipping knowledge base deletion (disabled)")

-        # Delete other user-owned resources from deleted users (conditional)
         deleted_others = 0

-        # Delete orphaned chats of deleted users (conditional)
         if form_data.delete_orphaned_chats:
             chats_deleted = 0
             for chat in Chats.get_chats():
@@ -620,7 +557,6 @@
         else:
             log.info("Skipping orphaned chat deletion (disabled)")

-        # Delete orphaned tools of deleted users (conditional)
         if form_data.delete_orphaned_tools:
             tools_deleted = 0
             for tool in Tools.get_tools():
@@ -633,7 +569,6 @@
         else:
             log.info("Skipping tool deletion (disabled)")

-        # Delete orphaned functions of deleted users (conditional)
         if form_data.delete_orphaned_functions:
             functions_deleted = 0
             for function in Functions.get_functions():
@@ -646,7 +581,6 @@
         else:
             log.info("Skipping function deletion (disabled)")

-        # Delete orphaned notes of deleted users (conditional)
         if form_data.delete_orphaned_notes:
             notes_deleted = 0
             for note in Notes.get_notes():
@@ -659,7 +593,6 @@
         else:
             log.info("Skipping note deletion (disabled)")

-        # Delete orphaned prompts of deleted users (conditional)
         if form_data.delete_orphaned_prompts:
             prompts_deleted = 0
             for prompt in Prompts.get_prompts():
@@ -672,7 +605,6 @@
         else:
             log.info("Skipping prompt deletion (disabled)")

-        # Delete orphaned models of deleted users (conditional)
         if form_data.delete_orphaned_models:
             models_deleted = 0
             for model in Models.get_all_models():
@@ -685,7 +617,6 @@
         else:
             log.info("Skipping model deletion (disabled)")

-        # Delete orphaned folders of deleted users (conditional)
         if form_data.delete_orphaned_folders:
             folders_deleted = 0
             for folder in Folders.get_all_folders():
@@ -706,28 +637,25 @@
         # Stage 4: Clean up orphaned physical files
         log.info("Cleaning up orphaned physical files")

-        # Rebuild active sets after database cleanup
         final_active_file_ids = get_active_file_ids()
         final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()}

-        # Clean uploads directory
         cleanup_orphaned_uploads(final_active_file_ids)
-
-        # Clean vector collections
         cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)

-        # Stage 5: Database optimization
+        # Stage 5: Audio cache cleanup
+        log.info("Cleaning audio cache")
+        cleanup_audio_cache(form_data.audio_cache_max_age_days)
+
+        # Stage 6: Database optimization
         log.info("Optimizing database")

-        # Vacuum main database
         try:
             with get_db() as db:
                 db.execute(text("VACUUM"))
-                log.debug("Vacuumed main database")
         except Exception as e:
             log.error(f"Failed to vacuum main database: {e}")

-        # Vacuum ChromaDB database if it exists
         if "chroma" in VECTOR_DB.lower():
             chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
             if chroma_db_path.exists():
@@ -736,7 +664,6 @@
                     with sqlite3.connect(str(chroma_db_path)) as conn:
                         conn.execute("VACUUM")
-                        log.debug("Vacuumed ChromaDB database")
                 except Exception as e:
                     log.error(f"Failed to vacuum ChromaDB database: {e}")
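
For reviewers, here is a minimal sketch of how an admin client could exercise the updated endpoint once this patch is applied. Only the field names come from PruneDataForm above; the base URL, the /api/v1/prune/ prefix, and the bearer-token header are assumptions about how this router is typically mounted and authenticated in an Open WebUI deployment, not part of this change.

    # Hypothetical admin-side call; URL prefix and auth scheme are assumptions.
    import requests

    BASE_URL = "http://localhost:8080"   # assumed Open WebUI instance
    ADMIN_TOKEN = "<admin-api-key>"      # assumed admin bearer token

    payload = {
        "days": 90,                          # prune chats older than 90 days; omit or use None to skip
        "exempt_archived_chats": True,
        "exempt_chats_in_folders": True,
        "delete_orphaned_chats": True,
        "delete_orphaned_knowledge_bases": False,
        "audio_cache_max_age_days": 30,      # new field; None skips audio cache cleanup
    }

    response = requests.post(
        f"{BASE_URL}/api/v1/prune/",         # assumed mount point for this router
        json=payload,
        headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
        timeout=600,                         # pruning can take a while on large installs
    )
    response.raise_for_status()
    print("Prune succeeded:", response.json())  # endpoint returns a bool

Unspecified fields fall back to the PruneDataForm defaults shown in the first hunk, so a minimal payload such as {"audio_cache_max_age_days": 7} would only tighten audio cache retention while leaving the age-based chat pruning off (days defaults to None); the delete_orphaned_* toggles still apply their defaults.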