From d454e6a03359155a10fd6e8305f1a640945206ea Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Sun, 10 Aug 2025 23:40:01 +0200
Subject: [PATCH 01/43] Feat/prune orphaned data (#16)
* feat: Add prune orphaned data functionality
* feat: Add prune orphaned data functionality
* feat: Add prune orphaned data functionality
* fix: Restyle PruneDataDialog modal
* feat: Add comprehensive prune orphaned data functionality and fix circular import
* feat: Add comprehensive prune orphaned data functionality and fix circular import
* feat: Add comprehensive prune orphaned data functionality and fix database size issues
* feat: Add comprehensive prune orphaned data functionality and fix database size issues
* feat: Add comprehensive prune orphaned data functionality and fix database size issues
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update folders.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Update prune.py
* Delete backend/open_webui/test/test_prune.py
* Update prune.ts
* Update PruneDataDialog.svelte
* Update prune.py
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update prune.py
* Update PruneDataDialog.svelte
* Update prune.ts
* Update Database.svelte
* Update prune.py
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update prune.py
* Update prune.py
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update Database.svelte
* Update prune.py
* Update prune.ts
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
* Update prune.py
* Update prune.ts
* Update PruneDataDialog.svelte
* Update files.py
* Update prompts.py
* Update notes.py
* Update models.py
* Update access_control.py
* Update PruneDataDialog.svelte
* Update PruneDataDialog.svelte
---------
Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
---
backend/open_webui/main.py | 2 +
backend/open_webui/models/folders.py | 4 +
backend/open_webui/routers/prune.py | 684 ++++++++++++++++++
src/lib/apis/prune.ts | 54 ++
.../components/admin/Settings/Database.svelte | 93 ++-
.../components/common/PruneDataDialog.svelte | 589 +++++++++++++++
6 files changed, 1402 insertions(+), 24 deletions(-)
create mode 100644 backend/open_webui/routers/prune.py
create mode 100644 src/lib/apis/prune.ts
create mode 100644 src/lib/components/common/PruneDataDialog.svelte
diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py
index 618640486d..f6398b23fa 100644
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -81,6 +81,7 @@ from open_webui.routers import (
models,
knowledge,
prompts,
+ prune,
evaluations,
tools,
users,
@@ -1234,6 +1235,7 @@ app.include_router(
evaluations.router, prefix="/api/v1/evaluations", tags=["evaluations"]
)
app.include_router(utils.router, prefix="/api/v1/utils", tags=["utils"])
+app.include_router(prune.router, prefix="/api/v1/prune", tags=["prune"])
# SCIM 2.0 API for identity management
if SCIM_ENABLED:
diff --git a/backend/open_webui/models/folders.py b/backend/open_webui/models/folders.py
index 15deecbf42..8b631f88de 100644
--- a/backend/open_webui/models/folders.py
+++ b/backend/open_webui/models/folders.py
@@ -135,6 +135,10 @@ class FolderTable:
for folder in db.query(Folder).filter_by(user_id=user_id).all()
]
+ def get_all_folders(self) -> list[FolderModel]:
+ with get_db() as db:
+ return [FolderModel.model_validate(folder) for folder in db.query(Folder).all()]
+
def get_folder_by_parent_id_and_user_id_and_name(
self, parent_id: Optional[str], user_id: str, name: str
) -> Optional[FolderModel]:
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
new file mode 100644
index 0000000000..78c333e538
--- /dev/null
+++ b/backend/open_webui/routers/prune.py
@@ -0,0 +1,684 @@
+import logging
+import time
+import os
+import shutil
+import json
+import re
+from typing import Optional, Set
+from pathlib import Path
+
+from fastapi import APIRouter, Depends, HTTPException, status
+from pydantic import BaseModel
+from sqlalchemy import text
+
+from open_webui.utils.auth import get_admin_user
+from open_webui.models.users import Users
+from open_webui.models.chats import Chats
+from open_webui.models.files import Files
+from open_webui.models.notes import Notes
+from open_webui.models.prompts import Prompts
+from open_webui.models.models import Models
+from open_webui.models.knowledge import Knowledges
+from open_webui.models.functions import Functions
+from open_webui.models.tools import Tools
+from open_webui.models.folders import Folders
+from open_webui.retrieval.vector.factory import VECTOR_DB_CLIENT, VECTOR_DB
+from open_webui.constants import ERROR_MESSAGES
+from open_webui.env import SRC_LOG_LEVELS
+from open_webui.config import CACHE_DIR
+from open_webui.internal.db import get_db
+
+log = logging.getLogger(__name__)
+log.setLevel(SRC_LOG_LEVELS["MODELS"])
+
+router = APIRouter()
+
+
+class PruneDataForm(BaseModel):
+ days: Optional[int] = None
+ exempt_archived_chats: bool = False
+ exempt_chats_in_folders: bool = False
+ # Orphaned resource deletion toggles (for deleted users)
+ delete_orphaned_chats: bool = True
+ delete_orphaned_tools: bool = False
+ delete_orphaned_functions: bool = False
+ delete_orphaned_prompts: bool = True
+ delete_orphaned_knowledge_bases: bool = True
+ delete_orphaned_models: bool = True
+ delete_orphaned_notes: bool = True
+ delete_orphaned_folders: bool = True
+
+
+def get_active_file_ids() -> Set[str]:
+ """
+ Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
+ This is the ground truth for what files should be preserved.
+ """
+ active_file_ids = set()
+
+ try:
+ # 1. Get files referenced by knowledge bases (original logic)
+ knowledge_bases = Knowledges.get_knowledge_bases()
+ log.debug(f"Found {len(knowledge_bases)} knowledge bases")
+
+ for kb in knowledge_bases:
+ if not kb.data:
+ continue
+
+ # Handle different possible data structures for file references
+ file_ids = []
+
+ # Check for file_ids array
+ if isinstance(kb.data, dict) and "file_ids" in kb.data:
+ if isinstance(kb.data["file_ids"], list):
+ file_ids.extend(kb.data["file_ids"])
+
+ # Check for files array with id field
+ if isinstance(kb.data, dict) and "files" in kb.data:
+ if isinstance(kb.data["files"], list):
+ for file_ref in kb.data["files"]:
+ if isinstance(file_ref, dict) and "id" in file_ref:
+ file_ids.append(file_ref["id"])
+ elif isinstance(file_ref, str):
+ file_ids.append(file_ref)
+
+ # Add all found file IDs
+ for file_id in file_ids:
+ if isinstance(file_id, str) and file_id.strip():
+ active_file_ids.add(file_id.strip())
+ log.debug(f"KB {kb.id} references file {file_id}")
+
+ # 2. Get files referenced in chats (NEW: scan chat JSON for file references)
+ chats = Chats.get_chats()
+ log.debug(f"Found {len(chats)} chats to scan for file references")
+
+ for chat in chats:
+ if not chat.chat or not isinstance(chat.chat, dict):
+ continue
+
+ try:
+ # Convert entire chat JSON to string and extract all file IDs
+ chat_json_str = json.dumps(chat.chat)
+
+ # Find all file ID patterns in the JSON
+ # Pattern 1: "id": "uuid" where uuid looks like a file ID
+ file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
+ potential_file_ids = file_id_pattern.findall(chat_json_str)
+
+ # Pattern 2: URLs containing /api/v1/files/uuid
+ url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+ url_file_ids = url_pattern.findall(chat_json_str)
+
+ # Combine and validate against actual file records
+ all_potential_ids = set(potential_file_ids + url_file_ids)
+ for file_id in all_potential_ids:
+ # Verify this ID exists in the file table to avoid false positives
+ if Files.get_file_by_id(file_id):
+ active_file_ids.add(file_id)
+ log.debug(f"Chat {chat.id}: Found active file {file_id}")
+
+ except Exception as e:
+ log.debug(f"Error processing chat {chat.id} for file references: {e}")
+
+ # 3. Get files referenced in folders (scan folder.items, folder.data, folder.meta)
+ try:
+ folders = Folders.get_all_folders()
+ log.debug(f"Found {len(folders)} folders to scan for file references")
+
+ for folder in folders:
+ # Check folder.items JSON
+ if folder.items:
+ try:
+ items_str = json.dumps(folder.items)
+ # Look for file ID patterns in the JSON
+ file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
+ url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+
+ potential_ids = file_id_pattern.findall(items_str) + url_pattern.findall(items_str)
+ for file_id in potential_ids:
+ if Files.get_file_by_id(file_id):
+ active_file_ids.add(file_id)
+ log.debug(f"Folder {folder.id}: Found file {file_id} in items")
+ except Exception as e:
+ log.debug(f"Error processing folder {folder.id} items: {e}")
+
+ # Check folder.data JSON
+ if hasattr(folder, 'data') and folder.data:
+ try:
+ data_str = json.dumps(folder.data)
+ file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
+ url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+
+ potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str)
+ for file_id in potential_ids:
+ if Files.get_file_by_id(file_id):
+ active_file_ids.add(file_id)
+ log.debug(f"Folder {folder.id}: Found file {file_id} in data")
+ except Exception as e:
+ log.debug(f"Error processing folder {folder.id} data: {e}")
+
+ except Exception as e:
+ log.debug(f"Error scanning folders for file references: {e}")
+
+ # 4. Get files referenced in standalone messages (message table)
+ try:
+ # Query message table directly since we may not have a Messages model
+ with get_db() as db:
+ message_results = db.execute(text("SELECT id, data FROM message WHERE data IS NOT NULL")).fetchall()
+ log.debug(f"Found {len(message_results)} messages with data to scan")
+
+ for message_id, message_data_json in message_results:
+ if message_data_json:
+ try:
+ # Convert JSON to string and scan for file patterns
+ data_str = json.dumps(message_data_json) if isinstance(message_data_json, dict) else str(message_data_json)
+
+ file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
+ url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+
+ potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str)
+ for file_id in potential_ids:
+ if Files.get_file_by_id(file_id):
+ active_file_ids.add(file_id)
+ log.debug(f"Message {message_id}: Found file {file_id}")
+ except Exception as e:
+ log.debug(f"Error processing message {message_id} data: {e}")
+ except Exception as e:
+ log.debug(f"Error scanning messages for file references: {e}")
+
+ except Exception as e:
+ log.error(f"Error determining active file IDs: {e}")
+ # Fail safe: return empty set, which will prevent deletion
+ return set()
+
+ log.info(f"Found {len(active_file_ids)} active file IDs")
+ return active_file_ids
+
+
+def safe_delete_vector_collection(collection_name: str) -> bool:
+ """
+ Safely delete a vector collection, handling both logical and physical cleanup.
+ """
+ try:
+ # First, try to delete the collection through the client
+ try:
+ VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
+ log.debug(f"Deleted collection from vector DB: {collection_name}")
+ except Exception as e:
+ log.debug(f"Collection {collection_name} may not exist in DB: {e}")
+
+ # Then, handle physical cleanup for ChromaDB
+ if "chroma" in VECTOR_DB.lower():
+ vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name
+ if vector_dir.exists() and vector_dir.is_dir():
+ shutil.rmtree(vector_dir)
+ log.debug(f"Deleted physical vector directory: {vector_dir}")
+ return True
+
+ return True
+
+ except Exception as e:
+ log.error(f"Error deleting vector collection {collection_name}: {e}")
+ return False
+
+
+def safe_delete_file_by_id(file_id: str) -> bool:
+ """
+ Safely delete a file record and its associated vector collection.
+ """
+ try:
+ # Get file info before deletion
+ file_record = Files.get_file_by_id(file_id)
+ if not file_record:
+ log.debug(f"File {file_id} not found in database")
+ return True # Already gone
+
+ # Delete vector collection first
+ collection_name = f"file-{file_id}"
+ safe_delete_vector_collection(collection_name)
+
+ # Delete database record
+ Files.delete_file_by_id(file_id)
+ log.debug(f"Deleted file record: {file_id}")
+
+ return True
+
+ except Exception as e:
+ log.error(f"Error deleting file {file_id}: {e}")
+ return False
+
+
+def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
+ """
+ Clean up orphaned files in the uploads directory.
+ """
+ upload_dir = Path(CACHE_DIR).parent / "uploads"
+ if not upload_dir.exists():
+ log.debug("Uploads directory does not exist")
+ return
+
+ deleted_count = 0
+
+ try:
+ for file_path in upload_dir.iterdir():
+ if not file_path.is_file():
+ continue
+
+ filename = file_path.name
+
+ # Extract file ID from filename (common patterns)
+ file_id = None
+
+ # Pattern 1: UUID_filename or UUID-filename
+ if len(filename) > 36:
+ potential_id = filename[:36]
+ if potential_id.count('-') == 4: # UUID format
+ file_id = potential_id
+
+ # Pattern 2: filename might be the file ID itself
+ if not file_id and filename.count('-') == 4 and len(filename) == 36:
+ file_id = filename
+
+ # Pattern 3: Check if any part of filename matches active IDs
+ if not file_id:
+ for active_id in active_file_ids:
+ if active_id in filename:
+ file_id = active_id
+ break
+
+ # If we found a potential file ID and it's not active, delete it
+ if file_id and file_id not in active_file_ids:
+ try:
+ file_path.unlink()
+ deleted_count += 1
+ log.debug(f"Deleted orphaned upload file: {filename}")
+ except Exception as e:
+ log.error(f"Failed to delete upload file {filename}: {e}")
+
+ except Exception as e:
+ log.error(f"Error cleaning uploads directory: {e}")
+
+ if deleted_count > 0:
+ log.info(f"Deleted {deleted_count} orphaned upload files")
+
+
+def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids: Set[str]) -> None:
+ """
+ Clean up orphaned vector collections by querying ChromaDB metadata.
+ """
+ if "chroma" not in VECTOR_DB.lower():
+ return
+
+ vector_dir = Path(CACHE_DIR).parent / "vector_db"
+ if not vector_dir.exists():
+ log.debug("Vector DB directory does not exist")
+ return
+
+ chroma_db_path = vector_dir / "chroma.sqlite3"
+ if not chroma_db_path.exists():
+ log.debug("ChromaDB metadata file does not exist")
+ return
+
+ # Build expected collection names
+ expected_collections = set()
+
+ # File collections: file-{file_id}
+ for file_id in active_file_ids:
+ expected_collections.add(f"file-{file_id}")
+
+ # Knowledge base collections: {kb_id}
+ for kb_id in active_kb_ids:
+ expected_collections.add(kb_id)
+
+ log.debug(f"Expected collections to preserve: {expected_collections}")
+
+ # Query ChromaDB metadata to get the complete mapping chain:
+ # Directory UUID -> Collection ID -> Collection Name
+ uuid_to_collection = {}
+ try:
+ import sqlite3
+ log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}")
+
+ with sqlite3.connect(str(chroma_db_path)) as conn:
+ # First, check what tables exist
+ tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
+ log.debug(f"ChromaDB tables: {tables}")
+
+ # Check the schema of collections table
+ schema = conn.execute("PRAGMA table_info(collections)").fetchall()
+ log.debug(f"Collections table schema: {schema}")
+
+ # Get Collection ID -> Collection Name mapping
+ collection_id_to_name = {}
+ cursor = conn.execute("SELECT id, name FROM collections")
+ rows = cursor.fetchall()
+ log.debug(f"Raw ChromaDB collections query results: {rows}")
+
+ for row in rows:
+ collection_id, collection_name = row
+ collection_id_to_name[collection_id] = collection_name
+ log.debug(f"Mapped collection ID {collection_id} -> name {collection_name}")
+
+ # Get Directory UUID -> Collection ID mapping from segments table
+ # Only interested in VECTOR segments as those are the actual data directories
+ cursor = conn.execute("SELECT id, collection FROM segments WHERE scope = 'VECTOR'")
+ segment_rows = cursor.fetchall()
+ log.debug(f"Raw ChromaDB segments query results: {segment_rows}")
+
+ for row in segment_rows:
+ segment_id, collection_id = row
+ if collection_id in collection_id_to_name:
+ collection_name = collection_id_to_name[collection_id]
+ uuid_to_collection[segment_id] = collection_name
+ log.debug(f"Mapped directory UUID {segment_id} -> collection {collection_name}")
+
+ log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}")
+ log.info(f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata")
+
+ except Exception as e:
+ log.error(f"Error reading ChromaDB metadata: {e}")
+ # Fail safe: don't delete anything if we can't read metadata
+ return
+
+ deleted_count = 0
+
+ try:
+ for collection_dir in vector_dir.iterdir():
+ if not collection_dir.is_dir():
+ continue
+
+ dir_uuid = collection_dir.name
+
+ # Skip system/metadata files
+ if dir_uuid.startswith('.'):
+ continue
+
+ # Get the actual collection name from metadata
+ collection_name = uuid_to_collection.get(dir_uuid)
+
+ if collection_name is None:
+ # Directory exists but no metadata entry - it's orphaned
+ log.debug(f"Directory {dir_uuid} has no metadata entry, deleting")
+ try:
+ shutil.rmtree(collection_dir)
+ deleted_count += 1
+ except Exception as e:
+ log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}")
+
+ elif collection_name not in expected_collections:
+ # Collection exists but should be deleted
+ log.debug(f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting")
+ try:
+ shutil.rmtree(collection_dir)
+ deleted_count += 1
+ except Exception as e:
+ log.error(f"Failed to delete collection directory {dir_uuid}: {e}")
+
+ else:
+ # Collection should be preserved
+ log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})")
+
+ except Exception as e:
+ log.error(f"Error cleaning vector collections: {e}")
+
+ if deleted_count > 0:
+ log.info(f"Deleted {deleted_count} orphaned vector collections")
+
+
+@router.post("/", response_model=bool)
+async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
+ """
+ Prunes old and orphaned data using a safe, multi-stage process.
+
+ Parameters:
+ - days: Optional[int] = None
+ - If None: Skip chat deletion entirely
+ - If 0: Delete all chats (older than 0 days = all chats)
+ - If >= 1: Delete chats older than specified number of days
+ - exempt_archived_chats: bool = False
+ - If True: Exempt archived chats from deletion (only applies when days is not None)
+ - exempt_chats_in_folders: bool = False
+ - If True: Exempt chats that are in folders OR pinned chats from deletion (only applies when days is not None)
+ Note: Pinned chats behave the same as chats in folders
+ - delete_orphaned_chats: bool = True
+ - If True: Delete chats from deleted users
+ - delete_orphaned_tools: bool = True
+ - If True: Delete tools from deleted users
+ - delete_orphaned_functions: bool = True
+ - If True: Delete functions from deleted users
+ - delete_orphaned_prompts: bool = True
+ - If True: Delete prompts from deleted users
+ - delete_orphaned_knowledge_bases: bool = True
+ - If True: Delete knowledge bases from deleted users
+ - delete_orphaned_models: bool = True
+ - If True: Delete models from deleted users
+ - delete_orphaned_notes: bool = True
+ - If True: Delete notes from deleted users
+ - delete_orphaned_folders: bool = True
+ - If True: Delete folders from deleted users
+ """
+ try:
+ log.info("Starting data pruning process")
+
+ # Stage 1: Delete old chats based on user criteria (optional)
+ if form_data.days is not None:
+ cutoff_time = int(time.time()) - (form_data.days * 86400)
+ chats_to_delete = []
+
+ for chat in Chats.get_chats():
+ if chat.updated_at < cutoff_time:
+ # Check exemption conditions
+ if form_data.exempt_archived_chats and chat.archived:
+ log.debug(f"Exempting archived chat: {chat.id}")
+ continue
+ if form_data.exempt_chats_in_folders and (getattr(chat, 'folder_id', None) is not None or getattr(chat, 'pinned', False)):
+ folder_status = f"folder_id: {getattr(chat, 'folder_id', None)}" if getattr(chat, 'folder_id', None) else "not in folder"
+ pinned_status = f"pinned: {getattr(chat, 'pinned', False)}"
+ log.debug(f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})")
+ continue
+ log.debug(f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}")
+ chats_to_delete.append(chat)
+
+ if chats_to_delete:
+ log.info(f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)")
+ for chat in chats_to_delete:
+ Chats.delete_chat_by_id(chat.id)
+ else:
+ log.info(f"No chats found older than {form_data.days} days")
+ else:
+ log.info("Skipping chat deletion (days parameter is None)")
+
+ # Stage 2: Build ground truth of what should be preserved
+ log.info("Building preservation set")
+
+ # Get all active users
+ active_user_ids = {user.id for user in Users.get_users()["users"]}
+ log.info(f"Found {len(active_user_ids)} active users")
+
+ # Get all active knowledge bases and their file references
+ active_kb_ids = set()
+ knowledge_bases = Knowledges.get_knowledge_bases()
+
+ for kb in knowledge_bases:
+ if kb.user_id in active_user_ids:
+ active_kb_ids.add(kb.id)
+
+ log.info(f"Found {len(active_kb_ids)} active knowledge bases")
+
+ # Get all files that should be preserved (NOW COMPREHENSIVE!)
+ active_file_ids = get_active_file_ids()
+
+ # Stage 3: Delete orphaned database records
+ log.info("Deleting orphaned database records")
+
+ # Delete files not referenced by any knowledge base or belonging to deleted users
+ deleted_files = 0
+ for file_record in Files.get_files():
+ should_delete = (
+ file_record.id not in active_file_ids or
+ file_record.user_id not in active_user_ids
+ )
+
+ if should_delete:
+ if safe_delete_file_by_id(file_record.id):
+ deleted_files += 1
+
+ if deleted_files > 0:
+ log.info(f"Deleted {deleted_files} orphaned files")
+
+ # Delete knowledge bases from deleted users (if enabled)
+ deleted_kbs = 0
+ if form_data.delete_orphaned_knowledge_bases:
+ for kb in knowledge_bases:
+ if kb.user_id not in active_user_ids:
+ if safe_delete_vector_collection(kb.id):
+ Knowledges.delete_knowledge_by_id(kb.id)
+ deleted_kbs += 1
+
+ if deleted_kbs > 0:
+ log.info(f"Deleted {deleted_kbs} orphaned knowledge bases")
+ else:
+ log.info("Skipping knowledge base deletion (disabled)")
+
+ # Delete other user-owned resources from deleted users (conditional)
+ deleted_others = 0
+
+ # Delete orphaned chats of deleted users (conditional)
+ if form_data.delete_orphaned_chats:
+ chats_deleted = 0
+ for chat in Chats.get_chats():
+ if chat.user_id not in active_user_ids:
+ Chats.delete_chat_by_id(chat.id)
+ chats_deleted += 1
+ deleted_others += 1
+ if chats_deleted > 0:
+ log.info(f"Deleted {chats_deleted} orphaned chats")
+ else:
+ log.info("Skipping orphaned chat deletion (disabled)")
+
+ # Delete orphaned tools of deleted users (conditional)
+ if form_data.delete_orphaned_tools:
+ tools_deleted = 0
+ for tool in Tools.get_tools():
+ if tool.user_id not in active_user_ids:
+ Tools.delete_tool_by_id(tool.id)
+ tools_deleted += 1
+ deleted_others += 1
+ if tools_deleted > 0:
+ log.info(f"Deleted {tools_deleted} orphaned tools")
+ else:
+ log.info("Skipping tool deletion (disabled)")
+
+ # Delete orphaned functions of deleted users (conditional)
+ if form_data.delete_orphaned_functions:
+ functions_deleted = 0
+ for function in Functions.get_functions():
+ if function.user_id not in active_user_ids:
+ Functions.delete_function_by_id(function.id)
+ functions_deleted += 1
+ deleted_others += 1
+ if functions_deleted > 0:
+ log.info(f"Deleted {functions_deleted} orphaned functions")
+ else:
+ log.info("Skipping function deletion (disabled)")
+
+ # Delete orphaned notes of deleted users (conditional)
+ if form_data.delete_orphaned_notes:
+ notes_deleted = 0
+ for note in Notes.get_notes():
+ if note.user_id not in active_user_ids:
+ Notes.delete_note_by_id(note.id)
+ notes_deleted += 1
+ deleted_others += 1
+ if notes_deleted > 0:
+ log.info(f"Deleted {notes_deleted} orphaned notes")
+ else:
+ log.info("Skipping note deletion (disabled)")
+
+ # Delete orphaned prompts of deleted users (conditional)
+ if form_data.delete_orphaned_prompts:
+ prompts_deleted = 0
+ for prompt in Prompts.get_prompts():
+ if prompt.user_id not in active_user_ids:
+ Prompts.delete_prompt_by_command(prompt.command)
+ prompts_deleted += 1
+ deleted_others += 1
+ if prompts_deleted > 0:
+ log.info(f"Deleted {prompts_deleted} orphaned prompts")
+ else:
+ log.info("Skipping prompt deletion (disabled)")
+
+ # Delete orphaned models of deleted users (conditional)
+ if form_data.delete_orphaned_models:
+ models_deleted = 0
+ for model in Models.get_all_models():
+ if model.user_id not in active_user_ids:
+ Models.delete_model_by_id(model.id)
+ models_deleted += 1
+ deleted_others += 1
+ if models_deleted > 0:
+ log.info(f"Deleted {models_deleted} orphaned models")
+ else:
+ log.info("Skipping model deletion (disabled)")
+
+ # Delete orphaned folders of deleted users (conditional)
+ if form_data.delete_orphaned_folders:
+ folders_deleted = 0
+ for folder in Folders.get_all_folders():
+ if folder.user_id not in active_user_ids:
+ Folders.delete_folder_by_id_and_user_id(folder.id, folder.user_id, delete_chats=False)
+ folders_deleted += 1
+ deleted_others += 1
+ if folders_deleted > 0:
+ log.info(f"Deleted {folders_deleted} orphaned folders")
+ else:
+ log.info("Skipping folder deletion (disabled)")
+
+ if deleted_others > 0:
+ log.info(f"Total other orphaned records deleted: {deleted_others}")
+
+ # Stage 4: Clean up orphaned physical files
+ log.info("Cleaning up orphaned physical files")
+
+ # Rebuild active sets after database cleanup
+ final_active_file_ids = get_active_file_ids()
+ final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()}
+
+ # Clean uploads directory
+ cleanup_orphaned_uploads(final_active_file_ids)
+
+ # Clean vector collections
+ cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)
+
+ # Stage 5: Database optimization
+ log.info("Optimizing database")
+
+ # Vacuum main database
+ try:
+ with get_db() as db:
+ db.execute(text("VACUUM"))
+ log.debug("Vacuumed main database")
+ except Exception as e:
+ log.error(f"Failed to vacuum main database: {e}")
+
+ # Vacuum ChromaDB database if it exists
+ if "chroma" in VECTOR_DB.lower():
+ chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
+ if chroma_db_path.exists():
+ try:
+ import sqlite3
+ with sqlite3.connect(str(chroma_db_path)) as conn:
+ conn.execute("VACUUM")
+ log.debug("Vacuumed ChromaDB database")
+ except Exception as e:
+ log.error(f"Failed to vacuum ChromaDB database: {e}")
+
+ log.info("Data pruning completed successfully")
+ return True
+
+ except Exception as e:
+ log.exception(f"Error during data pruning: {e}")
+ raise HTTPException(
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+ detail=ERROR_MESSAGES.DEFAULT("Data pruning failed"),
+ )
diff --git a/src/lib/apis/prune.ts b/src/lib/apis/prune.ts
new file mode 100644
index 0000000000..d95d662438
--- /dev/null
+++ b/src/lib/apis/prune.ts
@@ -0,0 +1,54 @@
+import { WEBUI_API_BASE_URL } from '$lib/constants';
+
+export const pruneData = async (
+ token: string,
+ days: number | null,
+ exempt_archived_chats: boolean,
+ exempt_chats_in_folders: boolean,
+ delete_orphaned_chats: boolean = true,
+ delete_orphaned_tools: boolean = false,
+ delete_orphaned_functions: boolean = false,
+ delete_orphaned_prompts: boolean = true,
+ delete_orphaned_knowledge_bases: boolean = true,
+ delete_orphaned_models: boolean = true,
+ delete_orphaned_notes: boolean = true,
+ delete_orphaned_folders: boolean = true
+) => {
+ let error = null;
+
+ const res = await fetch(`${WEBUI_API_BASE_URL}/prune/`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ Authorization: `Bearer ${token}`
+ },
+ body: JSON.stringify({
+ days,
+ exempt_archived_chats,
+ exempt_chats_in_folders,
+ delete_orphaned_chats,
+ delete_orphaned_tools,
+ delete_orphaned_functions,
+ delete_orphaned_prompts,
+ delete_orphaned_knowledge_bases,
+ delete_orphaned_models,
+ delete_orphaned_notes,
+ delete_orphaned_folders
+ })
+ })
+ .then(async (res) => {
+ if (!res.ok) throw await res.json();
+ return res.json();
+ })
+ .catch((err) => {
+ error = err;
+ console.log(err);
+ return null;
+ });
+
+ if (error) {
+ throw error;
+ }
+
+ return res;
+};
diff --git a/src/lib/components/admin/Settings/Database.svelte b/src/lib/components/admin/Settings/Database.svelte
index b2ac5553de..19ec874746 100644
--- a/src/lib/components/admin/Settings/Database.svelte
+++ b/src/lib/components/admin/Settings/Database.svelte
@@ -1,7 +1,6 @@
-
+
+
\ No newline at end of file
diff --git a/src/lib/components/common/PruneDataDialog.svelte b/src/lib/components/common/PruneDataDialog.svelte
new file mode 100644
index 0000000000..10a29d2594
--- /dev/null
+++ b/src/lib/components/common/PruneDataDialog.svelte
@@ -0,0 +1,589 @@
+
+
+
+
+
+
+ {$i18n.t('Prune Orphaned Data')}
+
+
{
+ show = false;
+ }}
+ >
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Destructive Operation - Backup Recommended')}
+
+
+
{$i18n.t('This action will permanently delete data from your database. Only orphaned or old data, based on your configuration settings, will be deleted. All active, referenced data remains completely safe.')}
+
{$i18n.t('This operation cannot be undone. Create a complete backup of your database and files before proceeding. This operation is performed entirely at your own risk - having a backup ensures you can restore any data if something unexpected occurs.')}
+
+
+
+
showDetailsExpanded = !showDetailsExpanded}
+ >
+
+
+
+ {showDetailsExpanded ? $i18n.t('Hide details') : $i18n.t('Show details')}
+
+
+ {#if showDetailsExpanded}
+
+
{$i18n.t('Note:')} {$i18n.t('This list provides an overview of what will be deleted during the pruning process and may not be complete or fully up-to-date.')}
+
+
+
+ activeDetailsTab = 'chats'}
+ >
+ {$i18n.t('Chats')}
+
+ activeDetailsTab = 'workspace'}
+ >
+ {$i18n.t('Workspace')}
+
+ activeDetailsTab = 'datavector'}
+ >
+ {$i18n.t('Data & Vector')}
+
+ activeDetailsTab = 'imagesaudio'}
+ >
+ {$i18n.t('Images & Audio')}
+
+ activeDetailsTab = 'system'}
+ >
+ {$i18n.t('System & Database')}
+
+
+
+
+
+ {#if activeDetailsTab === 'chats'}
+
+
{$i18n.t('Age-Based Chat Deletion:')}
+
• {$i18n.t('Removes conversations older than specified days based on when they were last modified or updated (not when they were created)')}
+
• {$i18n.t('Supports exemptions for:')}
+
◦ {$i18n.t('Archived chats')}
+
◦ {$i18n.t('Chats organized in folders and pinned chats')}
+
+
{$i18n.t('Orphaned Content Cleanup:')}
+
• {$i18n.t('Delete orphaned chats from deleted users')}
+
• {$i18n.t('Delete orphaned folders from deleted users')}
+
+ {:else if activeDetailsTab === 'workspace'}
+
+
{$i18n.t('Orphaned Workspace Items from Deleted Users:')}
+
• {$i18n.t('Delete orphaned knowledge bases')}
+
• {$i18n.t('Delete orphaned custom tools')}
+
• {$i18n.t('Delete orphaned custom functions (Actions, Pipes, Filters)')}
+
• {$i18n.t('Delete orphaned custom prompts and templates')}
+
• {$i18n.t('Delete orphaned custom models and configurations')}
+
• {$i18n.t('Delete orphaned notes')}
+
+ {:else if activeDetailsTab === 'datavector'}
+
+
{$i18n.t('Files & Vector Storage:')}
+
• {$i18n.t('Orphaned files and attachments from deleted content')}
+
• {$i18n.t('Vector embeddings and collections for removed data')}
+
• {$i18n.t('Uploaded files that lost their database references')}
+
• {$i18n.t('Vector storage directories without corresponding data')}
+
+ {:else if activeDetailsTab === 'imagesaudio'}
+
+
{$i18n.t('Images & Audio Content Cleanup:')}
+
• {$i18n.t('TBD - Image cleanup functionality')}
+
• {$i18n.t('TBD - Audio cleanup functionality')}
+
• {$i18n.t('TBD - Orphaned images and audio files')}
+
• {$i18n.t('TBD - Media processing cache cleanup')}
+
+ {:else if activeDetailsTab === 'system'}
+
+
{$i18n.t('Database & System Cleanup:')}
+
• {$i18n.t('Removal of broken database references and stale entries')}
+
• {$i18n.t('Disk space reclamation by database cleanup')}
+
• {$i18n.t('Synchronization of database records with actual file storage')}
+
• {$i18n.t('Fix inconsistencies between storage systems')}
+
• {$i18n.t('Database performance optimization')}
+
+ {/if}
+
+
+ {/if}
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Performance Warning: This operation may take a very long time to complete, especially if you have never cleaned your database before or if your instance stores large amounts of data. The process could take anywhere from seconds, to minutes, to half an hour and beyond depending on your data size.')}
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Pruning Configuration')}
+
+
+
+ {$i18n.t('Configure what data should be cleaned up during the pruning process.')}
+
+
+
+
+ activeSettingsTab = 'chats'}
+ >
+ {$i18n.t('Chats')}
+
+ activeSettingsTab = 'workspace'}
+ >
+ {$i18n.t('Workspace')}
+
+
+
+
+
+ {#if activeSettingsTab === 'chats'}
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Delete chats by age')}
+
+
+ {$i18n.t('Optionally remove old chats based on last update time')}
+
+
+
+
+
+
+ {#if deleteChatsByAge}
+
+
+
+ {$i18n.t('Delete chats older than')}
+
+
+
+ {$i18n.t('days')}
+
+
+ {$i18n.t('Set to 0 to delete all chats, or specify number of days')}
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Exempt archived chats')}
+
+
+ {$i18n.t('Keep archived chats even if they are old')}
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Exempt chats in folders')}
+
+
+ {$i18n.t('Keep chats that are organized in folders or pinned')}
+
+
+
+
+
+ {/if}
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned chats')}
+
+
+ {$i18n.t('Delete orphaned chats from deleted users')}
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned folders')}
+
+
+ {$i18n.t('Delete orphaned folders from deleted users')}
+
+
+
+
+
+
+
+ {:else if activeSettingsTab === 'workspace'}
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned knowledge bases')}
+
+
+ {$i18n.t('Delete orphaned knowledge bases from deleted users')}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned tools')}
+
+
+ {$i18n.t('Delete orphaned custom tools from deleted users')}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{$i18n.t('Delete orphaned functions')}
+
+
+
+
+
+
{$i18n.t('Admin panel functions - all functions, including:')}
+
+
• {$i18n.t('Actions')}
+
• {$i18n.t('Pipes')}
+
• {$i18n.t('Filters')}
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned custom functions from deleted users')}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned prompts')}
+
+
+ {$i18n.t('Delete orphaned custom prompts from deleted users')}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned models')}
+
+
+ {$i18n.t('Delete orphaned custom models from deleted users')}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned notes')}
+
+
+ {$i18n.t('Delete orphaned notes from deleted users')}
+
+
+
+
+
+ {/if}
+
+
+
+
+
+
+
+
+
+ {$i18n.t('API Automation Helper')}
+
+
+
showApiPreview = !showApiPreview}
+ >
+
+
+
+ {showApiPreview ? $i18n.t('Hide API call') : $i18n.t('Show API call')}
+
+
+ {#if showApiPreview}
+
+
+ {$i18n.t('Use this API call configuration to automate pruning operations in your own maintenance scripts.')}
+
+
+
+ {/if}
+
+
+
+
+
+
+
+ (show = false)}
+ >
+ {$i18n.t('Cancel')}
+
+
+ {$i18n.t('Prune Data')}
+
+
+
+
+
From 028a2e598497f4f28d0b583a309911af0f17dc8f Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 12:47:19 +0200
Subject: [PATCH 02/43] Update prune.py
---
backend/open_webui/routers/prune.py | 62 ++++++++++++++++++++++++++++-
1 file changed, 61 insertions(+), 1 deletion(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index 78c333e538..d8b221e87d 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -47,6 +47,8 @@ class PruneDataForm(BaseModel):
delete_orphaned_models: bool = True
delete_orphaned_notes: bool = True
delete_orphaned_folders: bool = True
+ # Audio cache cleanup
+ audio_cache_max_age_days: Optional[int] = 30
def get_active_file_ids() -> Set[str]:
@@ -425,6 +427,57 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
log.info(f"Deleted {deleted_count} orphaned vector collections")
+def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
+ """
+ Clean up audio cache files older than specified days.
+
+ Args:
+ max_age_days: Delete audio files older than this many days. If None, skip audio cleanup.
+ """
+ if max_age_days is None:
+ log.info("Skipping audio cache cleanup (max_age_days is None)")
+ return
+
+ cutoff_time = time.time() - (max_age_days * 86400)
+ deleted_count = 0
+ total_size_deleted = 0
+
+ # Audio cache directories
+ audio_dirs = [
+ Path(CACHE_DIR) / "audio" / "speech",
+ Path(CACHE_DIR) / "audio" / "transcriptions"
+ ]
+
+ for audio_dir in audio_dirs:
+ if not audio_dir.exists():
+ log.debug(f"Audio directory does not exist: {audio_dir}")
+ continue
+
+ try:
+ for file_path in audio_dir.iterdir():
+ if not file_path.is_file():
+ continue
+
+ # Check file age
+ file_mtime = file_path.stat().st_mtime
+ if file_mtime < cutoff_time:
+ try:
+ file_size = file_path.stat().st_size
+ file_path.unlink()
+ deleted_count += 1
+ total_size_deleted += file_size
+ log.debug(f"Deleted old audio file: {file_path}")
+ except Exception as e:
+ log.error(f"Failed to delete audio file {file_path}: {e}")
+
+ except Exception as e:
+ log.error(f"Error cleaning audio directory {audio_dir}: {e}")
+
+ if deleted_count > 0:
+ size_mb = total_size_deleted / (1024 * 1024)
+ log.info(f"Deleted {deleted_count} audio cache files ({size_mb:.1f} MB), older than {max_age_days} days")
+
+
@router.post("/", response_model=bool)
async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
"""
@@ -456,6 +509,9 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
- If True: Delete notes from deleted users
- delete_orphaned_folders: bool = True
- If True: Delete folders from deleted users
+ - audio_cache_max_age_days: Optional[int] = 30
+ - If None: Skip audio cache cleanup
+ - If >= 0: Delete audio cache files (TTS, STT) older than specified days
"""
try:
log.info("Starting data pruning process")
@@ -650,7 +706,11 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
# Clean vector collections
cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)
- # Stage 5: Database optimization
+ # Stage 5: Audio cache cleanup
+ log.info("Cleaning audio cache")
+ cleanup_audio_cache(form_data.audio_cache_max_age_days)
+
+ # Stage 6: Database optimization
log.info("Optimizing database")
# Vacuum main database
From 0bd42e5c6d93d2bea2930041636124148a8b47d0 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 12:47:34 +0200
Subject: [PATCH 03/43] Update Database.svelte
---
.../components/admin/Settings/Database.svelte | 26 ++++++++++---------
1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/src/lib/components/admin/Settings/Database.svelte b/src/lib/components/admin/Settings/Database.svelte
index 19ec874746..736f201931 100644
--- a/src/lib/components/admin/Settings/Database.svelte
+++ b/src/lib/components/admin/Settings/Database.svelte
@@ -32,7 +32,8 @@
delete_orphaned_knowledge_bases,
delete_orphaned_models,
delete_orphaned_notes,
- delete_orphaned_folders
+ delete_orphaned_folders,
+ audio_cache_max_age_days
} = event.detail;
const res = await pruneData(
@@ -47,7 +48,8 @@
delete_orphaned_knowledge_bases,
delete_orphaned_models,
delete_orphaned_notes,
- delete_orphaned_folders
+ delete_orphaned_folders,
+ audio_cache_max_age_days
).catch((error) => {
toast.error(`${error}`);
return null;
@@ -243,15 +245,15 @@
-
-
-
- {$i18n.t('Export Users')}
-
-
- {/if}
+ clip-rule="evenodd"
+ />
+
+
+
+ {$i18n.t('Export Users')}
+
+
+ {/if}
-->
-
\ No newline at end of file
+
From 5ce002d5b3745f3eeb46cd614897d4f9a0efc6f8 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 12:47:51 +0200
Subject: [PATCH 04/43] Update PruneDataDialog.svelte
---
.../components/common/PruneDataDialog.svelte | 83 +++++++++++++++++--
1 file changed, 75 insertions(+), 8 deletions(-)
diff --git a/src/lib/components/common/PruneDataDialog.svelte b/src/lib/components/common/PruneDataDialog.svelte
index 10a29d2594..1dd11e984a 100644
--- a/src/lib/components/common/PruneDataDialog.svelte
+++ b/src/lib/components/common/PruneDataDialog.svelte
@@ -22,6 +22,10 @@
let delete_orphaned_notes = true;
let delete_orphaned_folders = true;
+ // Audio cache cleanup
+ let cleanupAudioCache = true;
+ let audio_cache_max_age_days = 30;
+
let showDetailsExpanded = false;
let activeDetailsTab = 'chats';
let activeSettingsTab = 'chats';
@@ -41,7 +45,8 @@
delete_orphaned_knowledge_bases,
delete_orphaned_models,
delete_orphaned_notes,
- delete_orphaned_folders
+ delete_orphaned_folders,
+ audio_cache_max_age_days: cleanupAudioCache ? audio_cache_max_age_days : null
});
show = false;
};
@@ -62,7 +67,8 @@ Authorization: Bearer
"delete_orphaned_knowledge_bases": ${delete_orphaned_knowledge_bases},
"delete_orphaned_models": ${delete_orphaned_models},
"delete_orphaned_notes": ${delete_orphaned_notes},
- "delete_orphaned_folders": ${delete_orphaned_folders}
+ "delete_orphaned_folders": ${delete_orphaned_folders},
+ "audio_cache_max_age_days": ${cleanupAudioCache ? audio_cache_max_age_days : null}
}`;
const copyApiCall = () => {
@@ -207,10 +213,10 @@ Authorization: Bearer
{:else if activeDetailsTab === 'imagesaudio'}
{$i18n.t('Images & Audio Content Cleanup:')}
-
• {$i18n.t('TBD - Image cleanup functionality')}
-
• {$i18n.t('TBD - Audio cleanup functionality')}
-
• {$i18n.t('TBD - Orphaned images and audio files')}
-
• {$i18n.t('TBD - Media processing cache cleanup')}
+
• {$i18n.t('Generated images: Already integrated with file system - orphaned images are automatically cleaned up when chats are deleted')}
+
• {$i18n.t('Uploaded images: Already integrated with file system - orphaned images are automatically cleaned up based on active references')}
+
• {$i18n.t('Audio cache cleanup: Remove old text-to-speech (TTS) generated audio files and speech-to-text (STT) transcription files')}
+
• {$i18n.t('Audio recordings and transcriptions: Clean up cached audio files older than specified days')}
{:else if activeDetailsTab === 'system'}
@@ -261,7 +267,7 @@ Authorization: Bearer
{$i18n.t('Configure what data should be cleaned up during the pruning process.')}
-
+
>
{$i18n.t('Workspace')}
+ activeSettingsTab = 'audio'}
+ >
+ {$i18n.t('Audio Cache')}
+
-
+
{#if activeSettingsTab === 'chats'}
@@ -508,6 +520,61 @@ Authorization: Bearer
+
+ {:else if activeSettingsTab === 'audio'}
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Clean audio cache')}
+
+
+ {$i18n.t('Remove old audio cache files (TTS and STT recordings)')}
+
+
+
+
+
+
+ {#if cleanupAudioCache}
+
+
+
+ {$i18n.t('Delete audio files older than')}
+
+
+
+ {$i18n.t('days')}
+
+
+ {$i18n.t('Remove cached TTS (text-to-speech) and STT (speech-to-text) files older than specified days')}
+
+
+
+
+
+ {$i18n.t('Audio Cache Types:')}
+
+
+
• {$i18n.t('TTS Files:')} {$i18n.t('Generated audio files when AI speaks text to you')}
+
• {$i18n.t('STT Files:')} {$i18n.t('Uploaded audio files for transcription (voice messages)')}
+
• {$i18n.t('Metadata:')} {$i18n.t('Associated JSON files with transcription data')}
+
+
+
+ {/if}
+
{/if}
From 8d7273afaeb64e144b3cf91a26d2553df4db405a Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 12:48:05 +0200
Subject: [PATCH 05/43] Update prune.ts
---
src/lib/apis/prune.ts | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/lib/apis/prune.ts b/src/lib/apis/prune.ts
index d95d662438..8413ca24c0 100644
--- a/src/lib/apis/prune.ts
+++ b/src/lib/apis/prune.ts
@@ -12,7 +12,8 @@ export const pruneData = async (
delete_orphaned_knowledge_bases: boolean = true,
delete_orphaned_models: boolean = true,
delete_orphaned_notes: boolean = true,
- delete_orphaned_folders: boolean = true
+ delete_orphaned_folders: boolean = true,
+ audio_cache_max_age_days: number | null = 30
) => {
let error = null;
@@ -33,7 +34,8 @@ export const pruneData = async (
delete_orphaned_knowledge_bases,
delete_orphaned_models,
delete_orphaned_notes,
- delete_orphaned_folders
+ delete_orphaned_folders,
+ audio_cache_max_age_days
})
})
.then(async (res) => {
From e4a0bd86405d9eb7ba613e3401c221d9733ab35b Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:15:38 +0200
Subject: [PATCH 06/43] Update Database.svelte
---
.../components/admin/Settings/Database.svelte | 23 ++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/lib/components/admin/Settings/Database.svelte b/src/lib/components/admin/Settings/Database.svelte
index 736f201931..1ee2d79325 100644
--- a/src/lib/components/admin/Settings/Database.svelte
+++ b/src/lib/components/admin/Settings/Database.svelte
@@ -1,6 +1,7 @@
+
From 709c852917ca3e03c9af7434460943eee3508f69 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:20:16 +0200
Subject: [PATCH 08/43] Update prune.py
---
backend/open_webui/routers/prune.py | 136 +++-------------------------
1 file changed, 12 insertions(+), 124 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index d8b221e87d..ca38951832 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -38,7 +38,6 @@ class PruneDataForm(BaseModel):
days: Optional[int] = None
exempt_archived_chats: bool = False
exempt_chats_in_folders: bool = False
- # Orphaned resource deletion toggles (for deleted users)
delete_orphaned_chats: bool = True
delete_orphaned_tools: bool = False
delete_orphaned_functions: bool = False
@@ -47,19 +46,17 @@ class PruneDataForm(BaseModel):
delete_orphaned_models: bool = True
delete_orphaned_notes: bool = True
delete_orphaned_folders: bool = True
- # Audio cache cleanup
audio_cache_max_age_days: Optional[int] = 30
def get_active_file_ids() -> Set[str]:
"""
Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
- This is the ground truth for what files should be preserved.
"""
active_file_ids = set()
try:
- # 1. Get files referenced by knowledge bases (original logic)
+ # Scan knowledge bases for file references
knowledge_bases = Knowledges.get_knowledge_bases()
log.debug(f"Found {len(knowledge_bases)} knowledge bases")
@@ -67,15 +64,12 @@ def get_active_file_ids() -> Set[str]:
if not kb.data:
continue
- # Handle different possible data structures for file references
file_ids = []
- # Check for file_ids array
if isinstance(kb.data, dict) and "file_ids" in kb.data:
if isinstance(kb.data["file_ids"], list):
file_ids.extend(kb.data["file_ids"])
- # Check for files array with id field
if isinstance(kb.data, dict) and "files" in kb.data:
if isinstance(kb.data["files"], list):
for file_ref in kb.data["files"]:
@@ -84,13 +78,11 @@ def get_active_file_ids() -> Set[str]:
elif isinstance(file_ref, str):
file_ids.append(file_ref)
- # Add all found file IDs
for file_id in file_ids:
if isinstance(file_id, str) and file_id.strip():
active_file_ids.add(file_id.strip())
- log.debug(f"KB {kb.id} references file {file_id}")
- # 2. Get files referenced in chats (NEW: scan chat JSON for file references)
+ # Scan chats for file references
chats = Chats.get_chats()
log.debug(f"Found {len(chats)} chats to scan for file references")
@@ -99,40 +91,31 @@ def get_active_file_ids() -> Set[str]:
continue
try:
- # Convert entire chat JSON to string and extract all file IDs
chat_json_str = json.dumps(chat.chat)
- # Find all file ID patterns in the JSON
- # Pattern 1: "id": "uuid" where uuid looks like a file ID
+ # Extract file IDs using regex patterns
file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
- potential_file_ids = file_id_pattern.findall(chat_json_str)
-
- # Pattern 2: URLs containing /api/v1/files/uuid
url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+
+ potential_file_ids = file_id_pattern.findall(chat_json_str)
url_file_ids = url_pattern.findall(chat_json_str)
- # Combine and validate against actual file records
all_potential_ids = set(potential_file_ids + url_file_ids)
for file_id in all_potential_ids:
- # Verify this ID exists in the file table to avoid false positives
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
- log.debug(f"Chat {chat.id}: Found active file {file_id}")
except Exception as e:
log.debug(f"Error processing chat {chat.id} for file references: {e}")
- # 3. Get files referenced in folders (scan folder.items, folder.data, folder.meta)
+ # Scan folders for file references
try:
folders = Folders.get_all_folders()
- log.debug(f"Found {len(folders)} folders to scan for file references")
for folder in folders:
- # Check folder.items JSON
if folder.items:
try:
items_str = json.dumps(folder.items)
- # Look for file ID patterns in the JSON
file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
@@ -140,11 +123,9 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
- log.debug(f"Folder {folder.id}: Found file {file_id} in items")
except Exception as e:
log.debug(f"Error processing folder {folder.id} items: {e}")
- # Check folder.data JSON
if hasattr(folder, 'data') and folder.data:
try:
data_str = json.dumps(folder.data)
@@ -155,24 +136,20 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
- log.debug(f"Folder {folder.id}: Found file {file_id} in data")
except Exception as e:
log.debug(f"Error processing folder {folder.id} data: {e}")
except Exception as e:
log.debug(f"Error scanning folders for file references: {e}")
- # 4. Get files referenced in standalone messages (message table)
+ # Scan standalone messages for file references
try:
- # Query message table directly since we may not have a Messages model
with get_db() as db:
message_results = db.execute(text("SELECT id, data FROM message WHERE data IS NOT NULL")).fetchall()
- log.debug(f"Found {len(message_results)} messages with data to scan")
for message_id, message_data_json in message_results:
if message_data_json:
try:
- # Convert JSON to string and scan for file patterns
data_str = json.dumps(message_data_json) if isinstance(message_data_json, dict) else str(message_data_json)
file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
@@ -182,7 +159,6 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
- log.debug(f"Message {message_id}: Found file {file_id}")
except Exception as e:
log.debug(f"Error processing message {message_id} data: {e}")
except Exception as e:
@@ -190,7 +166,6 @@ def get_active_file_ids() -> Set[str]:
except Exception as e:
log.error(f"Error determining active file IDs: {e}")
- # Fail safe: return empty set, which will prevent deletion
return set()
log.info(f"Found {len(active_file_ids)} active file IDs")
@@ -202,19 +177,15 @@ def safe_delete_vector_collection(collection_name: str) -> bool:
Safely delete a vector collection, handling both logical and physical cleanup.
"""
try:
- # First, try to delete the collection through the client
try:
VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
- log.debug(f"Deleted collection from vector DB: {collection_name}")
except Exception as e:
log.debug(f"Collection {collection_name} may not exist in DB: {e}")
- # Then, handle physical cleanup for ChromaDB
if "chroma" in VECTOR_DB.lower():
vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name
if vector_dir.exists() and vector_dir.is_dir():
shutil.rmtree(vector_dir)
- log.debug(f"Deleted physical vector directory: {vector_dir}")
return True
return True
@@ -229,19 +200,14 @@ def safe_delete_file_by_id(file_id: str) -> bool:
Safely delete a file record and its associated vector collection.
"""
try:
- # Get file info before deletion
file_record = Files.get_file_by_id(file_id)
if not file_record:
- log.debug(f"File {file_id} not found in database")
- return True # Already gone
+ return True
- # Delete vector collection first
collection_name = f"file-{file_id}"
safe_delete_vector_collection(collection_name)
- # Delete database record
Files.delete_file_by_id(file_id)
- log.debug(f"Deleted file record: {file_id}")
return True
@@ -256,7 +222,6 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
"""
upload_dir = Path(CACHE_DIR).parent / "uploads"
if not upload_dir.exists():
- log.debug("Uploads directory does not exist")
return
deleted_count = 0
@@ -267,33 +232,27 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
continue
filename = file_path.name
-
- # Extract file ID from filename (common patterns)
file_id = None
- # Pattern 1: UUID_filename or UUID-filename
+ # Extract file ID from filename patterns
if len(filename) > 36:
potential_id = filename[:36]
- if potential_id.count('-') == 4: # UUID format
+ if potential_id.count('-') == 4:
file_id = potential_id
- # Pattern 2: filename might be the file ID itself
if not file_id and filename.count('-') == 4 and len(filename) == 36:
file_id = filename
- # Pattern 3: Check if any part of filename matches active IDs
if not file_id:
for active_id in active_file_ids:
if active_id in filename:
file_id = active_id
break
- # If we found a potential file ID and it's not active, delete it
if file_id and file_id not in active_file_ids:
try:
file_path.unlink()
deleted_count += 1
- log.debug(f"Deleted orphaned upload file: {filename}")
except Exception as e:
log.error(f"Failed to delete upload file {filename}: {e}")
@@ -313,73 +272,46 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
vector_dir = Path(CACHE_DIR).parent / "vector_db"
if not vector_dir.exists():
- log.debug("Vector DB directory does not exist")
return
chroma_db_path = vector_dir / "chroma.sqlite3"
if not chroma_db_path.exists():
- log.debug("ChromaDB metadata file does not exist")
return
- # Build expected collection names
expected_collections = set()
- # File collections: file-{file_id}
for file_id in active_file_ids:
expected_collections.add(f"file-{file_id}")
- # Knowledge base collections: {kb_id}
for kb_id in active_kb_ids:
expected_collections.add(kb_id)
- log.debug(f"Expected collections to preserve: {expected_collections}")
-
- # Query ChromaDB metadata to get the complete mapping chain:
- # Directory UUID -> Collection ID -> Collection Name
uuid_to_collection = {}
try:
import sqlite3
- log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}")
with sqlite3.connect(str(chroma_db_path)) as conn:
- # First, check what tables exist
- tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
- log.debug(f"ChromaDB tables: {tables}")
-
- # Check the schema of collections table
- schema = conn.execute("PRAGMA table_info(collections)").fetchall()
- log.debug(f"Collections table schema: {schema}")
-
- # Get Collection ID -> Collection Name mapping
collection_id_to_name = {}
cursor = conn.execute("SELECT id, name FROM collections")
rows = cursor.fetchall()
- log.debug(f"Raw ChromaDB collections query results: {rows}")
for row in rows:
collection_id, collection_name = row
collection_id_to_name[collection_id] = collection_name
- log.debug(f"Mapped collection ID {collection_id} -> name {collection_name}")
- # Get Directory UUID -> Collection ID mapping from segments table
- # Only interested in VECTOR segments as those are the actual data directories
cursor = conn.execute("SELECT id, collection FROM segments WHERE scope = 'VECTOR'")
segment_rows = cursor.fetchall()
- log.debug(f"Raw ChromaDB segments query results: {segment_rows}")
for row in segment_rows:
segment_id, collection_id = row
if collection_id in collection_id_to_name:
collection_name = collection_id_to_name[collection_id]
uuid_to_collection[segment_id] = collection_name
- log.debug(f"Mapped directory UUID {segment_id} -> collection {collection_name}")
- log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}")
log.info(f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata")
except Exception as e:
log.error(f"Error reading ChromaDB metadata: {e}")
- # Fail safe: don't delete anything if we can't read metadata
return
deleted_count = 0
@@ -391,16 +323,12 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
dir_uuid = collection_dir.name
- # Skip system/metadata files
if dir_uuid.startswith('.'):
continue
- # Get the actual collection name from metadata
collection_name = uuid_to_collection.get(dir_uuid)
if collection_name is None:
- # Directory exists but no metadata entry - it's orphaned
- log.debug(f"Directory {dir_uuid} has no metadata entry, deleting")
try:
shutil.rmtree(collection_dir)
deleted_count += 1
@@ -408,17 +336,11 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}")
elif collection_name not in expected_collections:
- # Collection exists but should be deleted
- log.debug(f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting")
try:
shutil.rmtree(collection_dir)
deleted_count += 1
except Exception as e:
log.error(f"Failed to delete collection directory {dir_uuid}: {e}")
-
- else:
- # Collection should be preserved
- log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})")
except Exception as e:
log.error(f"Error cleaning vector collections: {e}")
@@ -430,9 +352,6 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
"""
Clean up audio cache files older than specified days.
-
- Args:
- max_age_days: Delete audio files older than this many days. If None, skip audio cleanup.
"""
if max_age_days is None:
log.info("Skipping audio cache cleanup (max_age_days is None)")
@@ -442,7 +361,6 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
deleted_count = 0
total_size_deleted = 0
- # Audio cache directories
audio_dirs = [
Path(CACHE_DIR) / "audio" / "speech",
Path(CACHE_DIR) / "audio" / "transcriptions"
@@ -450,7 +368,6 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
for audio_dir in audio_dirs:
if not audio_dir.exists():
- log.debug(f"Audio directory does not exist: {audio_dir}")
continue
try:
@@ -458,7 +375,6 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
if not file_path.is_file():
continue
- # Check file age
file_mtime = file_path.stat().st_mtime
if file_mtime < cutoff_time:
try:
@@ -466,7 +382,6 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
file_path.unlink()
deleted_count += 1
total_size_deleted += file_size
- log.debug(f"Deleted old audio file: {file_path}")
except Exception as e:
log.error(f"Failed to delete audio file {file_path}: {e}")
@@ -516,23 +431,17 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
try:
log.info("Starting data pruning process")
- # Stage 1: Delete old chats based on user criteria (optional)
+ # Stage 1: Delete old chats based on user criteria
if form_data.days is not None:
cutoff_time = int(time.time()) - (form_data.days * 86400)
chats_to_delete = []
for chat in Chats.get_chats():
if chat.updated_at < cutoff_time:
- # Check exemption conditions
if form_data.exempt_archived_chats and chat.archived:
- log.debug(f"Exempting archived chat: {chat.id}")
continue
if form_data.exempt_chats_in_folders and (getattr(chat, 'folder_id', None) is not None or getattr(chat, 'pinned', False)):
- folder_status = f"folder_id: {getattr(chat, 'folder_id', None)}" if getattr(chat, 'folder_id', None) else "not in folder"
- pinned_status = f"pinned: {getattr(chat, 'pinned', False)}"
- log.debug(f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})")
continue
- log.debug(f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}")
chats_to_delete.append(chat)
if chats_to_delete:
@@ -544,14 +453,12 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping chat deletion (days parameter is None)")
- # Stage 2: Build ground truth of what should be preserved
+ # Stage 2: Build preservation set
log.info("Building preservation set")
- # Get all active users
active_user_ids = {user.id for user in Users.get_users()["users"]}
log.info(f"Found {len(active_user_ids)} active users")
- # Get all active knowledge bases and their file references
active_kb_ids = set()
knowledge_bases = Knowledges.get_knowledge_bases()
@@ -561,13 +468,11 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Found {len(active_kb_ids)} active knowledge bases")
- # Get all files that should be preserved (NOW COMPREHENSIVE!)
active_file_ids = get_active_file_ids()
# Stage 3: Delete orphaned database records
log.info("Deleting orphaned database records")
- # Delete files not referenced by any knowledge base or belonging to deleted users
deleted_files = 0
for file_record in Files.get_files():
should_delete = (
@@ -582,7 +487,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
if deleted_files > 0:
log.info(f"Deleted {deleted_files} orphaned files")
- # Delete knowledge bases from deleted users (if enabled)
deleted_kbs = 0
if form_data.delete_orphaned_knowledge_bases:
for kb in knowledge_bases:
@@ -596,10 +500,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping knowledge base deletion (disabled)")
- # Delete other user-owned resources from deleted users (conditional)
deleted_others = 0
- # Delete orphaned chats of deleted users (conditional)
if form_data.delete_orphaned_chats:
chats_deleted = 0
for chat in Chats.get_chats():
@@ -612,7 +514,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping orphaned chat deletion (disabled)")
- # Delete orphaned tools of deleted users (conditional)
if form_data.delete_orphaned_tools:
tools_deleted = 0
for tool in Tools.get_tools():
@@ -625,7 +526,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping tool deletion (disabled)")
- # Delete orphaned functions of deleted users (conditional)
if form_data.delete_orphaned_functions:
functions_deleted = 0
for function in Functions.get_functions():
@@ -638,7 +538,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping function deletion (disabled)")
- # Delete orphaned notes of deleted users (conditional)
if form_data.delete_orphaned_notes:
notes_deleted = 0
for note in Notes.get_notes():
@@ -651,7 +550,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping note deletion (disabled)")
- # Delete orphaned prompts of deleted users (conditional)
if form_data.delete_orphaned_prompts:
prompts_deleted = 0
for prompt in Prompts.get_prompts():
@@ -664,7 +562,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping prompt deletion (disabled)")
- # Delete orphaned models of deleted users (conditional)
if form_data.delete_orphaned_models:
models_deleted = 0
for model in Models.get_all_models():
@@ -677,7 +574,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping model deletion (disabled)")
- # Delete orphaned folders of deleted users (conditional)
if form_data.delete_orphaned_folders:
folders_deleted = 0
for folder in Folders.get_all_folders():
@@ -696,14 +592,10 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
# Stage 4: Clean up orphaned physical files
log.info("Cleaning up orphaned physical files")
- # Rebuild active sets after database cleanup
final_active_file_ids = get_active_file_ids()
final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()}
- # Clean uploads directory
cleanup_orphaned_uploads(final_active_file_ids)
-
- # Clean vector collections
cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)
# Stage 5: Audio cache cleanup
@@ -713,15 +605,12 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
# Stage 6: Database optimization
log.info("Optimizing database")
- # Vacuum main database
try:
with get_db() as db:
db.execute(text("VACUUM"))
- log.debug("Vacuumed main database")
except Exception as e:
log.error(f"Failed to vacuum main database: {e}")
- # Vacuum ChromaDB database if it exists
if "chroma" in VECTOR_DB.lower():
chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
if chroma_db_path.exists():
@@ -729,7 +618,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
import sqlite3
with sqlite3.connect(str(chroma_db_path)) as conn:
conn.execute("VACUUM")
- log.debug("Vacuumed ChromaDB database")
except Exception as e:
log.error(f"Failed to vacuum ChromaDB database: {e}")
From 34c9a8825cf3802318c73829a569eb57780ab352 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 14:54:54 +0200
Subject: [PATCH 09/43] Update prune.py
---
backend/open_webui/routers/prune.py | 479 +++++++++++++++++-----------
1 file changed, 299 insertions(+), 180 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index ca38951832..427c9586bd 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -38,6 +38,7 @@ class PruneDataForm(BaseModel):
days: Optional[int] = None
exempt_archived_chats: bool = False
exempt_chats_in_folders: bool = False
+ # Orphaned resource deletion toggles (for deleted users)
delete_orphaned_chats: bool = True
delete_orphaned_tools: bool = False
delete_orphaned_functions: bool = False
@@ -46,30 +47,33 @@ class PruneDataForm(BaseModel):
delete_orphaned_models: bool = True
delete_orphaned_notes: bool = True
delete_orphaned_folders: bool = True
- audio_cache_max_age_days: Optional[int] = 30
def get_active_file_ids() -> Set[str]:
"""
Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
+ This is the ground truth for what files should be preserved.
"""
active_file_ids = set()
-
+
try:
- # Scan knowledge bases for file references
+ # 1. Get files referenced by knowledge bases (original logic)
knowledge_bases = Knowledges.get_knowledge_bases()
log.debug(f"Found {len(knowledge_bases)} knowledge bases")
-
+
for kb in knowledge_bases:
if not kb.data:
continue
-
+
+ # Handle different possible data structures for file references
file_ids = []
-
+
+ # Check for file_ids array
if isinstance(kb.data, dict) and "file_ids" in kb.data:
if isinstance(kb.data["file_ids"], list):
file_ids.extend(kb.data["file_ids"])
-
+
+ # Check for files array with id field
if isinstance(kb.data, dict) and "files" in kb.data:
if isinstance(kb.data["files"], list):
for file_ref in kb.data["files"]:
@@ -77,97 +81,152 @@ def get_active_file_ids() -> Set[str]:
file_ids.append(file_ref["id"])
elif isinstance(file_ref, str):
file_ids.append(file_ref)
-
+
+ # Add all found file IDs
for file_id in file_ids:
if isinstance(file_id, str) and file_id.strip():
active_file_ids.add(file_id.strip())
+ log.debug(f"KB {kb.id} references file {file_id}")
- # Scan chats for file references
+ # 2. Get files referenced in chats (NEW: scan chat JSON for file references)
chats = Chats.get_chats()
log.debug(f"Found {len(chats)} chats to scan for file references")
-
+
for chat in chats:
if not chat.chat or not isinstance(chat.chat, dict):
continue
-
+
try:
+ # Convert entire chat JSON to string and extract all file IDs
chat_json_str = json.dumps(chat.chat)
-
- # Extract file IDs using regex patterns
- file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
- url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
-
+
+ # Find all file ID patterns in the JSON
+ # Pattern 1: "id": "uuid" where uuid looks like a file ID
+ file_id_pattern = re.compile(
+ r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+ )
potential_file_ids = file_id_pattern.findall(chat_json_str)
+
+ # Pattern 2: URLs containing /api/v1/files/uuid
+ url_pattern = re.compile(
+ r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+ )
url_file_ids = url_pattern.findall(chat_json_str)
-
+
+ # Combine and validate against actual file records
all_potential_ids = set(potential_file_ids + url_file_ids)
for file_id in all_potential_ids:
+ # Verify this ID exists in the file table to avoid false positives
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
-
+ log.debug(f"Chat {chat.id}: Found active file {file_id}")
+
except Exception as e:
log.debug(f"Error processing chat {chat.id} for file references: {e}")
- # Scan folders for file references
+ # 3. Get files referenced in folders (scan folder.items, folder.data, folder.meta)
try:
folders = Folders.get_all_folders()
-
+ log.debug(f"Found {len(folders)} folders to scan for file references")
+
for folder in folders:
+ # Check folder.items JSON
if folder.items:
try:
items_str = json.dumps(folder.items)
- file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
- url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
-
- potential_ids = file_id_pattern.findall(items_str) + url_pattern.findall(items_str)
+ # Look for file ID patterns in the JSON
+ file_id_pattern = re.compile(
+ r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+ )
+ url_pattern = re.compile(
+ r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+ )
+
+ potential_ids = file_id_pattern.findall(
+ items_str
+ ) + url_pattern.findall(items_str)
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
+ log.debug(
+ f"Folder {folder.id}: Found file {file_id} in items"
+ )
except Exception as e:
log.debug(f"Error processing folder {folder.id} items: {e}")
-
- if hasattr(folder, 'data') and folder.data:
+
+ # Check folder.data JSON
+ if hasattr(folder, "data") and folder.data:
try:
data_str = json.dumps(folder.data)
- file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
- url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
-
- potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str)
+ file_id_pattern = re.compile(
+ r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+ )
+ url_pattern = re.compile(
+ r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+ )
+
+ potential_ids = file_id_pattern.findall(
+ data_str
+ ) + url_pattern.findall(data_str)
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
+ log.debug(
+ f"Folder {folder.id}: Found file {file_id} in data"
+ )
except Exception as e:
log.debug(f"Error processing folder {folder.id} data: {e}")
-
+
except Exception as e:
log.debug(f"Error scanning folders for file references: {e}")
- # Scan standalone messages for file references
+ # 4. Get files referenced in standalone messages (message table)
try:
+ # Query message table directly since we may not have a Messages model
with get_db() as db:
- message_results = db.execute(text("SELECT id, data FROM message WHERE data IS NOT NULL")).fetchall()
-
+ message_results = db.execute(
+ text("SELECT id, data FROM message WHERE data IS NOT NULL")
+ ).fetchall()
+ log.debug(f"Found {len(message_results)} messages with data to scan")
+
for message_id, message_data_json in message_results:
if message_data_json:
try:
- data_str = json.dumps(message_data_json) if isinstance(message_data_json, dict) else str(message_data_json)
-
- file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
- url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
-
- potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str)
+ # Convert JSON to string and scan for file patterns
+ data_str = (
+ json.dumps(message_data_json)
+ if isinstance(message_data_json, dict)
+ else str(message_data_json)
+ )
+
+ file_id_pattern = re.compile(
+ r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+ )
+ url_pattern = re.compile(
+ r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+ )
+
+ potential_ids = file_id_pattern.findall(
+ data_str
+ ) + url_pattern.findall(data_str)
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
+ log.debug(
+ f"Message {message_id}: Found file {file_id}"
+ )
except Exception as e:
- log.debug(f"Error processing message {message_id} data: {e}")
+ log.debug(
+ f"Error processing message {message_id} data: {e}"
+ )
except Exception as e:
log.debug(f"Error scanning messages for file references: {e}")
-
+
except Exception as e:
log.error(f"Error determining active file IDs: {e}")
+ # Fail safe: return empty set, which will prevent deletion
return set()
-
+
log.info(f"Found {len(active_file_ids)} active file IDs")
return active_file_ids
@@ -177,19 +236,23 @@ def safe_delete_vector_collection(collection_name: str) -> bool:
Safely delete a vector collection, handling both logical and physical cleanup.
"""
try:
+ # First, try to delete the collection through the client
try:
VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
+ log.debug(f"Deleted collection from vector DB: {collection_name}")
except Exception as e:
log.debug(f"Collection {collection_name} may not exist in DB: {e}")
-
+
+ # Then, handle physical cleanup for ChromaDB
if "chroma" in VECTOR_DB.lower():
vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name
if vector_dir.exists() and vector_dir.is_dir():
shutil.rmtree(vector_dir)
+ log.debug(f"Deleted physical vector directory: {vector_dir}")
return True
-
+
return True
-
+
except Exception as e:
log.error(f"Error deleting vector collection {collection_name}: {e}")
return False
@@ -200,17 +263,22 @@ def safe_delete_file_by_id(file_id: str) -> bool:
Safely delete a file record and its associated vector collection.
"""
try:
+ # Get file info before deletion
file_record = Files.get_file_by_id(file_id)
if not file_record:
- return True
-
+ log.debug(f"File {file_id} not found in database")
+ return True # Already gone
+
+ # Delete vector collection first
collection_name = f"file-{file_id}"
safe_delete_vector_collection(collection_name)
-
+
+ # Delete database record
Files.delete_file_by_id(file_id)
-
+ log.debug(f"Deleted file record: {file_id}")
+
return True
-
+
except Exception as e:
log.error(f"Error deleting file {file_id}: {e}")
return False
@@ -222,182 +290,197 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
"""
upload_dir = Path(CACHE_DIR).parent / "uploads"
if not upload_dir.exists():
+ log.debug("Uploads directory does not exist")
return
-
+
deleted_count = 0
-
+
try:
for file_path in upload_dir.iterdir():
if not file_path.is_file():
continue
-
+
filename = file_path.name
+
+ # Extract file ID from filename (common patterns)
file_id = None
-
- # Extract file ID from filename patterns
+
+ # Pattern 1: UUID_filename or UUID-filename
if len(filename) > 36:
potential_id = filename[:36]
- if potential_id.count('-') == 4:
+ if potential_id.count("-") == 4: # UUID format
file_id = potential_id
-
- if not file_id and filename.count('-') == 4 and len(filename) == 36:
+
+ # Pattern 2: filename might be the file ID itself
+ if not file_id and filename.count("-") == 4 and len(filename) == 36:
file_id = filename
-
+
+ # Pattern 3: Check if any part of filename matches active IDs
if not file_id:
for active_id in active_file_ids:
if active_id in filename:
file_id = active_id
break
-
+
+ # If we found a potential file ID and it's not active, delete it
if file_id and file_id not in active_file_ids:
try:
file_path.unlink()
deleted_count += 1
+ log.debug(f"Deleted orphaned upload file: {filename}")
except Exception as e:
log.error(f"Failed to delete upload file {filename}: {e}")
-
+
except Exception as e:
log.error(f"Error cleaning uploads directory: {e}")
-
+
if deleted_count > 0:
log.info(f"Deleted {deleted_count} orphaned upload files")
-def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids: Set[str]) -> None:
+def cleanup_orphaned_vector_collections(
+ active_file_ids: Set[str], active_kb_ids: Set[str]
+) -> None:
"""
Clean up orphaned vector collections by querying ChromaDB metadata.
"""
if "chroma" not in VECTOR_DB.lower():
return
-
+
vector_dir = Path(CACHE_DIR).parent / "vector_db"
if not vector_dir.exists():
+ log.debug("Vector DB directory does not exist")
return
-
+
chroma_db_path = vector_dir / "chroma.sqlite3"
if not chroma_db_path.exists():
+ log.debug("ChromaDB metadata file does not exist")
return
-
+
+ # Build expected collection names
expected_collections = set()
-
+
+ # File collections: file-{file_id}
for file_id in active_file_ids:
expected_collections.add(f"file-{file_id}")
-
+
+ # Knowledge base collections: {kb_id}
for kb_id in active_kb_ids:
expected_collections.add(kb_id)
-
+
+ log.debug(f"Expected collections to preserve: {expected_collections}")
+
+ # Query ChromaDB metadata to get the complete mapping chain:
+ # Directory UUID -> Collection ID -> Collection Name
uuid_to_collection = {}
try:
import sqlite3
-
+
+ log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}")
+
with sqlite3.connect(str(chroma_db_path)) as conn:
+ # First, check what tables exist
+ tables = conn.execute(
+ "SELECT name FROM sqlite_master WHERE type='table'"
+ ).fetchall()
+ log.debug(f"ChromaDB tables: {tables}")
+
+ # Check the schema of collections table
+ schema = conn.execute("PRAGMA table_info(collections)").fetchall()
+ log.debug(f"Collections table schema: {schema}")
+
+ # Get Collection ID -> Collection Name mapping
collection_id_to_name = {}
cursor = conn.execute("SELECT id, name FROM collections")
rows = cursor.fetchall()
-
+ log.debug(f"Raw ChromaDB collections query results: {rows}")
+
for row in rows:
collection_id, collection_name = row
collection_id_to_name[collection_id] = collection_name
-
- cursor = conn.execute("SELECT id, collection FROM segments WHERE scope = 'VECTOR'")
+ log.debug(
+ f"Mapped collection ID {collection_id} -> name {collection_name}"
+ )
+
+ # Get Directory UUID -> Collection ID mapping from segments table
+ # Only interested in VECTOR segments as those are the actual data directories
+ cursor = conn.execute(
+ "SELECT id, collection FROM segments WHERE scope = 'VECTOR'"
+ )
segment_rows = cursor.fetchall()
-
+ log.debug(f"Raw ChromaDB segments query results: {segment_rows}")
+
for row in segment_rows:
segment_id, collection_id = row
if collection_id in collection_id_to_name:
collection_name = collection_id_to_name[collection_id]
uuid_to_collection[segment_id] = collection_name
-
- log.info(f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata")
-
+ log.debug(
+ f"Mapped directory UUID {segment_id} -> collection {collection_name}"
+ )
+
+ log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}")
+ log.info(
+ f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata"
+ )
+
except Exception as e:
log.error(f"Error reading ChromaDB metadata: {e}")
+ # Fail safe: don't delete anything if we can't read metadata
return
-
+
deleted_count = 0
-
+
try:
for collection_dir in vector_dir.iterdir():
if not collection_dir.is_dir():
continue
-
+
dir_uuid = collection_dir.name
-
- if dir_uuid.startswith('.'):
+
+ # Skip system/metadata files
+ if dir_uuid.startswith("."):
continue
-
+
+ # Get the actual collection name from metadata
collection_name = uuid_to_collection.get(dir_uuid)
-
+
if collection_name is None:
+ # Directory exists but no metadata entry - it's orphaned
+ log.debug(f"Directory {dir_uuid} has no metadata entry, deleting")
try:
shutil.rmtree(collection_dir)
deleted_count += 1
except Exception as e:
log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}")
-
+
elif collection_name not in expected_collections:
+ # Collection exists but should be deleted
+ log.debug(
+ f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting"
+ )
try:
shutil.rmtree(collection_dir)
deleted_count += 1
except Exception as e:
log.error(f"Failed to delete collection directory {dir_uuid}: {e}")
-
+
+ else:
+ # Collection should be preserved
+ log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})")
+
except Exception as e:
log.error(f"Error cleaning vector collections: {e}")
-
+
if deleted_count > 0:
log.info(f"Deleted {deleted_count} orphaned vector collections")
-def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
- """
- Clean up audio cache files older than specified days.
- """
- if max_age_days is None:
- log.info("Skipping audio cache cleanup (max_age_days is None)")
- return
-
- cutoff_time = time.time() - (max_age_days * 86400)
- deleted_count = 0
- total_size_deleted = 0
-
- audio_dirs = [
- Path(CACHE_DIR) / "audio" / "speech",
- Path(CACHE_DIR) / "audio" / "transcriptions"
- ]
-
- for audio_dir in audio_dirs:
- if not audio_dir.exists():
- continue
-
- try:
- for file_path in audio_dir.iterdir():
- if not file_path.is_file():
- continue
-
- file_mtime = file_path.stat().st_mtime
- if file_mtime < cutoff_time:
- try:
- file_size = file_path.stat().st_size
- file_path.unlink()
- deleted_count += 1
- total_size_deleted += file_size
- except Exception as e:
- log.error(f"Failed to delete audio file {file_path}: {e}")
-
- except Exception as e:
- log.error(f"Error cleaning audio directory {audio_dir}: {e}")
-
- if deleted_count > 0:
- size_mb = total_size_deleted / (1024 * 1024)
- log.info(f"Deleted {deleted_count} audio cache files ({size_mb:.1f} MB), older than {max_age_days} days")
-
-
@router.post("/", response_model=bool)
async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
"""
Prunes old and orphaned data using a safe, multi-stage process.
-
+
Parameters:
- days: Optional[int] = None
- If None: Skip chat deletion entirely
@@ -424,69 +507,90 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
- If True: Delete notes from deleted users
- delete_orphaned_folders: bool = True
- If True: Delete folders from deleted users
- - audio_cache_max_age_days: Optional[int] = 30
- - If None: Skip audio cache cleanup
- - If >= 0: Delete audio cache files (TTS, STT) older than specified days
"""
try:
log.info("Starting data pruning process")
-
- # Stage 1: Delete old chats based on user criteria
+
+ # Stage 1: Delete old chats based on user criteria (optional)
if form_data.days is not None:
cutoff_time = int(time.time()) - (form_data.days * 86400)
chats_to_delete = []
-
+
for chat in Chats.get_chats():
if chat.updated_at < cutoff_time:
+ # Check exemption conditions
if form_data.exempt_archived_chats and chat.archived:
+ log.debug(f"Exempting archived chat: {chat.id}")
continue
- if form_data.exempt_chats_in_folders and (getattr(chat, 'folder_id', None) is not None or getattr(chat, 'pinned', False)):
+ if form_data.exempt_chats_in_folders and (
+ getattr(chat, "folder_id", None) is not None
+ or getattr(chat, "pinned", False)
+ ):
+ folder_status = (
+ f"folder_id: {getattr(chat, 'folder_id', None)}"
+ if getattr(chat, "folder_id", None)
+ else "not in folder"
+ )
+ pinned_status = f"pinned: {getattr(chat, 'pinned', False)}"
+ log.debug(
+ f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})"
+ )
continue
+ log.debug(
+ f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}"
+ )
chats_to_delete.append(chat)
-
+
if chats_to_delete:
- log.info(f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)")
+ log.info(
+ f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)"
+ )
for chat in chats_to_delete:
Chats.delete_chat_by_id(chat.id)
else:
log.info(f"No chats found older than {form_data.days} days")
else:
log.info("Skipping chat deletion (days parameter is None)")
-
- # Stage 2: Build preservation set
+
+ # Stage 2: Build ground truth of what should be preserved
log.info("Building preservation set")
-
+
+ # Get all active users
active_user_ids = {user.id for user in Users.get_users()["users"]}
log.info(f"Found {len(active_user_ids)} active users")
-
+
+ # Get all active knowledge bases and their file references
active_kb_ids = set()
knowledge_bases = Knowledges.get_knowledge_bases()
-
+
for kb in knowledge_bases:
if kb.user_id in active_user_ids:
active_kb_ids.add(kb.id)
-
+
log.info(f"Found {len(active_kb_ids)} active knowledge bases")
-
+
+ # Get all files that should be preserved (NOW COMPREHENSIVE!)
active_file_ids = get_active_file_ids()
-
+
# Stage 3: Delete orphaned database records
log.info("Deleting orphaned database records")
-
+
+ # Delete files not referenced by any knowledge base or belonging to deleted users
deleted_files = 0
for file_record in Files.get_files():
should_delete = (
- file_record.id not in active_file_ids or
- file_record.user_id not in active_user_ids
+ file_record.id not in active_file_ids
+ or file_record.user_id not in active_user_ids
)
-
+
if should_delete:
if safe_delete_file_by_id(file_record.id):
deleted_files += 1
-
+
if deleted_files > 0:
log.info(f"Deleted {deleted_files} orphaned files")
-
+
+ # Delete knowledge bases from deleted users (if enabled)
deleted_kbs = 0
if form_data.delete_orphaned_knowledge_bases:
for kb in knowledge_bases:
@@ -494,14 +598,16 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
if safe_delete_vector_collection(kb.id):
Knowledges.delete_knowledge_by_id(kb.id)
deleted_kbs += 1
-
+
if deleted_kbs > 0:
log.info(f"Deleted {deleted_kbs} orphaned knowledge bases")
else:
log.info("Skipping knowledge base deletion (disabled)")
-
+
+ # Delete other user-owned resources from deleted users (conditional)
deleted_others = 0
-
+
+ # Delete orphaned chats of deleted users (conditional)
if form_data.delete_orphaned_chats:
chats_deleted = 0
for chat in Chats.get_chats():
@@ -513,7 +619,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Deleted {chats_deleted} orphaned chats")
else:
log.info("Skipping orphaned chat deletion (disabled)")
-
+
+ # Delete orphaned tools of deleted users (conditional)
if form_data.delete_orphaned_tools:
tools_deleted = 0
for tool in Tools.get_tools():
@@ -525,7 +632,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Deleted {tools_deleted} orphaned tools")
else:
log.info("Skipping tool deletion (disabled)")
-
+
+ # Delete orphaned functions of deleted users (conditional)
if form_data.delete_orphaned_functions:
functions_deleted = 0
for function in Functions.get_functions():
@@ -537,7 +645,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Deleted {functions_deleted} orphaned functions")
else:
log.info("Skipping function deletion (disabled)")
-
+
+ # Delete orphaned notes of deleted users (conditional)
if form_data.delete_orphaned_notes:
notes_deleted = 0
for note in Notes.get_notes():
@@ -549,7 +658,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Deleted {notes_deleted} orphaned notes")
else:
log.info("Skipping note deletion (disabled)")
-
+
+ # Delete orphaned prompts of deleted users (conditional)
if form_data.delete_orphaned_prompts:
prompts_deleted = 0
for prompt in Prompts.get_prompts():
@@ -561,7 +671,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Deleted {prompts_deleted} orphaned prompts")
else:
log.info("Skipping prompt deletion (disabled)")
-
+
+ # Delete orphaned models of deleted users (conditional)
if form_data.delete_orphaned_models:
models_deleted = 0
for model in Models.get_all_models():
@@ -573,57 +684,65 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Deleted {models_deleted} orphaned models")
else:
log.info("Skipping model deletion (disabled)")
-
+
+ # Delete orphaned folders of deleted users (conditional)
if form_data.delete_orphaned_folders:
folders_deleted = 0
for folder in Folders.get_all_folders():
if folder.user_id not in active_user_ids:
- Folders.delete_folder_by_id_and_user_id(folder.id, folder.user_id, delete_chats=False)
+ Folders.delete_folder_by_id_and_user_id(
+ folder.id, folder.user_id, delete_chats=False
+ )
folders_deleted += 1
deleted_others += 1
if folders_deleted > 0:
log.info(f"Deleted {folders_deleted} orphaned folders")
else:
log.info("Skipping folder deletion (disabled)")
-
+
if deleted_others > 0:
log.info(f"Total other orphaned records deleted: {deleted_others}")
-
+
# Stage 4: Clean up orphaned physical files
log.info("Cleaning up orphaned physical files")
-
+
+ # Rebuild active sets after database cleanup
final_active_file_ids = get_active_file_ids()
final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()}
-
+
+ # Clean uploads directory
cleanup_orphaned_uploads(final_active_file_ids)
+
+ # Clean vector collections
cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)
-
- # Stage 5: Audio cache cleanup
- log.info("Cleaning audio cache")
- cleanup_audio_cache(form_data.audio_cache_max_age_days)
-
- # Stage 6: Database optimization
+
+ # Stage 5: Database optimization
log.info("Optimizing database")
-
+
+ # Vacuum main database
try:
with get_db() as db:
db.execute(text("VACUUM"))
+ log.debug("Vacuumed main database")
except Exception as e:
log.error(f"Failed to vacuum main database: {e}")
-
+
+ # Vacuum ChromaDB database if it exists
if "chroma" in VECTOR_DB.lower():
chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
if chroma_db_path.exists():
try:
import sqlite3
+
with sqlite3.connect(str(chroma_db_path)) as conn:
conn.execute("VACUUM")
+ log.debug("Vacuumed ChromaDB database")
except Exception as e:
log.error(f"Failed to vacuum ChromaDB database: {e}")
-
+
log.info("Data pruning completed successfully")
return True
-
+
except Exception as e:
log.exception(f"Error during data pruning: {e}")
raise HTTPException(
From 482030ff6970ed344690b79df884be1e09ba7d2a Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 14:56:44 +0200
Subject: [PATCH 10/43] Update prune.py
---
backend/open_webui/routers/prune.py | 207 +++++++++-------------------
1 file changed, 67 insertions(+), 140 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index 427c9586bd..da08037046 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -38,7 +38,6 @@ class PruneDataForm(BaseModel):
days: Optional[int] = None
exempt_archived_chats: bool = False
exempt_chats_in_folders: bool = False
- # Orphaned resource deletion toggles (for deleted users)
delete_orphaned_chats: bool = True
delete_orphaned_tools: bool = False
delete_orphaned_functions: bool = False
@@ -47,17 +46,17 @@ class PruneDataForm(BaseModel):
delete_orphaned_models: bool = True
delete_orphaned_notes: bool = True
delete_orphaned_folders: bool = True
+ audio_cache_max_age_days: Optional[int] = 30
def get_active_file_ids() -> Set[str]:
"""
Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
- This is the ground truth for what files should be preserved.
"""
active_file_ids = set()
try:
- # 1. Get files referenced by knowledge bases (original logic)
+ # Scan knowledge bases for file references
knowledge_bases = Knowledges.get_knowledge_bases()
log.debug(f"Found {len(knowledge_bases)} knowledge bases")
@@ -65,15 +64,12 @@ def get_active_file_ids() -> Set[str]:
if not kb.data:
continue
- # Handle different possible data structures for file references
file_ids = []
- # Check for file_ids array
if isinstance(kb.data, dict) and "file_ids" in kb.data:
if isinstance(kb.data["file_ids"], list):
file_ids.extend(kb.data["file_ids"])
- # Check for files array with id field
if isinstance(kb.data, dict) and "files" in kb.data:
if isinstance(kb.data["files"], list):
for file_ref in kb.data["files"]:
@@ -82,13 +78,11 @@ def get_active_file_ids() -> Set[str]:
elif isinstance(file_ref, str):
file_ids.append(file_ref)
- # Add all found file IDs
for file_id in file_ids:
if isinstance(file_id, str) and file_id.strip():
active_file_ids.add(file_id.strip())
- log.debug(f"KB {kb.id} references file {file_id}")
- # 2. Get files referenced in chats (NEW: scan chat JSON for file references)
+ # Scan chats for file references
chats = Chats.get_chats()
log.debug(f"Found {len(chats)} chats to scan for file references")
@@ -97,44 +91,35 @@ def get_active_file_ids() -> Set[str]:
continue
try:
- # Convert entire chat JSON to string and extract all file IDs
chat_json_str = json.dumps(chat.chat)
- # Find all file ID patterns in the JSON
- # Pattern 1: "id": "uuid" where uuid looks like a file ID
+ # Extract file IDs using regex patterns
file_id_pattern = re.compile(
r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
)
- potential_file_ids = file_id_pattern.findall(chat_json_str)
-
- # Pattern 2: URLs containing /api/v1/files/uuid
url_pattern = re.compile(
r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
)
+
+ potential_file_ids = file_id_pattern.findall(chat_json_str)
url_file_ids = url_pattern.findall(chat_json_str)
- # Combine and validate against actual file records
all_potential_ids = set(potential_file_ids + url_file_ids)
for file_id in all_potential_ids:
- # Verify this ID exists in the file table to avoid false positives
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
- log.debug(f"Chat {chat.id}: Found active file {file_id}")
except Exception as e:
log.debug(f"Error processing chat {chat.id} for file references: {e}")
- # 3. Get files referenced in folders (scan folder.items, folder.data, folder.meta)
+ # Scan folders for file references
try:
folders = Folders.get_all_folders()
- log.debug(f"Found {len(folders)} folders to scan for file references")
for folder in folders:
- # Check folder.items JSON
if folder.items:
try:
items_str = json.dumps(folder.items)
- # Look for file ID patterns in the JSON
file_id_pattern = re.compile(
r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
)
@@ -148,13 +133,9 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
- log.debug(
- f"Folder {folder.id}: Found file {file_id} in items"
- )
except Exception as e:
log.debug(f"Error processing folder {folder.id} items: {e}")
- # Check folder.data JSON
if hasattr(folder, "data") and folder.data:
try:
data_str = json.dumps(folder.data)
@@ -171,28 +152,22 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
- log.debug(
- f"Folder {folder.id}: Found file {file_id} in data"
- )
except Exception as e:
log.debug(f"Error processing folder {folder.id} data: {e}")
except Exception as e:
log.debug(f"Error scanning folders for file references: {e}")
- # 4. Get files referenced in standalone messages (message table)
+ # Scan standalone messages for file references
try:
- # Query message table directly since we may not have a Messages model
with get_db() as db:
message_results = db.execute(
text("SELECT id, data FROM message WHERE data IS NOT NULL")
).fetchall()
- log.debug(f"Found {len(message_results)} messages with data to scan")
for message_id, message_data_json in message_results:
if message_data_json:
try:
- # Convert JSON to string and scan for file patterns
data_str = (
json.dumps(message_data_json)
if isinstance(message_data_json, dict)
@@ -212,9 +187,6 @@ def get_active_file_ids() -> Set[str]:
for file_id in potential_ids:
if Files.get_file_by_id(file_id):
active_file_ids.add(file_id)
- log.debug(
- f"Message {message_id}: Found file {file_id}"
- )
except Exception as e:
log.debug(
f"Error processing message {message_id} data: {e}"
@@ -224,7 +196,6 @@ def get_active_file_ids() -> Set[str]:
except Exception as e:
log.error(f"Error determining active file IDs: {e}")
- # Fail safe: return empty set, which will prevent deletion
return set()
log.info(f"Found {len(active_file_ids)} active file IDs")
@@ -236,19 +207,15 @@ def safe_delete_vector_collection(collection_name: str) -> bool:
Safely delete a vector collection, handling both logical and physical cleanup.
"""
try:
- # First, try to delete the collection through the client
try:
VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
- log.debug(f"Deleted collection from vector DB: {collection_name}")
except Exception as e:
log.debug(f"Collection {collection_name} may not exist in DB: {e}")
- # Then, handle physical cleanup for ChromaDB
if "chroma" in VECTOR_DB.lower():
vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name
if vector_dir.exists() and vector_dir.is_dir():
shutil.rmtree(vector_dir)
- log.debug(f"Deleted physical vector directory: {vector_dir}")
return True
return True
@@ -263,19 +230,14 @@ def safe_delete_file_by_id(file_id: str) -> bool:
Safely delete a file record and its associated vector collection.
"""
try:
- # Get file info before deletion
file_record = Files.get_file_by_id(file_id)
if not file_record:
- log.debug(f"File {file_id} not found in database")
- return True # Already gone
+ return True
- # Delete vector collection first
collection_name = f"file-{file_id}"
safe_delete_vector_collection(collection_name)
- # Delete database record
Files.delete_file_by_id(file_id)
- log.debug(f"Deleted file record: {file_id}")
return True
@@ -290,7 +252,6 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
"""
upload_dir = Path(CACHE_DIR).parent / "uploads"
if not upload_dir.exists():
- log.debug("Uploads directory does not exist")
return
deleted_count = 0
@@ -301,33 +262,27 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
continue
filename = file_path.name
-
- # Extract file ID from filename (common patterns)
file_id = None
- # Pattern 1: UUID_filename or UUID-filename
+ # Extract file ID from filename patterns
if len(filename) > 36:
potential_id = filename[:36]
- if potential_id.count("-") == 4: # UUID format
+ if potential_id.count("-") == 4:
file_id = potential_id
- # Pattern 2: filename might be the file ID itself
if not file_id and filename.count("-") == 4 and len(filename) == 36:
file_id = filename
- # Pattern 3: Check if any part of filename matches active IDs
if not file_id:
for active_id in active_file_ids:
if active_id in filename:
file_id = active_id
break
- # If we found a potential file ID and it's not active, delete it
if file_id and file_id not in active_file_ids:
try:
file_path.unlink()
deleted_count += 1
- log.debug(f"Deleted orphaned upload file: {filename}")
except Exception as e:
log.error(f"Failed to delete upload file {filename}: {e}")
@@ -349,84 +304,50 @@ def cleanup_orphaned_vector_collections(
vector_dir = Path(CACHE_DIR).parent / "vector_db"
if not vector_dir.exists():
- log.debug("Vector DB directory does not exist")
return
chroma_db_path = vector_dir / "chroma.sqlite3"
if not chroma_db_path.exists():
- log.debug("ChromaDB metadata file does not exist")
return
- # Build expected collection names
expected_collections = set()
- # File collections: file-{file_id}
for file_id in active_file_ids:
expected_collections.add(f"file-{file_id}")
- # Knowledge base collections: {kb_id}
for kb_id in active_kb_ids:
expected_collections.add(kb_id)
- log.debug(f"Expected collections to preserve: {expected_collections}")
-
- # Query ChromaDB metadata to get the complete mapping chain:
- # Directory UUID -> Collection ID -> Collection Name
uuid_to_collection = {}
try:
import sqlite3
- log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}")
-
with sqlite3.connect(str(chroma_db_path)) as conn:
- # First, check what tables exist
- tables = conn.execute(
- "SELECT name FROM sqlite_master WHERE type='table'"
- ).fetchall()
- log.debug(f"ChromaDB tables: {tables}")
-
- # Check the schema of collections table
- schema = conn.execute("PRAGMA table_info(collections)").fetchall()
- log.debug(f"Collections table schema: {schema}")
-
- # Get Collection ID -> Collection Name mapping
collection_id_to_name = {}
cursor = conn.execute("SELECT id, name FROM collections")
rows = cursor.fetchall()
- log.debug(f"Raw ChromaDB collections query results: {rows}")
for row in rows:
collection_id, collection_name = row
collection_id_to_name[collection_id] = collection_name
- log.debug(
- f"Mapped collection ID {collection_id} -> name {collection_name}"
- )
- # Get Directory UUID -> Collection ID mapping from segments table
- # Only interested in VECTOR segments as those are the actual data directories
cursor = conn.execute(
"SELECT id, collection FROM segments WHERE scope = 'VECTOR'"
)
segment_rows = cursor.fetchall()
- log.debug(f"Raw ChromaDB segments query results: {segment_rows}")
for row in segment_rows:
segment_id, collection_id = row
if collection_id in collection_id_to_name:
collection_name = collection_id_to_name[collection_id]
uuid_to_collection[segment_id] = collection_name
- log.debug(
- f"Mapped directory UUID {segment_id} -> collection {collection_name}"
- )
- log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}")
log.info(
f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata"
)
except Exception as e:
log.error(f"Error reading ChromaDB metadata: {e}")
- # Fail safe: don't delete anything if we can't read metadata
return
deleted_count = 0
@@ -438,16 +359,12 @@ def cleanup_orphaned_vector_collections(
dir_uuid = collection_dir.name
- # Skip system/metadata files
if dir_uuid.startswith("."):
continue
- # Get the actual collection name from metadata
collection_name = uuid_to_collection.get(dir_uuid)
if collection_name is None:
- # Directory exists but no metadata entry - it's orphaned
- log.debug(f"Directory {dir_uuid} has no metadata entry, deleting")
try:
shutil.rmtree(collection_dir)
deleted_count += 1
@@ -455,20 +372,12 @@ def cleanup_orphaned_vector_collections(
log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}")
elif collection_name not in expected_collections:
- # Collection exists but should be deleted
- log.debug(
- f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting"
- )
try:
shutil.rmtree(collection_dir)
deleted_count += 1
except Exception as e:
log.error(f"Failed to delete collection directory {dir_uuid}: {e}")
- else:
- # Collection should be preserved
- log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})")
-
except Exception as e:
log.error(f"Error cleaning vector collections: {e}")
@@ -476,6 +385,52 @@ def cleanup_orphaned_vector_collections(
log.info(f"Deleted {deleted_count} orphaned vector collections")
+def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
+ """
+ Clean up audio cache files older than specified days.
+ """
+ if max_age_days is None:
+ log.info("Skipping audio cache cleanup (max_age_days is None)")
+ return
+
+ cutoff_time = time.time() - (max_age_days * 86400)
+ deleted_count = 0
+ total_size_deleted = 0
+
+ audio_dirs = [
+ Path(CACHE_DIR) / "audio" / "speech",
+ Path(CACHE_DIR) / "audio" / "transcriptions",
+ ]
+
+ for audio_dir in audio_dirs:
+ if not audio_dir.exists():
+ continue
+
+ try:
+ for file_path in audio_dir.iterdir():
+ if not file_path.is_file():
+ continue
+
+ file_mtime = file_path.stat().st_mtime
+ if file_mtime < cutoff_time:
+ try:
+ file_size = file_path.stat().st_size
+ file_path.unlink()
+ deleted_count += 1
+ total_size_deleted += file_size
+ except Exception as e:
+ log.error(f"Failed to delete audio file {file_path}: {e}")
+
+ except Exception as e:
+ log.error(f"Error cleaning audio directory {audio_dir}: {e}")
+
+ if deleted_count > 0:
+ size_mb = total_size_deleted / (1024 * 1024)
+ log.info(
+ f"Deleted {deleted_count} audio cache files ({size_mb:.1f} MB), older than {max_age_days} days"
+ )
+
+
@router.post("/", response_model=bool)
async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
"""
@@ -507,38 +462,27 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
- If True: Delete notes from deleted users
- delete_orphaned_folders: bool = True
- If True: Delete folders from deleted users
+ - audio_cache_max_age_days: Optional[int] = 30
+ - If None: Skip audio cache cleanup
+ - If >= 0: Delete audio cache files (TTS, STT) older than specified days
"""
try:
log.info("Starting data pruning process")
- # Stage 1: Delete old chats based on user criteria (optional)
+ # Stage 1: Delete old chats based on user criteria
if form_data.days is not None:
cutoff_time = int(time.time()) - (form_data.days * 86400)
chats_to_delete = []
for chat in Chats.get_chats():
if chat.updated_at < cutoff_time:
- # Check exemption conditions
if form_data.exempt_archived_chats and chat.archived:
- log.debug(f"Exempting archived chat: {chat.id}")
continue
if form_data.exempt_chats_in_folders and (
getattr(chat, "folder_id", None) is not None
or getattr(chat, "pinned", False)
):
- folder_status = (
- f"folder_id: {getattr(chat, 'folder_id', None)}"
- if getattr(chat, "folder_id", None)
- else "not in folder"
- )
- pinned_status = f"pinned: {getattr(chat, 'pinned', False)}"
- log.debug(
- f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})"
- )
continue
- log.debug(
- f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}"
- )
chats_to_delete.append(chat)
if chats_to_delete:
@@ -552,14 +496,12 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping chat deletion (days parameter is None)")
- # Stage 2: Build ground truth of what should be preserved
+ # Stage 2: Build preservation set
log.info("Building preservation set")
- # Get all active users
active_user_ids = {user.id for user in Users.get_users()["users"]}
log.info(f"Found {len(active_user_ids)} active users")
- # Get all active knowledge bases and their file references
active_kb_ids = set()
knowledge_bases = Knowledges.get_knowledge_bases()
@@ -569,13 +511,11 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Found {len(active_kb_ids)} active knowledge bases")
- # Get all files that should be preserved (NOW COMPREHENSIVE!)
active_file_ids = get_active_file_ids()
# Stage 3: Delete orphaned database records
log.info("Deleting orphaned database records")
- # Delete files not referenced by any knowledge base or belonging to deleted users
deleted_files = 0
for file_record in Files.get_files():
should_delete = (
@@ -590,7 +530,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
if deleted_files > 0:
log.info(f"Deleted {deleted_files} orphaned files")
- # Delete knowledge bases from deleted users (if enabled)
deleted_kbs = 0
if form_data.delete_orphaned_knowledge_bases:
for kb in knowledge_bases:
@@ -604,10 +543,8 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping knowledge base deletion (disabled)")
- # Delete other user-owned resources from deleted users (conditional)
deleted_others = 0
- # Delete orphaned chats of deleted users (conditional)
if form_data.delete_orphaned_chats:
chats_deleted = 0
for chat in Chats.get_chats():
@@ -620,7 +557,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping orphaned chat deletion (disabled)")
- # Delete orphaned tools of deleted users (conditional)
if form_data.delete_orphaned_tools:
tools_deleted = 0
for tool in Tools.get_tools():
@@ -633,7 +569,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping tool deletion (disabled)")
- # Delete orphaned functions of deleted users (conditional)
if form_data.delete_orphaned_functions:
functions_deleted = 0
for function in Functions.get_functions():
@@ -646,7 +581,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping function deletion (disabled)")
- # Delete orphaned notes of deleted users (conditional)
if form_data.delete_orphaned_notes:
notes_deleted = 0
for note in Notes.get_notes():
@@ -659,7 +593,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping note deletion (disabled)")
- # Delete orphaned prompts of deleted users (conditional)
if form_data.delete_orphaned_prompts:
prompts_deleted = 0
for prompt in Prompts.get_prompts():
@@ -672,7 +605,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping prompt deletion (disabled)")
- # Delete orphaned models of deleted users (conditional)
if form_data.delete_orphaned_models:
models_deleted = 0
for model in Models.get_all_models():
@@ -685,7 +617,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
else:
log.info("Skipping model deletion (disabled)")
- # Delete orphaned folders of deleted users (conditional)
if form_data.delete_orphaned_folders:
folders_deleted = 0
for folder in Folders.get_all_folders():
@@ -706,28 +637,25 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
# Stage 4: Clean up orphaned physical files
log.info("Cleaning up orphaned physical files")
- # Rebuild active sets after database cleanup
final_active_file_ids = get_active_file_ids()
final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()}
- # Clean uploads directory
cleanup_orphaned_uploads(final_active_file_ids)
-
- # Clean vector collections
cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)
- # Stage 5: Database optimization
+ # Stage 5: Audio cache cleanup
+ log.info("Cleaning audio cache")
+ cleanup_audio_cache(form_data.audio_cache_max_age_days)
+
+ # Stage 6: Database optimization
log.info("Optimizing database")
- # Vacuum main database
try:
with get_db() as db:
db.execute(text("VACUUM"))
- log.debug("Vacuumed main database")
except Exception as e:
log.error(f"Failed to vacuum main database: {e}")
- # Vacuum ChromaDB database if it exists
if "chroma" in VECTOR_DB.lower():
chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
if chroma_db_path.exists():
@@ -736,7 +664,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
with sqlite3.connect(str(chroma_db_path)) as conn:
conn.execute("VACUUM")
- log.debug("Vacuumed ChromaDB database")
except Exception as e:
log.error(f"Failed to vacuum ChromaDB database: {e}")
From 2818b4643aa28ea8a41d57f4b71b55ff25582839 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 14:58:33 +0200
Subject: [PATCH 11/43] Update folders.py
---
backend/open_webui/models/folders.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/backend/open_webui/models/folders.py b/backend/open_webui/models/folders.py
index 8b631f88de..b597074e81 100644
--- a/backend/open_webui/models/folders.py
+++ b/backend/open_webui/models/folders.py
@@ -137,7 +137,9 @@ class FolderTable:
def get_all_folders(self) -> list[FolderModel]:
with get_db() as db:
- return [FolderModel.model_validate(folder) for folder in db.query(Folder).all()]
+ return [
+ FolderModel.model_validate(folder) for folder in db.query(Folder).all()
+ ]
def get_folder_by_parent_id_and_user_id_and_name(
self, parent_id: Optional[str], user_id: str, name: str
From adda47ab04b4eaea43b2d3656d78aec1c85f15d4 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 12 Aug 2025 22:06:10 +0200
Subject: [PATCH 12/43] move import
---
backend/open_webui/routers/prune.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index da08037046..05318f9be9 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -4,6 +4,7 @@ import os
import shutil
import json
import re
+import sqlite3
from typing import Optional, Set
from pathlib import Path
@@ -320,7 +321,6 @@ def cleanup_orphaned_vector_collections(
uuid_to_collection = {}
try:
- import sqlite3
with sqlite3.connect(str(chroma_db_path)) as conn:
collection_id_to_name = {}
@@ -660,7 +660,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
if chroma_db_path.exists():
try:
- import sqlite3
with sqlite3.connect(str(chroma_db_path)) as conn:
conn.execute("VACUUM")
From 4e6e5819a69d669800b8f0a8b080164a51196b00 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Fri, 22 Aug 2025 15:38:41 +0200
Subject: [PATCH 13/43] Update prune.ts
---
src/lib/apis/prune.ts | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/lib/apis/prune.ts b/src/lib/apis/prune.ts
index 8413ca24c0..63c251b801 100644
--- a/src/lib/apis/prune.ts
+++ b/src/lib/apis/prune.ts
@@ -13,7 +13,10 @@ export const pruneData = async (
delete_orphaned_models: boolean = true,
delete_orphaned_notes: boolean = true,
delete_orphaned_folders: boolean = true,
- audio_cache_max_age_days: number | null = 30
+ audio_cache_max_age_days: number | null = 30,
+ delete_inactive_users_days: number | null = null,
+ exempt_admin_users: boolean = true,
+ exempt_pending_users: boolean = true
) => {
let error = null;
@@ -35,7 +38,10 @@ export const pruneData = async (
delete_orphaned_models,
delete_orphaned_notes,
delete_orphaned_folders,
- audio_cache_max_age_days
+ audio_cache_max_age_days,
+ delete_inactive_users_days,
+ exempt_admin_users,
+ exempt_pending_users
})
})
.then(async (res) => {
From daed47db03ed9de681c59cfb980db2561166e2b4 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Fri, 22 Aug 2025 15:38:53 +0200
Subject: [PATCH 14/43] Update Database.svelte
---
.../components/admin/Settings/Database.svelte | 28 +++++++++++--------
1 file changed, 17 insertions(+), 11 deletions(-)
diff --git a/src/lib/components/admin/Settings/Database.svelte b/src/lib/components/admin/Settings/Database.svelte
index 6139ecc7af..2a8f221aa5 100644
--- a/src/lib/components/admin/Settings/Database.svelte
+++ b/src/lib/components/admin/Settings/Database.svelte
@@ -37,7 +37,10 @@
delete_orphaned_models,
delete_orphaned_notes,
delete_orphaned_folders,
- audio_cache_max_age_days
+ audio_cache_max_age_days,
+ delete_inactive_users_days,
+ exempt_admin_users,
+ exempt_pending_users
} = event.detail;
const res = await pruneData(
@@ -53,7 +56,10 @@
delete_orphaned_models,
delete_orphaned_notes,
delete_orphaned_folders,
- audio_cache_max_age_days
+ audio_cache_max_age_days,
+ delete_inactive_users_days,
+ exempt_admin_users,
+ exempt_pending_users
).catch((error) => {
toast.error(`${error}`);
return null;
@@ -265,15 +271,15 @@
-
-
-
- {$i18n.t('Export Users')}
-
-
- {/if}
+ clip-rule="evenodd"
+ />
+
+
+
+ {$i18n.t('Export Users')}
+
+
+ {/if}
Date: Fri, 22 Aug 2025 15:42:05 +0200
Subject: [PATCH 15/43] Update PruneDataDialog.svelte
---
.../components/common/PruneDataDialog.svelte | 182 +++++++++++++++++-
1 file changed, 176 insertions(+), 6 deletions(-)
diff --git a/src/lib/components/common/PruneDataDialog.svelte b/src/lib/components/common/PruneDataDialog.svelte
index 1dd11e984a..9f9a8fa9b0 100644
--- a/src/lib/components/common/PruneDataDialog.svelte
+++ b/src/lib/components/common/PruneDataDialog.svelte
@@ -12,6 +12,12 @@
let exempt_archived_chats = true;
let exempt_chats_in_folders = false;
+ // Inactive user deletion
+ let deleteInactiveUsers = false;
+ let delete_inactive_users_days = 90;
+ let exempt_admin_users = true;
+ let exempt_pending_users = true;
+
// Orphaned resource deletion toggles
let delete_orphaned_chats = true;
let delete_orphaned_tools = false;
@@ -27,8 +33,8 @@
let audio_cache_max_age_days = 30;
let showDetailsExpanded = false;
- let activeDetailsTab = 'chats';
- let activeSettingsTab = 'chats';
+ let activeDetailsTab = 'users';
+ let activeSettingsTab = 'users';
let showApiPreview = false;
const dispatch = createEventDispatcher();
@@ -46,7 +52,10 @@
delete_orphaned_models,
delete_orphaned_notes,
delete_orphaned_folders,
- audio_cache_max_age_days: cleanupAudioCache ? audio_cache_max_age_days : null
+ audio_cache_max_age_days: cleanupAudioCache ? audio_cache_max_age_days : null,
+ delete_inactive_users_days: deleteInactiveUsers ? delete_inactive_users_days : null,
+ exempt_admin_users,
+ exempt_pending_users
});
show = false;
};
@@ -68,9 +77,15 @@ Authorization: Bearer
"delete_orphaned_models": ${delete_orphaned_models},
"delete_orphaned_notes": ${delete_orphaned_notes},
"delete_orphaned_folders": ${delete_orphaned_folders},
- "audio_cache_max_age_days": ${cleanupAudioCache ? audio_cache_max_age_days : null}
+ "audio_cache_max_age_days": ${cleanupAudioCache ? audio_cache_max_age_days : null},
+ "delete_inactive_users_days": ${deleteInactiveUsers ? delete_inactive_users_days : null},
+ "exempt_admin_users": ${exempt_admin_users},
+ "exempt_pending_users": ${exempt_pending_users}
}`;
+ // Warning for short inactive user deletion periods
+ $: shortUserDeletionWarning = deleteInactiveUsers && delete_inactive_users_days < 30;
+
const copyApiCall = () => {
navigator.clipboard.writeText(apiCallPreview).then(() => {
// Could add a toast notification here
@@ -146,6 +161,12 @@ Authorization: Bearer
+
activeDetailsTab = 'users'}
+ >
+ {$i18n.t('Users')}
+
activeDetailsTab = 'chats'}
@@ -180,7 +201,27 @@ Authorization: Bearer
- {#if activeDetailsTab === 'chats'}
+ {#if activeDetailsTab === 'users'}
+
+
{$i18n.t('Inactive User Account Deletion:')}
+
• {$i18n.t('Removes user accounts that have been inactive for a specified period based on their last activity timestamp')}
+
• {$i18n.t('When a user account is deleted, ALL associated data is permanently removed:')}
+
◦ {$i18n.t('All conversations and chat history')}
+
◦ {$i18n.t('All uploaded files and documents')}
+
◦ {$i18n.t('All custom models, prompts, tools, and functions')}
+
◦ {$i18n.t('All knowledge bases and vector embeddings')}
+
◦ {$i18n.t('All notes, folders, and workspace items')}
+
+
{$i18n.t('Safety Exemptions:')}
+
• {$i18n.t('Admin users: Can be exempted from deletion (recommended)')}
+
• {$i18n.t('Pending users: Can be exempted from deletion (recommended)')}
+
+
{$i18n.t('⚠️ CRITICAL WARNING:')}
+
• {$i18n.t('User deletion is irreversible and cascades to ALL user data')}
+
• {$i18n.t('This is the most destructive operation in the pruning system')}
+
• {$i18n.t('Always verify inactive periods and exemptions before use')}
+
+ {:else if activeDetailsTab === 'chats'}
{$i18n.t('Age-Based Chat Deletion:')}
• {$i18n.t('Removes conversations older than specified days based on when they were last modified or updated (not when they were created)')}
@@ -269,6 +310,12 @@ Authorization: Bearer
+
activeSettingsTab = 'users'}
+ >
+ {$i18n.t('Users')}
+
activeSettingsTab = 'chats'}
@@ -291,7 +338,130 @@ Authorization: Bearer
- {#if activeSettingsTab === 'chats'}
+ {#if activeSettingsTab === 'users'}
+
+
+
+
+
+
+
+
+
+
{$i18n.t('Delete inactive user accounts')}
+
+
+
+
+
+
{$i18n.t('⚠️ MOST DESTRUCTIVE OPERATION')}
+
+
{$i18n.t('Deleting users removes ALL their data:')}
+
• {$i18n.t('Chats, files, models, prompts')}
+
• {$i18n.t('Knowledge bases, tools, notes')}
+
• {$i18n.t('This action is irreversible!')}
+
+
+
+
+
+
+ {$i18n.t('Remove user accounts inactive for specified days')}
+
+
+
+
+
+
+ {#if deleteInactiveUsers}
+
+
+
+ {$i18n.t('Delete users inactive for more than')}
+
+
+
+ {$i18n.t('days')}
+
+
+ {$i18n.t('Based on last_active_at timestamp. Minimum 1 day.')}
+
+
+
+ {#if shortUserDeletionWarning}
+
+
+
+
+
+ {$i18n.t('⚠️ Warning: Deletion period less than 30 days!')}
+
+
+ {$i18n.t('Very short periods may accidentally delete active users. Consider using 30+ days for safety.')}
+
+
+
+
+ {/if}
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Exempt admin users')}
+
+
+ {$i18n.t('Never delete admin users (strongly recommended)')}
+
+
+
+
+
+
+
+
+
+
+
+
+ {$i18n.t('Exempt pending users')}
+
+
+ {$i18n.t('Never delete pending/unapproved users (recommended)')}
+
+
+
+
+
+
+
+ {$i18n.t('User Deletion Impact:')}
+
+
+
• {$i18n.t('Complete Data Loss:')} {$i18n.t('All user data is permanently deleted')}
+
• {$i18n.t('Cascading Effect:')} {$i18n.t('Removes chats, files, models, knowledge bases')}
+
• {$i18n.t('Irreversible:')} {$i18n.t('Cannot be undone - backup before use')}
+
+
+
+ {/if}
+
+
+ {:else if activeSettingsTab === 'chats'}
From 74bfead38b3801d60b16ad1e733191dc09baccc4 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Fri, 22 Aug 2025 15:42:56 +0200
Subject: [PATCH 16/43] Update prune.py
---
backend/open_webui/routers/prune.py | 75 +++++++++++++++++++++++++++++
1 file changed, 75 insertions(+)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index 05318f9be9..7cb0498523 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -48,6 +48,9 @@ class PruneDataForm(BaseModel):
delete_orphaned_notes: bool = True
delete_orphaned_folders: bool = True
audio_cache_max_age_days: Optional[int] = 30
+ delete_inactive_users_days: Optional[int] = None
+ exempt_admin_users: bool = True
+ exempt_pending_users: bool = True
def get_active_file_ids() -> Set[str]:
@@ -385,6 +388,55 @@ def cleanup_orphaned_vector_collections(
log.info(f"Deleted {deleted_count} orphaned vector collections")
+def delete_inactive_users(
+ inactive_days: int,
+ exempt_admin: bool = True,
+ exempt_pending: bool = True
+) -> int:
+ """
+ Delete users who have been inactive for the specified number of days.
+
+ Returns the number of users deleted.
+ """
+ if inactive_days is None:
+ return 0
+
+ cutoff_time = int(time.time()) - (inactive_days * 86400)
+ deleted_count = 0
+
+ try:
+ users_to_delete = []
+
+ # Get all users and check activity
+ all_users = Users.get_users()["users"]
+
+ for user in all_users:
+ # Skip if user is exempt
+ if exempt_admin and user.role == "admin":
+ continue
+ if exempt_pending and user.role == "pending":
+ continue
+
+ # Check if user is inactive based on last_active_at
+ if user.last_active_at < cutoff_time:
+ users_to_delete.append(user)
+
+ # Delete inactive users
+ for user in users_to_delete:
+ try:
+ # Delete the user - this will cascade to all their data
+ Users.delete_user_by_id(user.id)
+ deleted_count += 1
+ log.info(f"Deleted inactive user: {user.email} (last active: {user.last_active_at})")
+ except Exception as e:
+ log.error(f"Failed to delete user {user.id}: {e}")
+
+ except Exception as e:
+ log.error(f"Error during inactive user deletion: {e}")
+
+ return deleted_count
+
+
def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
"""
Clean up audio cache files older than specified days.
@@ -465,10 +517,33 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
- audio_cache_max_age_days: Optional[int] = 30
- If None: Skip audio cache cleanup
- If >= 0: Delete audio cache files (TTS, STT) older than specified days
+ - delete_inactive_users_days: Optional[int] = None
+ - If None: Skip inactive user deletion
+ - If >= 1: Delete users inactive for more than specified days
+ - exempt_admin_users: bool = True
+ - If True: Exempt admin users from deletion (recommended for safety)
+ - exempt_pending_users: bool = True
+ - If True: Exempt pending users from deletion (recommended for safety)
"""
try:
log.info("Starting data pruning process")
+ # Stage 0: Delete inactive users (if enabled)
+ deleted_users = 0
+ if form_data.delete_inactive_users_days is not None:
+ log.info(f"Deleting users inactive for more than {form_data.delete_inactive_users_days} days")
+ deleted_users = delete_inactive_users(
+ form_data.delete_inactive_users_days,
+ form_data.exempt_admin_users,
+ form_data.exempt_pending_users
+ )
+ if deleted_users > 0:
+ log.info(f"Deleted {deleted_users} inactive users")
+ else:
+ log.info("No inactive users found to delete")
+ else:
+ log.info("Skipping inactive user deletion (disabled)")
+
# Stage 1: Delete old chats based on user criteria
if form_data.days is not None:
cutoff_time = int(time.time()) - (form_data.days * 86400)
From 233167a041bc92baa24a2abb1ff23eba215a6c7d Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Fri, 22 Aug 2025 15:49:55 +0200
Subject: [PATCH 17/43] Update PruneDataDialog.svelte
---
.../components/common/PruneDataDialog.svelte | 26 ++-----------------
1 file changed, 2 insertions(+), 24 deletions(-)
diff --git a/src/lib/components/common/PruneDataDialog.svelte b/src/lib/components/common/PruneDataDialog.svelte
index 9f9a8fa9b0..fb9e711ef8 100644
--- a/src/lib/components/common/PruneDataDialog.svelte
+++ b/src/lib/components/common/PruneDataDialog.svelte
@@ -205,21 +205,10 @@ Authorization: Bearer
{$i18n.t('Inactive User Account Deletion:')}
• {$i18n.t('Removes user accounts that have been inactive for a specified period based on their last activity timestamp')}
-
• {$i18n.t('When a user account is deleted, ALL associated data is permanently removed:')}
-
◦ {$i18n.t('All conversations and chat history')}
-
◦ {$i18n.t('All uploaded files and documents')}
-
◦ {$i18n.t('All custom models, prompts, tools, and functions')}
-
◦ {$i18n.t('All knowledge bases and vector embeddings')}
-
◦ {$i18n.t('All notes, folders, and workspace items')}
{$i18n.t('Safety Exemptions:')}
• {$i18n.t('Admin users: Can be exempted from deletion (recommended)')}
-
• {$i18n.t('Pending users: Can be exempted from deletion (recommended)')}
-
-
{$i18n.t('⚠️ CRITICAL WARNING:')}
-
• {$i18n.t('User deletion is irreversible and cascades to ALL user data')}
-
• {$i18n.t('This is the most destructive operation in the pruning system')}
-
• {$i18n.t('Always verify inactive periods and exemptions before use')}
+
• {$i18n.t('Pending users: Can be exempted from deletion')}
{:else if activeDetailsTab === 'chats'}
@@ -441,22 +430,11 @@ Authorization: Bearer
{$i18n.t('Exempt pending users')}
- {$i18n.t('Never delete pending/unapproved users (recommended)')}
+ {$i18n.t('Never delete pending/unapproved users')}
-
-
-
- {$i18n.t('User Deletion Impact:')}
-
-
-
• {$i18n.t('Complete Data Loss:')} {$i18n.t('All user data is permanently deleted')}
-
• {$i18n.t('Cascading Effect:')} {$i18n.t('Removes chats, files, models, knowledge bases')}
-
• {$i18n.t('Irreversible:')} {$i18n.t('Cannot be undone - backup before use')}
-
-
{/if}
From 5aa93ab97d828e012dc8895c5ea20e0286788dea Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Fri, 22 Aug 2025 15:53:15 +0200
Subject: [PATCH 18/43] Update PruneDataDialog.svelte
---
.../components/common/PruneDataDialog.svelte | 19 ++-----------------
1 file changed, 2 insertions(+), 17 deletions(-)
diff --git a/src/lib/components/common/PruneDataDialog.svelte b/src/lib/components/common/PruneDataDialog.svelte
index fb9e711ef8..f16de00a31 100644
--- a/src/lib/components/common/PruneDataDialog.svelte
+++ b/src/lib/components/common/PruneDataDialog.svelte
@@ -336,23 +336,8 @@ Authorization: Bearer
-
-
{$i18n.t('Delete inactive user accounts')}
-
-
-
-
-
-
{$i18n.t('⚠️ MOST DESTRUCTIVE OPERATION')}
-
-
{$i18n.t('Deleting users removes ALL their data:')}
-
• {$i18n.t('Chats, files, models, prompts')}
-
• {$i18n.t('Knowledge bases, tools, notes')}
-
• {$i18n.t('This action is irreversible!')}
-
-
-
-
+
+ {$i18n.t('Delete inactive user accounts')}
{$i18n.t('Remove user accounts inactive for specified days')}
From 544f8b72dc210f6a67d7c24839b1680b98afd35a Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Fri, 22 Aug 2025 16:16:14 +0200
Subject: [PATCH 19/43] Update PruneDataDialog.svelte
---
.../components/common/PruneDataDialog.svelte | 88 +++++++++++++------
1 file changed, 62 insertions(+), 26 deletions(-)
diff --git a/src/lib/components/common/PruneDataDialog.svelte b/src/lib/components/common/PruneDataDialog.svelte
index f16de00a31..8d5d910422 100644
--- a/src/lib/components/common/PruneDataDialog.svelte
+++ b/src/lib/components/common/PruneDataDialog.svelte
@@ -39,8 +39,8 @@
const dispatch = createEventDispatcher();
- const confirm = () => {
- dispatch('confirm', {
+ const preview = () => {
+ dispatch('preview', {
days: deleteChatsByAge ? days : null,
exempt_archived_chats,
exempt_chats_in_folders,
@@ -60,28 +60,64 @@
show = false;
};
- // Generate API call preview
- $: apiCallPreview = `POST /api/v1/admin/prune
-Content-Type: application/json
-Authorization: Bearer
+ // Generate API call preview with helpful comments
+ $: apiCallPreview = `# Open WebUI Data Pruning API Call
+# Use this template for automated maintenance scripts (cron jobs, etc.)
-{
- "days": ${deleteChatsByAge ? days : null},
- "exempt_archived_chats": ${exempt_archived_chats},
- "exempt_chats_in_folders": ${exempt_chats_in_folders},
- "delete_orphaned_chats": ${delete_orphaned_chats},
- "delete_orphaned_tools": ${delete_orphaned_tools},
- "delete_orphaned_functions": ${delete_orphaned_functions},
- "delete_orphaned_prompts": ${delete_orphaned_prompts},
- "delete_orphaned_knowledge_bases": ${delete_orphaned_knowledge_bases},
- "delete_orphaned_models": ${delete_orphaned_models},
- "delete_orphaned_notes": ${delete_orphaned_notes},
- "delete_orphaned_folders": ${delete_orphaned_folders},
- "audio_cache_max_age_days": ${cleanupAudioCache ? audio_cache_max_age_days : null},
- "delete_inactive_users_days": ${deleteInactiveUsers ? delete_inactive_users_days : null},
- "exempt_admin_users": ${exempt_admin_users},
- "exempt_pending_users": ${exempt_pending_users}
-}`;
+# AUTHENTICATION: Use API Key (not JWT token) for automation
+# Get your API key from: Settings → Account → API Key → Generate new key
+# Format: sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+
+curl -X POST "${window.location.origin}/api/v1/prune/" \\
+ -H "Content-Type: application/json" \\
+ -H "Authorization: Bearer " \\
+ -d '{
+ // SAFETY: Always test with dry_run=true first to preview results
+ "dry_run": false,
+
+ // AGE-BASED CHAT DELETION (null = disabled)
+ "days": ${deleteChatsByAge ? days : null},
+ "exempt_archived_chats": ${exempt_archived_chats}, // Keep archived chats even if old
+ "exempt_chats_in_folders": ${exempt_chats_in_folders}, // Keep organized/pinned chats
+
+ // INACTIVE USER DELETION (null = disabled, VERY DESTRUCTIVE)
+ "delete_inactive_users_days": ${deleteInactiveUsers ? delete_inactive_users_days : null},
+ "exempt_admin_users": ${exempt_admin_users}, // Strongly recommended: true
+ "exempt_pending_users": ${exempt_pending_users}, // Recommended for user approval workflows
+
+ // ORPHANED DATA CLEANUP (from deleted users)
+ "delete_orphaned_chats": ${delete_orphaned_chats},
+ "delete_orphaned_tools": ${delete_orphaned_tools},
+ "delete_orphaned_functions": ${delete_orphaned_functions}, // Actions, Pipes, Filters
+ "delete_orphaned_prompts": ${delete_orphaned_prompts},
+ "delete_orphaned_knowledge_bases": ${delete_orphaned_knowledge_bases},
+ "delete_orphaned_models": ${delete_orphaned_models},
+ "delete_orphaned_notes": ${delete_orphaned_notes},
+ "delete_orphaned_folders": ${delete_orphaned_folders},
+
+ // AUDIO CACHE CLEANUP (null = disabled)
+ "audio_cache_max_age_days": ${cleanupAudioCache ? audio_cache_max_age_days : null} // TTS/STT files
+ }'
+
+# API KEY vs JWT TOKEN:
+# - API Key: Persistent, use for automation (sk-xxxxxxxx...)
+# - JWT Token: Session-bound, temporary, use for web UI only
+# - ALWAYS use API Key for scripts/cron jobs
+
+# AUTOMATION TIPS:
+# 1. Run with dry_run=true first to preview what will be deleted
+# 2. Schedule during low-usage hours to minimize performance impact
+# 3. Monitor logs: tail -f /path/to/open-webui/logs
+# 4. Consider database backup before large cleanup operations
+# 5. Test on staging environment with similar data size first
+
+# EXAMPLE CRON JOB (runs weekly on Sunday at 2 AM):
+# 0 2 * * 0 /path/to/your/prune-script.sh >> /var/log/openwebui-prune.log 2>&1
+
+# RESPONSE HANDLING:
+# - dry_run=true: Returns counts object with preview numbers
+# - dry_run=false: Returns true on success, throws error on failure
+# - Always check HTTP status code and response for errors`;
// Warning for short inactive user deletion periods
$: shortUserDeletionWarning = deleteInactiveUsers && delete_inactive_users_days < 30;
@@ -778,10 +814,10 @@ Authorization: Bearer
{$i18n.t('Cancel')}
- {$i18n.t('Prune Data')}
+ {$i18n.t('Preview')}
From 7abcc7bc590cbac7839d4554034963feeb516828 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Fri, 22 Aug 2025 16:16:31 +0200
Subject: [PATCH 20/43] Update Database.svelte
---
.../components/admin/Settings/Database.svelte | 238 +++++++++++++++---
1 file changed, 204 insertions(+), 34 deletions(-)
diff --git a/src/lib/components/admin/Settings/Database.svelte b/src/lib/components/admin/Settings/Database.svelte
index 2a8f221aa5..ea2de29e4d 100644
--- a/src/lib/components/admin/Settings/Database.svelte
+++ b/src/lib/components/admin/Settings/Database.svelte
@@ -17,6 +17,10 @@
export let saveHandler: Function;
let showPruneDataDialog = false;
+ let showPreviewResults = false;
+ let previewResults = null;
+ let lastPruneSettings = null;
+
const exportAllUserChats = async () => {
let blob = new Blob([JSON.stringify(await getAllUserChats(localStorage.token))], {
type: 'application/json'
@@ -24,48 +28,70 @@
saveAs(blob, `all-chats-export-${Date.now()}.json`);
};
- const handlePruneDataConfirm = async (event) => {
- const {
- days,
- exempt_archived_chats,
- exempt_chats_in_folders,
- delete_orphaned_chats,
- delete_orphaned_tools,
- delete_orphaned_functions,
- delete_orphaned_prompts,
- delete_orphaned_knowledge_bases,
- delete_orphaned_models,
- delete_orphaned_notes,
- delete_orphaned_folders,
- audio_cache_max_age_days,
- delete_inactive_users_days,
- exempt_admin_users,
- exempt_pending_users
- } = event.detail;
+ const handlePruneDataPreview = async (event) => {
+ const settings = event.detail;
+ lastPruneSettings = settings;
const res = await pruneData(
localStorage.token,
- days,
- exempt_archived_chats,
- exempt_chats_in_folders,
- delete_orphaned_chats,
- delete_orphaned_tools,
- delete_orphaned_functions,
- delete_orphaned_prompts,
- delete_orphaned_knowledge_bases,
- delete_orphaned_models,
- delete_orphaned_notes,
- delete_orphaned_folders,
- audio_cache_max_age_days,
- delete_inactive_users_days,
- exempt_admin_users,
- exempt_pending_users
+ settings.days,
+ settings.exempt_archived_chats,
+ settings.exempt_chats_in_folders,
+ settings.delete_orphaned_chats,
+ settings.delete_orphaned_tools,
+ settings.delete_orphaned_functions,
+ settings.delete_orphaned_prompts,
+ settings.delete_orphaned_knowledge_bases,
+ settings.delete_orphaned_models,
+ settings.delete_orphaned_notes,
+ settings.delete_orphaned_folders,
+ settings.audio_cache_max_age_days,
+ settings.delete_inactive_users_days,
+ settings.exempt_admin_users,
+ settings.exempt_pending_users,
+ true // dry_run = true for preview
).catch((error) => {
toast.error(`${error}`);
return null;
});
+
+ if (res) {
+ previewResults = res;
+ showPreviewResults = true;
+ }
+ };
+
+ const handleConfirmPrune = async () => {
+ if (!lastPruneSettings) return;
+
+ const res = await pruneData(
+ localStorage.token,
+ lastPruneSettings.days,
+ lastPruneSettings.exempt_archived_chats,
+ lastPruneSettings.exempt_chats_in_folders,
+ lastPruneSettings.delete_orphaned_chats,
+ lastPruneSettings.delete_orphaned_tools,
+ lastPruneSettings.delete_orphaned_functions,
+ lastPruneSettings.delete_orphaned_prompts,
+ lastPruneSettings.delete_orphaned_knowledge_bases,
+ lastPruneSettings.delete_orphaned_models,
+ lastPruneSettings.delete_orphaned_notes,
+ lastPruneSettings.delete_orphaned_folders,
+ lastPruneSettings.audio_cache_max_age_days,
+ lastPruneSettings.delete_inactive_users_days,
+ lastPruneSettings.exempt_admin_users,
+ lastPruneSettings.exempt_pending_users,
+ false // dry_run = false for actual pruning
+ ).catch((error) => {
+ toast.error(`${error}`);
+ return null;
+ });
+
if (res) {
toast.success('Data pruned successfully');
+ showPreviewResults = false;
+ previewResults = null;
+ lastPruneSettings = null;
}
};
@@ -97,7 +123,151 @@
});
-
+
+{#if showPreviewResults && previewResults}
+
+
+
+
+ {$i18n.t('Pruning Preview Results')}
+
+
(showPreviewResults = false)}
+ >
+
+
+
+
+
+
+
+
+
+ {$i18n.t('The following items would be deleted:')}
+
+
+ {#if previewResults.inactive_users > 0}
+
+ {$i18n.t('Inactive users')}:
+ {previewResults.inactive_users}
+
+ {/if}
+ {#if previewResults.old_chats > 0}
+
+ {$i18n.t('Old chats')}:
+ {previewResults.old_chats}
+
+ {/if}
+ {#if previewResults.orphaned_chats > 0}
+
+ {$i18n.t('Orphaned chats')}:
+ {previewResults.orphaned_chats}
+
+ {/if}
+ {#if previewResults.orphaned_files > 0}
+
+ {$i18n.t('Orphaned files')}:
+ {previewResults.orphaned_files}
+
+ {/if}
+ {#if previewResults.orphaned_tools > 0}
+
+ {$i18n.t('Orphaned tools')}:
+ {previewResults.orphaned_tools}
+
+ {/if}
+ {#if previewResults.orphaned_functions > 0}
+
+ {$i18n.t('Orphaned functions')}:
+ {previewResults.orphaned_functions}
+
+ {/if}
+ {#if previewResults.orphaned_prompts > 0}
+
+ {$i18n.t('Orphaned prompts')}:
+ {previewResults.orphaned_prompts}
+
+ {/if}
+ {#if previewResults.orphaned_knowledge_bases > 0}
+
+ {$i18n.t('Orphaned knowledge bases')}:
+ {previewResults.orphaned_knowledge_bases}
+
+ {/if}
+ {#if previewResults.orphaned_models > 0}
+
+ {$i18n.t('Orphaned models')}:
+ {previewResults.orphaned_models}
+
+ {/if}
+ {#if previewResults.orphaned_notes > 0}
+
+ {$i18n.t('Orphaned notes')}:
+ {previewResults.orphaned_notes}
+
+ {/if}
+ {#if previewResults.orphaned_folders > 0}
+
+ {$i18n.t('Orphaned folders')}:
+ {previewResults.orphaned_folders}
+
+ {/if}
+ {#if previewResults.orphaned_uploads > 0}
+
+ {$i18n.t('Orphaned upload files')}:
+ {previewResults.orphaned_uploads}
+
+ {/if}
+ {#if previewResults.orphaned_vector_collections > 0}
+
+ {$i18n.t('Orphaned vector collections')}:
+ {previewResults.orphaned_vector_collections}
+
+ {/if}
+ {#if previewResults.audio_cache_files > 0}
+
+ {$i18n.t('Audio cache files')}:
+ {previewResults.audio_cache_files}
+
+ {/if}
+
+
+ {#if Object.values(previewResults).every(count => count === 0)}
+
+
+ {$i18n.t('No items would be deleted with current settings')}
+
+
+ {$i18n.t('Your system is already clean or no cleanup options are enabled')}
+
+
+ {/if}
+
+
+
+
+ (showPreviewResults = false)}
+ >
+ {$i18n.t('Cancel')}
+
+ {#if !Object.values(previewResults).every(count => count === 0)}
+
+ {$i18n.t('Prune Data')}
+
+ {/if}
+
+
+
+
+{/if}
+
+
@@ -744,6 +757,67 @@ curl -X POST "${window.location.origin}/api/v1/prune/" \\
{/if}
+
+ {:else if activeSettingsTab === 'system'}
+
+
+
+
+
+
+
+
+
+
{$i18n.t('Run VACUUM optimization')}
+
+
+
+
+
+
{$i18n.t('Database Optimization Warning:')}
+
+
{$i18n.t('VACUUM reclaims disk space by rebuilding the database file.')}
+
{$i18n.t('⚠️ This may take a very long time on large databases and will LOCK the entire database during execution.')}
+
{$i18n.t('It is strongly recommended to NOT run this while users are actively using the platform.')}
+
{$i18n.t('💡 Best practice: Run during scheduled maintenance windows.')}
+
+
+
+
+
+
+ {$i18n.t('Reclaim disk space after cleanup (locks database during operation)')}
+
+
+
+
+
+
+ {#if run_vacuum}
+
+
+
+
+
+
+ {$i18n.t('VACUUM Enabled - Important Considerations:')}
+
+
+
• {$i18n.t('Database will be locked during VACUUM - all users will experience errors')}
+
• {$i18n.t('Operation duration depends on database size (can be 5-30+ minutes)')}
+
• {$i18n.t('Recommended only during scheduled maintenance windows')}
+
• {$i18n.t('Not required for routine cleanups - only when reclaiming disk space is critical')}
+
+
+
+
+
+ {/if}
+
{/if}
From 873b73e66873e5dd6fb44fed9520dadfec69e53f Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 11 Nov 2025 19:39:20 +0100
Subject: [PATCH 39/43] feat: Make VACUUM database optimization optional (#30)
Co-authored-by: Claude
---
backend/open_webui/routers/prune.py | 97 +++++++++++++++++++----------
1 file changed, 63 insertions(+), 34 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index 2968764d07..112901118d 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -17,7 +17,7 @@ from sqlalchemy import text
from open_webui.utils.auth import get_admin_user
from open_webui.models.users import Users
-from open_webui.models.chats import Chats
+from open_webui.models.chats import Chat, ChatModel, Chats
from open_webui.models.files import Files
from open_webui.models.notes import Notes
from open_webui.models.prompts import Prompts
@@ -128,6 +128,26 @@ class JSONFileIDExtractor:
r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
)
+ @classmethod
+ def extract_file_ids(cls, json_string: str) -> Set[str]:
+ """
+ Extract file IDs from JSON string WITHOUT database validation.
+
+ Args:
+ json_string: JSON content as string (or any string to scan)
+
+ Returns:
+ Set of extracted file IDs (not validated against database)
+
+ Note:
+ Use this method when you have a preloaded set of valid file IDs
+ to validate against, avoiding N database queries.
+ """
+ potential_ids = []
+ potential_ids.extend(cls._FILE_ID_PATTERN.findall(json_string))
+ potential_ids.extend(cls._URL_PATTERN.findall(json_string))
+ return set(potential_ids)
+
@classmethod
def extract_and_validate_file_ids(cls, json_string: str) -> Set[str]:
"""
@@ -1068,6 +1088,10 @@ def get_active_file_ids() -> Set[str]:
active_file_ids = set()
try:
+ # Preload all valid file IDs to avoid N database queries during validation
+ # This is O(1) set lookup instead of O(n) DB queries
+ all_file_ids = {f.id for f in Files.get_files()}
+ log.debug(f"Preloaded {len(all_file_ids)} file IDs for validation")
# Scan knowledge bases for file references
knowledge_bases = Knowledges.get_knowledge_bases()
log.debug(f"Found {len(knowledge_bases)} knowledge bases")
@@ -1092,26 +1116,34 @@ def get_active_file_ids() -> Set[str]:
for file_id in file_ids:
if isinstance(file_id, str) and file_id.strip():
- active_file_ids.add(file_id.strip())
+ stripped_id = file_id.strip()
+ # Validate against preloaded set (O(1) lookup)
+ if stripped_id in all_file_ids:
+ active_file_ids.add(stripped_id)
# Scan chats for file references
- chats = Chats.get_chats()
- log.debug(f"Found {len(chats)} chats to scan for file references")
+ # Stream chats to avoid loading all into memory
+ chat_count = 0
+ with get_db() as db:
+ for chat_orm in db.query(Chat).yield_per(1000):
+ chat_count += 1
+ chat = ChatModel.model_validate(chat_orm)
- for chat in chats:
- if not chat.chat or not isinstance(chat.chat, dict):
- continue
+ if not chat.chat or not isinstance(chat.chat, dict):
+ continue
- try:
- chat_json_str = json.dumps(chat.chat)
- # Use utility to extract and validate file IDs
- validated_ids = JSONFileIDExtractor.extract_and_validate_file_ids(
- chat_json_str
- )
- active_file_ids.update(validated_ids)
+ try:
+ chat_json_str = json.dumps(chat.chat)
+ # Extract file IDs without DB queries
+ extracted_ids = JSONFileIDExtractor.extract_file_ids(chat_json_str)
+ # Validate against preloaded set (O(1) per ID)
+ validated_ids = extracted_ids & all_file_ids
+ active_file_ids.update(validated_ids)
- except Exception as e:
- log.debug(f"Error processing chat {chat.id} for file references: {e}")
+ except Exception as e:
+ log.debug(f"Error processing chat {chat.id} for file references: {e}")
+
+ log.debug(f"Scanned {chat_count} chats for file references")
# Scan folders for file references
try:
@@ -1121,10 +1153,10 @@ def get_active_file_ids() -> Set[str]:
if folder.items:
try:
items_str = json.dumps(folder.items)
- # Use utility to extract and validate file IDs
- validated_ids = (
- JSONFileIDExtractor.extract_and_validate_file_ids(items_str)
- )
+ # Extract file IDs without DB queries
+ extracted_ids = JSONFileIDExtractor.extract_file_ids(items_str)
+ # Validate against preloaded set (O(1) per ID)
+ validated_ids = extracted_ids & all_file_ids
active_file_ids.update(validated_ids)
except Exception as e:
log.debug(f"Error processing folder {folder.id} items: {e}")
@@ -1132,10 +1164,10 @@ def get_active_file_ids() -> Set[str]:
if hasattr(folder, "data") and folder.data:
try:
data_str = json.dumps(folder.data)
- # Use utility to extract and validate file IDs
- validated_ids = (
- JSONFileIDExtractor.extract_and_validate_file_ids(data_str)
- )
+ # Extract file IDs without DB queries
+ extracted_ids = JSONFileIDExtractor.extract_file_ids(data_str)
+ # Validate against preloaded set (O(1) per ID)
+ validated_ids = extracted_ids & all_file_ids
active_file_ids.update(validated_ids)
except Exception as e:
log.debug(f"Error processing folder {folder.id} data: {e}")
@@ -1146,11 +1178,10 @@ def get_active_file_ids() -> Set[str]:
# Scan standalone messages for file references
try:
with get_db() as db:
- message_results = db.execute(
- text("SELECT id, data FROM message WHERE data IS NOT NULL")
- ).fetchall()
+ stmt = text("SELECT id, data FROM message WHERE data IS NOT NULL")
- for message_id, message_data_json in message_results:
+ for row in db.execute(stmt).yield_per(1000):
+ message_id, message_data_json = row
if message_data_json:
try:
data_str = (
@@ -1158,12 +1189,10 @@ def get_active_file_ids() -> Set[str]:
if isinstance(message_data_json, dict)
else str(message_data_json)
)
- # Use utility to extract and validate file IDs
- validated_ids = (
- JSONFileIDExtractor.extract_and_validate_file_ids(
- data_str
- )
- )
+ # Extract file IDs without DB queries
+ extracted_ids = JSONFileIDExtractor.extract_file_ids(data_str)
+ # Validate against preloaded set (O(1) per ID)
+ validated_ids = extracted_ids & all_file_ids
active_file_ids.update(validated_ids)
except Exception as e:
log.debug(
From 20187f9a2dd64633f8e713745929439a60688bc2 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Thu, 13 Nov 2025 18:01:25 +0100
Subject: [PATCH 40/43] fix file lock (#33)
Co-authored-by: Claude
---
backend/open_webui/routers/prune.py | 81 ++++++++++++++++++-----------
1 file changed, 50 insertions(+), 31 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index 112901118d..cd7053e7fa 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -1381,6 +1381,13 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
If dry_run=True (default), returns preview counts without deleting anything.
If dry_run=False, performs actual deletion and returns True on success.
"""
+ # Acquire lock to prevent concurrent operations (including previews)
+ if not PruneLock.acquire():
+ raise HTTPException(
+ status_code=status.HTTP_409_CONFLICT,
+ detail="A prune operation is already in progress. Please wait for it to complete."
+ )
+
try:
# Get vector database cleaner based on configuration
vector_cleaner = get_vector_database_cleaner()
@@ -1642,45 +1649,54 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
warnings.append(f"Vector cleanup warning: {vector_error}")
log.warning(f"Vector cleanup completed with errors: {vector_error}")
- # Stage 5: Audio cache cleanup
- log.info("Cleaning audio cache")
- cleanup_audio_cache(form_data.audio_cache_max_age_days)
+ # Use modular vector database cleanup
+ warnings = []
+ deleted_vector_count, vector_error = vector_cleaner.cleanup_orphaned_collections(
+ final_active_file_ids, final_active_kb_ids
+ )
+ if vector_error:
+ warnings.append(f"Vector cleanup warning: {vector_error}")
+ log.warning(f"Vector cleanup completed with errors: {vector_error}")
# Stage 6: Database optimization (optional)
if form_data.run_vacuum:
log.info("Optimizing database with VACUUM (this may take a while and lock the database)")
+ # Stage 6: Database optimization (optional)
+ if form_data.run_vacuum:
+ log.info("Optimizing database with VACUUM (this may take a while and lock the database)")
+
+ try:
+ with get_db() as db:
+ db.execute(text("VACUUM"))
+ log.info("Vacuumed main database")
+ except Exception as e:
+ log.error(f"Failed to vacuum main database: {e}")
+
+ # Vector database-specific optimization
+ if isinstance(vector_cleaner, ChromaDatabaseCleaner):
try:
- with get_db() as db:
- db.execute(text("VACUUM"))
- log.info("Vacuumed main database")
+ with sqlite3.connect(str(vector_cleaner.chroma_db_path)) as conn:
+ conn.execute("VACUUM")
+ log.info("Vacuumed ChromaDB database")
except Exception as e:
- log.error(f"Failed to vacuum main database: {e}")
+ log.error(f"Failed to vacuum ChromaDB database: {e}")
+ elif (
+ isinstance(vector_cleaner, PGVectorDatabaseCleaner)
+ and vector_cleaner.session
+ ):
+ try:
+ vector_cleaner.session.execute(text("VACUUM ANALYZE"))
+ vector_cleaner.session.commit()
+ log.info("Executed VACUUM ANALYZE on PostgreSQL database")
+ except Exception as e:
+ log.error(f"Failed to vacuum PostgreSQL database: {e}")
+ else:
+ log.info("Skipping VACUUM optimization (not enabled)")
- # Vector database-specific optimization
- if isinstance(vector_cleaner, ChromaDatabaseCleaner):
- try:
- with sqlite3.connect(str(vector_cleaner.chroma_db_path)) as conn:
- conn.execute("VACUUM")
- log.info("Vacuumed ChromaDB database")
- except Exception as e:
- log.error(f"Failed to vacuum ChromaDB database: {e}")
- elif (
- isinstance(vector_cleaner, PGVectorDatabaseCleaner)
- and vector_cleaner.session
- ):
- try:
- vector_cleaner.session.execute(text("VACUUM ANALYZE"))
- vector_cleaner.session.commit()
- log.info("Executed VACUUM ANALYZE on PostgreSQL database")
- except Exception as e:
- log.error(f"Failed to vacuum PostgreSQL database: {e}")
- else:
- log.info("Skipping VACUUM optimization (not enabled)")
-
- # Log any warnings collected during pruning
- if warnings:
- log.warning(f"Data pruning completed with warnings: {'; '.join(warnings)}")
+ # Log any warnings collected during pruning
+ if warnings:
+ log.warning(f"Data pruning completed with warnings: {'; '.join(warnings)}")
log.info("Data pruning completed successfully")
return True
@@ -1695,3 +1711,6 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=ERROR_MESSAGES.DEFAULT("Data pruning failed"),
)
+ finally:
+ # Always release lock, even if operation fails
+ PruneLock.release()
From c307d872629bfa4cc7f49077b2a2276bdc33d774 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Thu, 13 Nov 2025 19:13:21 +0100
Subject: [PATCH 41/43] sync (#34)
Co-authored-by: Claude
---
backend/open_webui/routers/prune.py | 178 +++++++++++++++++++---------
1 file changed, 120 insertions(+), 58 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index cd7053e7fa..fc83cd6a9c 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -13,11 +13,12 @@ from abc import ABC, abstractmethod
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
-from sqlalchemy import text
+from sqlalchemy import select, text
from open_webui.utils.auth import get_admin_user
from open_webui.models.users import Users
from open_webui.models.chats import Chat, ChatModel, Chats
+from open_webui.models.messages import Message
from open_webui.models.files import Files
from open_webui.models.notes import Notes
from open_webui.models.prompts import Prompts
@@ -25,7 +26,7 @@ from open_webui.models.models import Models
from open_webui.models.knowledge import Knowledges
from open_webui.models.functions import Functions
from open_webui.models.tools import Tools
-from open_webui.models.folders import Folders
+from open_webui.models.folders import Folder, Folders
from open_webui.retrieval.vector.factory import VECTOR_DB_CLIENT, VECTOR_DB
from open_webui.constants import ERROR_MESSAGES
from open_webui.env import SRC_LOG_LEVELS
@@ -181,6 +182,65 @@ class JSONFileIDExtractor:
return validated_ids
+# UUID pattern for direct dict traversal (Phase 1.5 optimization)
+UUID_PATTERN = re.compile(
+ r'^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$'
+)
+
+
+def collect_file_ids_from_dict(obj, out: Set[str], valid_ids: Set[str], _depth: int = 0) -> None:
+ """
+ Recursively traverse dict/list structures and collect file IDs.
+
+ This function replaces json.dumps() + regex approach with direct dict traversal,
+ reducing memory usage by ~75% on large chat databases.
+
+ Args:
+ obj: Dict, list, or any value to traverse
+ out: Set to accumulate found file IDs into
+ valid_ids: Set of known valid file IDs (for O(1) validation)
+ _depth: Current recursion depth (safety limit)
+
+ Patterns detected:
+ - {"id": "uuid"}
+ - {"file_id": "uuid"}
+ - {"fileId": "uuid"}
+ - {"file_ids": ["uuid1", "uuid2"]}
+ - {"fileIds": ["uuid1", "uuid2"]}
+ """
+ # Safety: Prevent excessive recursion
+ if _depth > 100:
+ return
+
+ if isinstance(obj, dict):
+ # Check individual file ID fields
+ for field_name in ['id', 'file_id', 'fileId']:
+ fid = obj.get(field_name)
+ if isinstance(fid, str) and UUID_PATTERN.fullmatch(fid):
+ if fid in valid_ids:
+ out.add(fid)
+
+ # Check file ID array fields
+ for field_name in ['file_ids', 'fileIds']:
+ fid_array = obj.get(field_name)
+ if isinstance(fid_array, list):
+ for fid in fid_array:
+ if isinstance(fid, str) and UUID_PATTERN.fullmatch(fid):
+ if fid in valid_ids:
+ out.add(fid)
+
+ # Recurse into all dict values
+ for value in obj.values():
+ collect_file_ids_from_dict(value, out, valid_ids, _depth + 1)
+
+ elif isinstance(obj, list):
+ # Recurse into all list items
+ for item in obj:
+ collect_file_ids_from_dict(item, out, valid_ids, _depth + 1)
+
+ # Primitives (str, int, None, etc.) - do nothing
+
+
class VectorDatabaseCleaner(ABC):
"""
Abstract base class for vector database cleanup operations.
@@ -1122,82 +1182,84 @@ def get_active_file_ids() -> Set[str]:
active_file_ids.add(stripped_id)
# Scan chats for file references
- # Stream chats to avoid loading all into memory
+ # Stream chats using Core SELECT to avoid ORM overhead
chat_count = 0
with get_db() as db:
- for chat_orm in db.query(Chat).yield_per(1000):
- chat_count += 1
- chat = ChatModel.model_validate(chat_orm)
+ stmt = select(Chat.id, Chat.chat)
+ result = db.execution_options(stream_results=True).execute(stmt)
- if not chat.chat or not isinstance(chat.chat, dict):
- continue
+ while True:
+ rows = result.fetchmany(1000)
+ if not rows:
+ break
- try:
- chat_json_str = json.dumps(chat.chat)
- # Extract file IDs without DB queries
- extracted_ids = JSONFileIDExtractor.extract_file_ids(chat_json_str)
- # Validate against preloaded set (O(1) per ID)
- validated_ids = extracted_ids & all_file_ids
- active_file_ids.update(validated_ids)
+ for chat_id, chat_dict in rows:
+ chat_count += 1
- except Exception as e:
- log.debug(f"Error processing chat {chat.id} for file references: {e}")
+ # Skip if no chat data or not a dict
+ if not chat_dict or not isinstance(chat_dict, dict):
+ continue
+
+ try:
+ # Direct dict traversal (no json.dumps needed)
+ collect_file_ids_from_dict(chat_dict, active_file_ids, all_file_ids)
+ except Exception as e:
+ log.debug(f"Error processing chat {chat_id} for file references: {e}")
log.debug(f"Scanned {chat_count} chats for file references")
# Scan folders for file references
+ # Stream folders using Core SELECT to avoid ORM overhead
try:
- folders = Folders.get_all_folders()
+ with get_db() as db:
+ stmt = select(Folder.id, Folder.items, Folder.data)
+ result = db.execution_options(stream_results=True).execute(stmt)
- for folder in folders:
- if folder.items:
- try:
- items_str = json.dumps(folder.items)
- # Extract file IDs without DB queries
- extracted_ids = JSONFileIDExtractor.extract_file_ids(items_str)
- # Validate against preloaded set (O(1) per ID)
- validated_ids = extracted_ids & all_file_ids
- active_file_ids.update(validated_ids)
- except Exception as e:
- log.debug(f"Error processing folder {folder.id} items: {e}")
+ while True:
+ rows = result.fetchmany(100)
+ if not rows:
+ break
- if hasattr(folder, "data") and folder.data:
- try:
- data_str = json.dumps(folder.data)
- # Extract file IDs without DB queries
- extracted_ids = JSONFileIDExtractor.extract_file_ids(data_str)
- # Validate against preloaded set (O(1) per ID)
- validated_ids = extracted_ids & all_file_ids
- active_file_ids.update(validated_ids)
- except Exception as e:
- log.debug(f"Error processing folder {folder.id} data: {e}")
+ for folder_id, items_dict, data_dict in rows:
+ # Process folder.items
+ if items_dict:
+ try:
+ # Direct dict traversal (no json.dumps needed)
+ collect_file_ids_from_dict(items_dict, active_file_ids, all_file_ids)
+ except Exception as e:
+ log.debug(f"Error processing folder {folder_id} items: {e}")
+
+ # Process folder.data
+ if data_dict:
+ try:
+ # Direct dict traversal (no json.dumps needed)
+ collect_file_ids_from_dict(data_dict, active_file_ids, all_file_ids)
+ except Exception as e:
+ log.debug(f"Error processing folder {folder_id} data: {e}")
except Exception as e:
log.debug(f"Error scanning folders for file references: {e}")
# Scan standalone messages for file references
+ # Stream messages using Core SELECT to avoid text() and yield_per issues
try:
with get_db() as db:
- stmt = text("SELECT id, data FROM message WHERE data IS NOT NULL")
+ stmt = select(Message.id, Message.data).where(Message.data.isnot(None))
+ result = db.execution_options(stream_results=True).execute(stmt)
+
+ while True:
+ rows = result.fetchmany(1000)
+ if not rows:
+ break
+
+ for message_id, message_data_dict in rows:
+ if message_data_dict:
+ try:
+ # Direct dict traversal (no json.dumps needed)
+ collect_file_ids_from_dict(message_data_dict, active_file_ids, all_file_ids)
+ except Exception as e:
+ log.debug(f"Error processing message {message_id} data: {e}")
- for row in db.execute(stmt).yield_per(1000):
- message_id, message_data_json = row
- if message_data_json:
- try:
- data_str = (
- json.dumps(message_data_json)
- if isinstance(message_data_json, dict)
- else str(message_data_json)
- )
- # Extract file IDs without DB queries
- extracted_ids = JSONFileIDExtractor.extract_file_ids(data_str)
- # Validate against preloaded set (O(1) per ID)
- validated_ids = extracted_ids & all_file_ids
- active_file_ids.update(validated_ids)
- except Exception as e:
- log.debug(
- f"Error processing message {message_id} data: {e}"
- )
except Exception as e:
log.debug(f"Error scanning messages for file references: {e}")
From a4ddb4b15be7d8cce09daf648df66a41b7469a9f Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Thu, 13 Nov 2025 19:29:02 +0100
Subject: [PATCH 42/43] fix (#35)
Co-authored-by: Claude
Fix #1: Remove duplicate scan in preview mode
Fix #2: Cache stat() result in audio cleanup
---
backend/open_webui/routers/prune.py | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index fc83cd6a9c..857832883f 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -999,7 +999,11 @@ def count_old_chats(
return count
-def count_orphaned_records(form_data: PruneDataForm) -> dict:
+def count_orphaned_records(
+ form_data: PruneDataForm,
+ active_file_ids: Set[str],
+ active_user_ids: Set[str]
+) -> dict:
"""Count orphaned database records that would be deleted."""
counts = {
"chats": 0,
@@ -1014,12 +1018,6 @@ def count_orphaned_records(form_data: PruneDataForm) -> dict:
}
try:
- # Get active user IDs
- active_user_ids = {user.id for user in Users.get_users()["users"]}
-
- # Get active file IDs for file orphan detection
- active_file_ids = get_active_file_ids()
-
# Count orphaned files
for file_record in Files.get_files():
should_delete = (
@@ -1415,10 +1413,11 @@ def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
if not file_path.is_file():
continue
- file_mtime = file_path.stat().st_mtime
+ stat_info = file_path.stat()
+ file_mtime = stat_info.st_mtime
if file_mtime < cutoff_time:
try:
- file_size = file_path.stat().st_size
+ file_size = stat_info.st_size
file_path.unlink()
deleted_count += 1
total_size_deleted += file_size
@@ -1466,7 +1465,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
if kb.user_id in active_user_ids
}
- orphaned_counts = count_orphaned_records(form_data)
+ orphaned_counts = count_orphaned_records(form_data, active_file_ids, active_user_ids)
result = PrunePreviewResult(
inactive_users=count_inactive_users(
From 81c7617508101cfe16d551b07e94716d930e9bde Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Thu, 13 Nov 2025 20:45:47 +0100
Subject: [PATCH 43/43] feat: Make VACUUM database optimization optional (#36)
Co-authored-by: Claude
Fix #1: Remove duplicate scan in preview mode
Fix #2: Cache stat() result in audio cleanup
---
backend/open_webui/routers/prune.py | 34 +++++++++++++++++++++--------
1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py
index 857832883f..c90cf8d785 100644
--- a/backend/open_webui/routers/prune.py
+++ b/backend/open_webui/routers/prune.py
@@ -948,9 +948,16 @@ class PrunePreviewResult(BaseModel):
# Counting helper functions for dry-run preview
def count_inactive_users(
- inactive_days: Optional[int], exempt_admin: bool, exempt_pending: bool
+ inactive_days: Optional[int], exempt_admin: bool, exempt_pending: bool, all_users=None
) -> int:
- """Count users that would be deleted for inactivity."""
+ """Count users that would be deleted for inactivity.
+
+ Args:
+ inactive_days: Number of days of inactivity before deletion
+ exempt_admin: Whether to exempt admin users
+ exempt_pending: Whether to exempt pending users
+ all_users: Optional pre-fetched list of users to avoid duplicate queries
+ """
if inactive_days is None:
return 0
@@ -958,7 +965,8 @@ def count_inactive_users(
count = 0
try:
- all_users = Users.get_users()["users"]
+ if all_users is None:
+ all_users = Users.get_users()["users"]
for user in all_users:
if exempt_admin and user.role == "admin":
continue
@@ -1139,9 +1147,12 @@ def count_audio_cache_files(max_age_days: Optional[int]) -> int:
return count
-def get_active_file_ids() -> Set[str]:
+def get_active_file_ids(knowledge_bases=None) -> Set[str]:
"""
Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
+
+ Args:
+ knowledge_bases: Optional pre-fetched list of knowledge bases to avoid duplicate queries
"""
active_file_ids = set()
@@ -1151,7 +1162,8 @@ def get_active_file_ids() -> Set[str]:
all_file_ids = {f.id for f in Files.get_files()}
log.debug(f"Preloaded {len(all_file_ids)} file IDs for validation")
# Scan knowledge bases for file references
- knowledge_bases = Knowledges.get_knowledge_bases()
+ if knowledge_bases is None:
+ knowledge_bases = Knowledges.get_knowledge_bases()
log.debug(f"Found {len(knowledge_bases)} knowledge bases")
for kb in knowledge_bases:
@@ -1457,13 +1469,16 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info("Starting data pruning preview (dry run)")
# Get counts for all enabled operations
- active_file_ids = get_active_file_ids()
- active_user_ids = {user.id for user in Users.get_users()["users"]}
+ # Fetch knowledge bases and users once to avoid duplicate queries
+ knowledge_bases = Knowledges.get_knowledge_bases()
+ all_users = Users.get_users()["users"]
+ active_user_ids = {user.id for user in all_users}
active_kb_ids = {
kb.id
- for kb in Knowledges.get_knowledge_bases()
+ for kb in knowledge_bases
if kb.user_id in active_user_ids
}
+ active_file_ids = get_active_file_ids(knowledge_bases)
orphaned_counts = count_orphaned_records(form_data, active_file_ids, active_user_ids)
@@ -1472,6 +1487,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
form_data.delete_inactive_users_days,
form_data.exempt_admin_users,
form_data.exempt_pending_users,
+ all_users,
),
old_chats=count_old_chats(
form_data.days,
@@ -1570,7 +1586,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
log.info(f"Found {len(active_kb_ids)} active knowledge bases")
- active_file_ids = get_active_file_ids()
+ active_file_ids = get_active_file_ids(knowledge_bases)
# Stage 3: Delete orphaned database records
log.info("Deleting orphaned database records")