From d454e6a03359155a10fd6e8305f1a640945206ea Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Sun, 10 Aug 2025 23:40:01 +0200 Subject: [PATCH] Feat/prune orphaned data (#16) * feat: Add prune orphaned data functionality * feat: Add prune orphaned data functionality * feat: Add prune orphaned data functionality * fix: Restyle PruneDataDialog modal * feat: Add comprehensive prune orphaned data functionality and fix circular import * feat: Add comprehensive prune orphaned data functionality and fix circular import * feat: Add comprehensive prune orphaned data functionality and fix database size issues * feat: Add comprehensive prune orphaned data functionality and fix database size issues * feat: Add comprehensive prune orphaned data functionality and fix database size issues * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update folders.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Update prune.py * Delete backend/open_webui/test/test_prune.py * Update prune.ts * Update PruneDataDialog.svelte * Update prune.py * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update prune.py * Update PruneDataDialog.svelte * Update prune.ts * Update Database.svelte * Update prune.py * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update prune.py 
* Update prune.py * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update Database.svelte * Update prune.py * Update prune.ts * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte * Update prune.py * Update prune.ts * Update PruneDataDialog.svelte * Update files.py * Update prompts.py * Update notes.py * Update models.py * Update access_control.py * Update PruneDataDialog.svelte * Update PruneDataDialog.svelte --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- backend/open_webui/main.py | 2 + backend/open_webui/models/folders.py | 4 + backend/open_webui/routers/prune.py | 684 ++++++++++++++++++ src/lib/apis/prune.ts | 54 ++ .../components/admin/Settings/Database.svelte | 93 ++- .../components/common/PruneDataDialog.svelte | 589 +++++++++++++++ 6 files changed, 1402 insertions(+), 24 deletions(-) create mode 100644 backend/open_webui/routers/prune.py create mode 100644 src/lib/apis/prune.ts create mode 100644 src/lib/components/common/PruneDataDialog.svelte diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 618640486d..f6398b23fa 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -81,6 +81,7 @@ from open_webui.routers import ( models, knowledge, prompts, + prune, 
evaluations, tools, users, @@ -1234,6 +1235,7 @@ app.include_router( evaluations.router, prefix="/api/v1/evaluations", tags=["evaluations"] ) app.include_router(utils.router, prefix="/api/v1/utils", tags=["utils"]) +app.include_router(prune.router, prefix="/api/v1/prune", tags=["prune"]) # SCIM 2.0 API for identity management if SCIM_ENABLED: diff --git a/backend/open_webui/models/folders.py b/backend/open_webui/models/folders.py index 15deecbf42..8b631f88de 100644 --- a/backend/open_webui/models/folders.py +++ b/backend/open_webui/models/folders.py @@ -135,6 +135,10 @@ class FolderTable: for folder in db.query(Folder).filter_by(user_id=user_id).all() ] + def get_all_folders(self) -> list[FolderModel]: + with get_db() as db: + return [FolderModel.model_validate(folder) for folder in db.query(Folder).all()] + def get_folder_by_parent_id_and_user_id_and_name( self, parent_id: Optional[str], user_id: str, name: str ) -> Optional[FolderModel]: diff --git a/backend/open_webui/routers/prune.py b/backend/open_webui/routers/prune.py new file mode 100644 index 0000000000..78c333e538 --- /dev/null +++ b/backend/open_webui/routers/prune.py @@ -0,0 +1,684 @@ +import logging +import time +import os +import shutil +import json +import re +from typing import Optional, Set +from pathlib import Path + +from fastapi import APIRouter, Depends, HTTPException, status +from pydantic import BaseModel +from sqlalchemy import text + +from open_webui.utils.auth import get_admin_user +from open_webui.models.users import Users +from open_webui.models.chats import Chats +from open_webui.models.files import Files +from open_webui.models.notes import Notes +from open_webui.models.prompts import Prompts +from open_webui.models.models import Models +from open_webui.models.knowledge import Knowledges +from open_webui.models.functions import Functions +from open_webui.models.tools import Tools +from open_webui.models.folders import Folders +from open_webui.retrieval.vector.factory import 
VECTOR_DB_CLIENT, VECTOR_DB +from open_webui.constants import ERROR_MESSAGES +from open_webui.env import SRC_LOG_LEVELS +from open_webui.config import CACHE_DIR +from open_webui.internal.db import get_db + +log = logging.getLogger(__name__) +log.setLevel(SRC_LOG_LEVELS["MODELS"]) + +router = APIRouter() + + +class PruneDataForm(BaseModel): + days: Optional[int] = None + exempt_archived_chats: bool = False + exempt_chats_in_folders: bool = False + # Orphaned resource deletion toggles (for deleted users) + delete_orphaned_chats: bool = True + delete_orphaned_tools: bool = False + delete_orphaned_functions: bool = False + delete_orphaned_prompts: bool = True + delete_orphaned_knowledge_bases: bool = True + delete_orphaned_models: bool = True + delete_orphaned_notes: bool = True + delete_orphaned_folders: bool = True + + +def get_active_file_ids() -> Set[str]: + """ + Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages. + This is the ground truth for what files should be preserved. + """ + active_file_ids = set() + + try: + # 1. 
Get files referenced by knowledge bases (original logic) + knowledge_bases = Knowledges.get_knowledge_bases() + log.debug(f"Found {len(knowledge_bases)} knowledge bases") + + for kb in knowledge_bases: + if not kb.data: + continue + + # Handle different possible data structures for file references + file_ids = [] + + # Check for file_ids array + if isinstance(kb.data, dict) and "file_ids" in kb.data: + if isinstance(kb.data["file_ids"], list): + file_ids.extend(kb.data["file_ids"]) + + # Check for files array with id field + if isinstance(kb.data, dict) and "files" in kb.data: + if isinstance(kb.data["files"], list): + for file_ref in kb.data["files"]: + if isinstance(file_ref, dict) and "id" in file_ref: + file_ids.append(file_ref["id"]) + elif isinstance(file_ref, str): + file_ids.append(file_ref) + + # Add all found file IDs + for file_id in file_ids: + if isinstance(file_id, str) and file_id.strip(): + active_file_ids.add(file_id.strip()) + log.debug(f"KB {kb.id} references file {file_id}") + + # 2. 
Get files referenced in chats (NEW: scan chat JSON for file references) + chats = Chats.get_chats() + log.debug(f"Found {len(chats)} chats to scan for file references") + + for chat in chats: + if not chat.chat or not isinstance(chat.chat, dict): + continue + + try: + # Convert entire chat JSON to string and extract all file IDs + chat_json_str = json.dumps(chat.chat) + + # Find all file ID patterns in the JSON + # Pattern 1: "id": "uuid" where uuid looks like a file ID + file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"') + potential_file_ids = file_id_pattern.findall(chat_json_str) + + # Pattern 2: URLs containing /api/v1/files/uuid + url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})') + url_file_ids = url_pattern.findall(chat_json_str) + + # Combine and validate against actual file records + all_potential_ids = set(potential_file_ids + url_file_ids) + for file_id in all_potential_ids: + # Verify this ID exists in the file table to avoid false positives + if Files.get_file_by_id(file_id): + active_file_ids.add(file_id) + log.debug(f"Chat {chat.id}: Found active file {file_id}") + + except Exception as e: + log.debug(f"Error processing chat {chat.id} for file references: {e}") + + # 3. 
Get files referenced in folders (scan folder.items, folder.data, folder.meta) + try: + folders = Folders.get_all_folders() + log.debug(f"Found {len(folders)} folders to scan for file references") + + for folder in folders: + # Check folder.items JSON + if folder.items: + try: + items_str = json.dumps(folder.items) + # Look for file ID patterns in the JSON + file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"') + url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})') + + potential_ids = file_id_pattern.findall(items_str) + url_pattern.findall(items_str) + for file_id in potential_ids: + if Files.get_file_by_id(file_id): + active_file_ids.add(file_id) + log.debug(f"Folder {folder.id}: Found file {file_id} in items") + except Exception as e: + log.debug(f"Error processing folder {folder.id} items: {e}") + + # Check folder.data JSON + if hasattr(folder, 'data') and folder.data: + try: + data_str = json.dumps(folder.data) + file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"') + url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})') + + potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str) + for file_id in potential_ids: + if Files.get_file_by_id(file_id): + active_file_ids.add(file_id) + log.debug(f"Folder {folder.id}: Found file {file_id} in data") + except Exception as e: + log.debug(f"Error processing folder {folder.id} data: {e}") + + except Exception as e: + log.debug(f"Error scanning folders for file references: {e}") + + # 4. 
Get files referenced in standalone messages (message table) + try: + # Query message table directly since we may not have a Messages model + with get_db() as db: + message_results = db.execute(text("SELECT id, data FROM message WHERE data IS NOT NULL")).fetchall() + log.debug(f"Found {len(message_results)} messages with data to scan") + + for message_id, message_data_json in message_results: + if message_data_json: + try: + # Convert JSON to string and scan for file patterns + data_str = json.dumps(message_data_json) if isinstance(message_data_json, dict) else str(message_data_json) + + file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"') + url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})') + + potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str) + for file_id in potential_ids: + if Files.get_file_by_id(file_id): + active_file_ids.add(file_id) + log.debug(f"Message {message_id}: Found file {file_id}") + except Exception as e: + log.debug(f"Error processing message {message_id} data: {e}") + except Exception as e: + log.debug(f"Error scanning messages for file references: {e}") + + except Exception as e: + log.error(f"Error determining active file IDs: {e}") + # NOTE(review): NOT fail-safe - prune_data treats every file missing from this set as orphaned, so an empty set marks ALL files for deletion + return set() + + log.info(f"Found {len(active_file_ids)} active file IDs") + return active_file_ids + + +def safe_delete_vector_collection(collection_name: str) -> bool: + """ + Safely delete a vector collection, handling both logical and physical cleanup.
+ """ + try: + # First, try to delete the collection through the client + try: + VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name) + log.debug(f"Deleted collection from vector DB: {collection_name}") + except Exception as e: + log.debug(f"Collection {collection_name} may not exist in DB: {e}") + + # Then, handle physical cleanup for ChromaDB + if "chroma" in VECTOR_DB.lower(): + vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name + if vector_dir.exists() and vector_dir.is_dir(): + shutil.rmtree(vector_dir) + log.debug(f"Deleted physical vector directory: {vector_dir}") + return True + + return True + + except Exception as e: + log.error(f"Error deleting vector collection {collection_name}: {e}") + return False + + +def safe_delete_file_by_id(file_id: str) -> bool: + """ + Safely delete a file record and its associated vector collection. + """ + try: + # Get file info before deletion + file_record = Files.get_file_by_id(file_id) + if not file_record: + log.debug(f"File {file_id} not found in database") + return True # Already gone + + # Delete vector collection first + collection_name = f"file-{file_id}" + safe_delete_vector_collection(collection_name) + + # Delete database record + Files.delete_file_by_id(file_id) + log.debug(f"Deleted file record: {file_id}") + + return True + + except Exception as e: + log.error(f"Error deleting file {file_id}: {e}") + return False + + +def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None: + """ + Clean up orphaned files in the uploads directory. 
+ """ + upload_dir = Path(CACHE_DIR).parent / "uploads" + if not upload_dir.exists(): + log.debug("Uploads directory does not exist") + return + + deleted_count = 0 + + try: + for file_path in upload_dir.iterdir(): + if not file_path.is_file(): + continue + + filename = file_path.name + + # Extract file ID from filename (common patterns) + file_id = None + + # Pattern 1: UUID_filename or UUID-filename + if len(filename) > 36: + potential_id = filename[:36] + if potential_id.count('-') == 4: # UUID format + file_id = potential_id + + # Pattern 2: filename might be the file ID itself + if not file_id and filename.count('-') == 4 and len(filename) == 36: + file_id = filename + + # Pattern 3: Check if any part of filename matches active IDs + if not file_id: + for active_id in active_file_ids: + if active_id in filename: + file_id = active_id + break + + # If we found a potential file ID and it's not active, delete it + if file_id and file_id not in active_file_ids: + try: + file_path.unlink() + deleted_count += 1 + log.debug(f"Deleted orphaned upload file: {filename}") + except Exception as e: + log.error(f"Failed to delete upload file {filename}: {e}") + + except Exception as e: + log.error(f"Error cleaning uploads directory: {e}") + + if deleted_count > 0: + log.info(f"Deleted {deleted_count} orphaned upload files") + + +def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids: Set[str]) -> None: + """ + Clean up orphaned vector collections by querying ChromaDB metadata. 
+ """ + if "chroma" not in VECTOR_DB.lower(): + return + + vector_dir = Path(CACHE_DIR).parent / "vector_db" + if not vector_dir.exists(): + log.debug("Vector DB directory does not exist") + return + + chroma_db_path = vector_dir / "chroma.sqlite3" + if not chroma_db_path.exists(): + log.debug("ChromaDB metadata file does not exist") + return + + # Build expected collection names + expected_collections = set() + + # File collections: file-{file_id} + for file_id in active_file_ids: + expected_collections.add(f"file-{file_id}") + + # Knowledge base collections: {kb_id} + for kb_id in active_kb_ids: + expected_collections.add(kb_id) + + log.debug(f"Expected collections to preserve: {expected_collections}") + + # Query ChromaDB metadata to get the complete mapping chain: + # Directory UUID -> Collection ID -> Collection Name + uuid_to_collection = {} + try: + import sqlite3 + log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}") + + with sqlite3.connect(str(chroma_db_path)) as conn: + # First, check what tables exist + tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall() + log.debug(f"ChromaDB tables: {tables}") + + # Check the schema of collections table + schema = conn.execute("PRAGMA table_info(collections)").fetchall() + log.debug(f"Collections table schema: {schema}") + + # Get Collection ID -> Collection Name mapping + collection_id_to_name = {} + cursor = conn.execute("SELECT id, name FROM collections") + rows = cursor.fetchall() + log.debug(f"Raw ChromaDB collections query results: {rows}") + + for row in rows: + collection_id, collection_name = row + collection_id_to_name[collection_id] = collection_name + log.debug(f"Mapped collection ID {collection_id} -> name {collection_name}") + + # Get Directory UUID -> Collection ID mapping from segments table + # Only interested in VECTOR segments as those are the actual data directories + cursor = conn.execute("SELECT id, collection FROM segments WHERE scope = 
'VECTOR'") + segment_rows = cursor.fetchall() + log.debug(f"Raw ChromaDB segments query results: {segment_rows}") + + for row in segment_rows: + segment_id, collection_id = row + if collection_id in collection_id_to_name: + collection_name = collection_id_to_name[collection_id] + uuid_to_collection[segment_id] = collection_name + log.debug(f"Mapped directory UUID {segment_id} -> collection {collection_name}") + + log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}") + log.info(f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata") + + except Exception as e: + log.error(f"Error reading ChromaDB metadata: {e}") + # Fail safe: don't delete anything if we can't read metadata + return + + deleted_count = 0 + + try: + for collection_dir in vector_dir.iterdir(): + if not collection_dir.is_dir(): + continue + + dir_uuid = collection_dir.name + + # Skip system/metadata files + if dir_uuid.startswith('.'): + continue + + # Get the actual collection name from metadata + collection_name = uuid_to_collection.get(dir_uuid) + + if collection_name is None: + # Directory exists but no metadata entry - it's orphaned + log.debug(f"Directory {dir_uuid} has no metadata entry, deleting") + try: + shutil.rmtree(collection_dir) + deleted_count += 1 + except Exception as e: + log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}") + + elif collection_name not in expected_collections: + # Collection exists but should be deleted + log.debug(f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting") + try: + shutil.rmtree(collection_dir) + deleted_count += 1 + except Exception as e: + log.error(f"Failed to delete collection directory {dir_uuid}: {e}") + + else: + # Collection should be preserved + log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})") + + except Exception as e: + log.error(f"Error cleaning vector collections: {e}") + + if deleted_count > 0: + log.info(f"Deleted {deleted_count} orphaned vector 
collections") + + +@router.post("/", response_model=bool) +async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)): + """ + Prunes old and orphaned data using a safe, multi-stage process. + + Parameters: + - days: Optional[int] = None + - If None: Skip chat deletion entirely + - If 0: Delete all chats (older than 0 days = all chats) + - If >= 1: Delete chats older than specified number of days + - exempt_archived_chats: bool = False + - If True: Exempt archived chats from deletion (only applies when days is not None) + - exempt_chats_in_folders: bool = False + - If True: Exempt chats that are in folders OR pinned chats from deletion (only applies when days is not None) + Note: Pinned chats behave the same as chats in folders + - delete_orphaned_chats: bool = True + - If True: Delete chats from deleted users + - delete_orphaned_tools: bool = False + - If True: Delete tools from deleted users + - delete_orphaned_functions: bool = False + - If True: Delete functions from deleted users + - delete_orphaned_prompts: bool = True + - If True: Delete prompts from deleted users + - delete_orphaned_knowledge_bases: bool = True + - If True: Delete knowledge bases from deleted users + - delete_orphaned_models: bool = True + - If True: Delete models from deleted users + - delete_orphaned_notes: bool = True + - If True: Delete notes from deleted users + - delete_orphaned_folders: bool = True + - If True: Delete folders from deleted users + """ + try: + log.info("Starting data pruning process") + + # Stage 1: Delete old chats based on user criteria (optional) + if form_data.days is not None: + cutoff_time = int(time.time()) - (form_data.days * 86400) + chats_to_delete = [] + + for chat in Chats.get_chats(): + if chat.updated_at < cutoff_time: + # Check exemption conditions + if form_data.exempt_archived_chats and chat.archived: + log.debug(f"Exempting archived chat: {chat.id}") + continue + if form_data.exempt_chats_in_folders and (getattr(chat, 'folder_id',
None) is not None or getattr(chat, 'pinned', False)): + folder_status = f"folder_id: {getattr(chat, 'folder_id', None)}" if getattr(chat, 'folder_id', None) else "not in folder" + pinned_status = f"pinned: {getattr(chat, 'pinned', False)}" + log.debug(f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})") + continue + log.debug(f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}") + chats_to_delete.append(chat) + + if chats_to_delete: + log.info(f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)") + for chat in chats_to_delete: + Chats.delete_chat_by_id(chat.id) + else: + log.info(f"No chats found older than {form_data.days} days") + else: + log.info("Skipping chat deletion (days parameter is None)") + + # Stage 2: Build ground truth of what should be preserved + log.info("Building preservation set") + + # Get all active users + active_user_ids = {user.id for user in Users.get_users()["users"]} + log.info(f"Found {len(active_user_ids)} active users") + + # Get all active knowledge bases and their file references + active_kb_ids = set() + knowledge_bases = Knowledges.get_knowledge_bases() + + for kb in knowledge_bases: + if kb.user_id in active_user_ids: + active_kb_ids.add(kb.id) + + log.info(f"Found {len(active_kb_ids)} active knowledge bases") + + # Get all files that should be preserved (NOW COMPREHENSIVE!) 
+ active_file_ids = get_active_file_ids() + + # Stage 3: Delete orphaned database records + log.info("Deleting orphaned database records") + + # Delete files not referenced by any knowledge base or belonging to deleted users + deleted_files = 0 + for file_record in Files.get_files(): + should_delete = ( + file_record.id not in active_file_ids or + file_record.user_id not in active_user_ids + ) + + if should_delete: + if safe_delete_file_by_id(file_record.id): + deleted_files += 1 + + if deleted_files > 0: + log.info(f"Deleted {deleted_files} orphaned files") + + # Delete knowledge bases from deleted users (if enabled) + deleted_kbs = 0 + if form_data.delete_orphaned_knowledge_bases: + for kb in knowledge_bases: + if kb.user_id not in active_user_ids: + if safe_delete_vector_collection(kb.id): + Knowledges.delete_knowledge_by_id(kb.id) + deleted_kbs += 1 + + if deleted_kbs > 0: + log.info(f"Deleted {deleted_kbs} orphaned knowledge bases") + else: + log.info("Skipping knowledge base deletion (disabled)") + + # Delete other user-owned resources from deleted users (conditional) + deleted_others = 0 + + # Delete orphaned chats of deleted users (conditional) + if form_data.delete_orphaned_chats: + chats_deleted = 0 + for chat in Chats.get_chats(): + if chat.user_id not in active_user_ids: + Chats.delete_chat_by_id(chat.id) + chats_deleted += 1 + deleted_others += 1 + if chats_deleted > 0: + log.info(f"Deleted {chats_deleted} orphaned chats") + else: + log.info("Skipping orphaned chat deletion (disabled)") + + # Delete orphaned tools of deleted users (conditional) + if form_data.delete_orphaned_tools: + tools_deleted = 0 + for tool in Tools.get_tools(): + if tool.user_id not in active_user_ids: + Tools.delete_tool_by_id(tool.id) + tools_deleted += 1 + deleted_others += 1 + if tools_deleted > 0: + log.info(f"Deleted {tools_deleted} orphaned tools") + else: + log.info("Skipping tool deletion (disabled)") + + # Delete orphaned functions of deleted users (conditional) + if 
form_data.delete_orphaned_functions: + functions_deleted = 0 + for function in Functions.get_functions(): + if function.user_id not in active_user_ids: + Functions.delete_function_by_id(function.id) + functions_deleted += 1 + deleted_others += 1 + if functions_deleted > 0: + log.info(f"Deleted {functions_deleted} orphaned functions") + else: + log.info("Skipping function deletion (disabled)") + + # Delete orphaned notes of deleted users (conditional) + if form_data.delete_orphaned_notes: + notes_deleted = 0 + for note in Notes.get_notes(): + if note.user_id not in active_user_ids: + Notes.delete_note_by_id(note.id) + notes_deleted += 1 + deleted_others += 1 + if notes_deleted > 0: + log.info(f"Deleted {notes_deleted} orphaned notes") + else: + log.info("Skipping note deletion (disabled)") + + # Delete orphaned prompts of deleted users (conditional) + if form_data.delete_orphaned_prompts: + prompts_deleted = 0 + for prompt in Prompts.get_prompts(): + if prompt.user_id not in active_user_ids: + Prompts.delete_prompt_by_command(prompt.command) + prompts_deleted += 1 + deleted_others += 1 + if prompts_deleted > 0: + log.info(f"Deleted {prompts_deleted} orphaned prompts") + else: + log.info("Skipping prompt deletion (disabled)") + + # Delete orphaned models of deleted users (conditional) + if form_data.delete_orphaned_models: + models_deleted = 0 + for model in Models.get_all_models(): + if model.user_id not in active_user_ids: + Models.delete_model_by_id(model.id) + models_deleted += 1 + deleted_others += 1 + if models_deleted > 0: + log.info(f"Deleted {models_deleted} orphaned models") + else: + log.info("Skipping model deletion (disabled)") + + # Delete orphaned folders of deleted users (conditional) + if form_data.delete_orphaned_folders: + folders_deleted = 0 + for folder in Folders.get_all_folders(): + if folder.user_id not in active_user_ids: + Folders.delete_folder_by_id_and_user_id(folder.id, folder.user_id, delete_chats=False) + folders_deleted += 1 + 
deleted_others += 1 + if folders_deleted > 0: + log.info(f"Deleted {folders_deleted} orphaned folders") + else: + log.info("Skipping folder deletion (disabled)") + + if deleted_others > 0: + log.info(f"Total other orphaned records deleted: {deleted_others}") + + # Stage 4: Clean up orphaned physical files + log.info("Cleaning up orphaned physical files") + + # Rebuild active sets after database cleanup + final_active_file_ids = get_active_file_ids() + final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()} + + # Clean uploads directory + cleanup_orphaned_uploads(final_active_file_ids) + + # Clean vector collections + cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids) + + # Stage 5: Database optimization + log.info("Optimizing database") + + # Vacuum main database + try: + with get_db() as db: + db.execute(text("VACUUM")) + log.debug("Vacuumed main database") + except Exception as e: + log.error(f"Failed to vacuum main database: {e}") + + # Vacuum ChromaDB database if it exists + if "chroma" in VECTOR_DB.lower(): + chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3" + if chroma_db_path.exists(): + try: + import sqlite3 + with sqlite3.connect(str(chroma_db_path)) as conn: + conn.execute("VACUUM") + log.debug("Vacuumed ChromaDB database") + except Exception as e: + log.error(f"Failed to vacuum ChromaDB database: {e}") + + log.info("Data pruning completed successfully") + return True + + except Exception as e: + log.exception(f"Error during data pruning: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=ERROR_MESSAGES.DEFAULT("Data pruning failed"), + ) diff --git a/src/lib/apis/prune.ts b/src/lib/apis/prune.ts new file mode 100644 index 0000000000..d95d662438 --- /dev/null +++ b/src/lib/apis/prune.ts @@ -0,0 +1,54 @@ +import { WEBUI_API_BASE_URL } from '$lib/constants'; + +export const pruneData = async ( + token: string, + days: number | null, + 
exempt_archived_chats: boolean, + exempt_chats_in_folders: boolean, + delete_orphaned_chats: boolean = true, + delete_orphaned_tools: boolean = false, + delete_orphaned_functions: boolean = false, + delete_orphaned_prompts: boolean = true, + delete_orphaned_knowledge_bases: boolean = true, + delete_orphaned_models: boolean = true, + delete_orphaned_notes: boolean = true, + delete_orphaned_folders: boolean = true +) => { + let error = null; + + const res = await fetch(`${WEBUI_API_BASE_URL}/prune/`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${token}` + }, + body: JSON.stringify({ + days, + exempt_archived_chats, + exempt_chats_in_folders, + delete_orphaned_chats, + delete_orphaned_tools, + delete_orphaned_functions, + delete_orphaned_prompts, + delete_orphaned_knowledge_bases, + delete_orphaned_models, + delete_orphaned_notes, + delete_orphaned_folders + }) + }) + .then(async (res) => { + if (!res.ok) throw await res.json(); + return res.json(); + }) + .catch((err) => { + error = err; + console.log(err); + return null; + }); + + if (error) { + throw error; + } + + return res; +}; diff --git a/src/lib/components/admin/Settings/Database.svelte b/src/lib/components/admin/Settings/Database.svelte index b2ac5553de..19ec874746 100644 --- a/src/lib/components/admin/Settings/Database.svelte +++ b/src/lib/components/admin/Settings/Database.svelte @@ -1,7 +1,6 @@ - +
{ @@ -58,7 +91,6 @@
{$i18n.t('Database')}
- { const file = e.target.files[0]; const reader = new FileReader(); - reader.onload = async (e) => { const res = await importConfig(localStorage.token, JSON.parse(e.target.result)).catch( (error) => { toast.error(`${error}`); } ); - if (res) { toast.success('Config imported successfully'); } e.target.value = null; }; - reader.readAsText(file); }} /> - - -
- {#if $config?.features.enable_admin_export ?? true}
-
-
-
{/if} +
+ - -
+ \ No newline at end of file diff --git a/src/lib/components/common/PruneDataDialog.svelte b/src/lib/components/common/PruneDataDialog.svelte new file mode 100644 index 0000000000..10a29d2594 --- /dev/null +++ b/src/lib/components/common/PruneDataDialog.svelte @@ -0,0 +1,589 @@ + + + +
+
+
+ {$i18n.t('Prune Orphaned Data')} +
+ +
+ +
+
+ +
+
+
+ + + +
+
+

+ {$i18n.t('Destructive Operation - Backup Recommended')} +

+
+

{$i18n.t('This action will permanently delete data from your database. Only orphaned or old data, based on your configuration settings, will be deleted. All active, referenced data remains completely safe.')}

+

{$i18n.t('This operation cannot be undone. Create a complete backup of your database and files before proceeding. This operation is performed entirely at your own risk - having a backup ensures you can restore any data if something unexpected occurs.')}

+ + +
+ + + {#if showDetailsExpanded} +
+

{$i18n.t('Note:')} {$i18n.t('This list provides an overview of what will be deleted during the pruning process and may not be complete or fully up-to-date.')}

+ + +
+ + + + + +
+ + +
+ {#if activeDetailsTab === 'chats'} +
+

{$i18n.t('Age-Based Chat Deletion:')}

+

• {$i18n.t('Removes conversations older than specified days based on when they were last modified or updated (not when they were created)')}

+

• {$i18n.t('Supports exemptions for:')}

+

◦ {$i18n.t('Archived chats')}

+

◦ {$i18n.t('Chats organized in folders and pinned chats')}

+ +

{$i18n.t('Orphaned Content Cleanup:')}

+

• {$i18n.t('Delete orphaned chats from deleted users')}

+

• {$i18n.t('Delete orphaned folders from deleted users')}

+
+ {:else if activeDetailsTab === 'workspace'} +
+

{$i18n.t('Orphaned Workspace Items from Deleted Users:')}

+

• {$i18n.t('Delete orphaned knowledge bases')}

+

• {$i18n.t('Delete orphaned custom tools')}

+

• {$i18n.t('Delete orphaned custom functions (Actions, Pipes, Filters)')}

+

• {$i18n.t('Delete orphaned custom prompts and templates')}

+

• {$i18n.t('Delete orphaned custom models and configurations')}

+

• {$i18n.t('Delete orphaned notes')}

+
+ {:else if activeDetailsTab === 'datavector'} +
+

{$i18n.t('Files & Vector Storage:')}

+

• {$i18n.t('Orphaned files and attachments from deleted content')}

+

• {$i18n.t('Vector embeddings and collections for removed data')}

+

• {$i18n.t('Uploaded files that lost their database references')}

+

• {$i18n.t('Vector storage directories without corresponding data')}

+
+ {:else if activeDetailsTab === 'imagesaudio'} +
+

{$i18n.t('Images & Audio Content Cleanup:')}

+

• {$i18n.t('TBD - Image cleanup functionality')}

+

• {$i18n.t('TBD - Audio cleanup functionality')}

+

• {$i18n.t('TBD - Orphaned images and audio files')}

+

• {$i18n.t('TBD - Media processing cache cleanup')}

+
+ {:else if activeDetailsTab === 'system'} +
+

{$i18n.t('Database & System Cleanup:')}

+

• {$i18n.t('Removal of broken database references and stale entries')}

+

• {$i18n.t('Disk space reclamation by database cleanup')}

+

• {$i18n.t('Synchronization of database records with actual file storage')}

+

• {$i18n.t('Fix inconsistencies between storage systems')}

+

• {$i18n.t('Database performance optimization')}

+
+ {/if} +
+
+ {/if} +
+
+
+
+
+ + +
+
+
+ + + +
+
+

+ {$i18n.t('Performance Warning: This operation may take a very long time to complete, especially if you have never cleaned your database before or if your instance stores large amounts of data. The process could take anywhere from seconds, to minutes, to half an hour and beyond depending on your data size.')} +

+
+
+
+ + +
+
+ + + +

+ {$i18n.t('Pruning Configuration')} +

+
+

+ {$i18n.t('Configure what data should be cleaned up during the pruning process.')} +

+ + +
+ + +
+ + +
+ {#if activeSettingsTab === 'chats'} + +
+
+
+
+ +
+
+
+ {$i18n.t('Delete chats by age')} +
+
+ {$i18n.t('Optionally remove old chats based on last update time')} +
+
+
+
+ + + {#if deleteChatsByAge} +
+
+ +
+ + {$i18n.t('days')} +
+

+ {$i18n.t('Set to 0 to delete all chats, or specify number of days')} +

+
+ +
+
+
+ +
+
+
+ {$i18n.t('Exempt archived chats')} +
+
+ {$i18n.t('Keep archived chats even if they are old')} +
+
+
+
+ +
+
+
+ +
+
+
+ {$i18n.t('Exempt chats in folders')} +
+
+ {$i18n.t('Keep chats that are organized in folders or pinned')} +
+
+
+
+
+ {/if} + + +
+
+
+
+ +
+
+
+ {$i18n.t('Delete orphaned chats')} +
+
+ {$i18n.t('Delete orphaned chats from deleted users')} +
+
+
+
+ +
+
+
+ +
+
+
+ {$i18n.t('Delete orphaned folders')} +
+
+ {$i18n.t('Delete orphaned folders from deleted users')} +
+
+
+
+
+
+ + {:else if activeSettingsTab === 'workspace'} +
+ +
+
+
+ +
+
+
+ {$i18n.t('Delete orphaned knowledge bases')} +
+
+ {$i18n.t('Delete orphaned knowledge bases from deleted users')} +
+
+
+
+ + +
+
+
+ +
+
+
+ {$i18n.t('Delete orphaned tools')} +
+
+ {$i18n.t('Delete orphaned custom tools from deleted users')} +
+
+
+
+ + +
+
+
+ +
+
+
+ {$i18n.t('Delete orphaned functions')} +
+ + + +
+
{$i18n.t('Admin panel functions - all functions, including:')}
+
+
• {$i18n.t('Actions')}
+
• {$i18n.t('Pipes')}
+
• {$i18n.t('Filters')}
+
+
+
+
+
+
+ {$i18n.t('Delete orphaned custom functions from deleted users')} +
+
+
+
+ + +
+
+
+ +
+
+
+ {$i18n.t('Delete orphaned prompts')} +
+
+ {$i18n.t('Delete orphaned custom prompts from deleted users')} +
+
+
+
+ + +
+
+
+ +
+
+
+ {$i18n.t('Delete orphaned models')} +
+
+ {$i18n.t('Delete orphaned custom models from deleted users')} +
+
+
+
+ + +
+
+
+ +
+
+
+ {$i18n.t('Delete orphaned notes')} +
+
+ {$i18n.t('Delete orphaned notes from deleted users')} +
+
+
+
+
+ {/if} +
+
+ + +
+
+
+ + + +
+
+

+ {$i18n.t('API Automation Helper')} +

+ + + + {#if showApiPreview} +
+

+ {$i18n.t('Use this API call configuration to automate pruning operations in your own maintenance scripts.')} +

+
+ + +
+
+ {/if} +
+
+
+
+ + +
+ + +
+
+
+