Mirror of https://github.com/open-webui/open-webui.git (synced 2025-12-12 04:15:25 +00:00)
Update prune.py
commit 34c9a8825c (parent 709c852917)
1 changed file with 299 additions and 180 deletions
@@ -38,6 +38,7 @@ class PruneDataForm(BaseModel):
     days: Optional[int] = None
     exempt_archived_chats: bool = False
     exempt_chats_in_folders: bool = False
+    # Orphaned resource deletion toggles (for deleted users)
     delete_orphaned_chats: bool = True
     delete_orphaned_tools: bool = False
     delete_orphaned_functions: bool = False
@@ -46,17 +47,17 @@ class PruneDataForm(BaseModel):
     delete_orphaned_models: bool = True
     delete_orphaned_notes: bool = True
     delete_orphaned_folders: bool = True
-    audio_cache_max_age_days: Optional[int] = 30


 def get_active_file_ids() -> Set[str]:
     """
     Get all file IDs that are actively referenced by knowledge bases, chats, folders, and messages.
+    This is the ground truth for what files should be preserved.
     """
     active_file_ids = set()

     try:
-        # Scan knowledge bases for file references
+        # 1. Get files referenced by knowledge bases (original logic)
         knowledge_bases = Knowledges.get_knowledge_bases()
         log.debug(f"Found {len(knowledge_bases)} knowledge bases")

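For context when reviewing the form changes above: a rough, hedged sketch of how an admin client could call the pruning endpoint after this commit. The base URL, route prefix, and token are assumptions for illustration only, not part of the diff; note that audio_cache_max_age_days no longer exists on the form.

    # Hypothetical client-side sketch; endpoint path and auth handling are assumptions.
    import requests

    payload = {
        "days": 90,                      # prune chats not updated in 90 days (None skips Stage 1)
        "exempt_archived_chats": True,
        "exempt_chats_in_folders": True,
        "delete_orphaned_chats": True,
        "delete_orphaned_tools": False,
        "delete_orphaned_functions": False,
        "delete_orphaned_models": True,
        "delete_orphaned_notes": True,
        "delete_orphaned_folders": True,
    }

    resp = requests.post(
        "http://localhost:8080/api/v1/prune/",   # assumed route; check the actual router prefix
        json=payload,
        headers={"Authorization": "Bearer <admin-token>"},  # placeholder token
    )
    print(resp.status_code, resp.json())
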
@@ -64,12 +65,15 @@ def get_active_file_ids() -> Set[str]:
             if not kb.data:
                 continue

+            # Handle different possible data structures for file references
             file_ids = []

+            # Check for file_ids array
             if isinstance(kb.data, dict) and "file_ids" in kb.data:
                 if isinstance(kb.data["file_ids"], list):
                     file_ids.extend(kb.data["file_ids"])

+            # Check for files array with id field
             if isinstance(kb.data, dict) and "files" in kb.data:
                 if isinstance(kb.data["files"], list):
                     for file_ref in kb.data["files"]:
@@ -78,11 +82,13 @@ def get_active_file_ids() -> Set[str]:
                         elif isinstance(file_ref, str):
                             file_ids.append(file_ref)

+            # Add all found file IDs
             for file_id in file_ids:
                 if isinstance(file_id, str) and file_id.strip():
                     active_file_ids.add(file_id.strip())
+                    log.debug(f"KB {kb.id} references file {file_id}")

-        # Scan chats for file references
+        # 2. Get files referenced in chats (NEW: scan chat JSON for file references)
         chats = Chats.get_chats()
         log.debug(f"Found {len(chats)} chats to scan for file references")

@@ -91,81 +97,134 @@ def get_active_file_ids() -> Set[str]:
                 continue

             try:
+                # Convert entire chat JSON to string and extract all file IDs
                 chat_json_str = json.dumps(chat.chat)

-                # Extract file IDs using regex patterns
-                file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
-                url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+                # Find all file ID patterns in the JSON
+                # Pattern 1: "id": "uuid" where uuid looks like a file ID
+                file_id_pattern = re.compile(
+                    r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+                )
                 potential_file_ids = file_id_pattern.findall(chat_json_str)

+                # Pattern 2: URLs containing /api/v1/files/uuid
+                url_pattern = re.compile(
+                    r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+                )
                 url_file_ids = url_pattern.findall(chat_json_str)

+                # Combine and validate against actual file records
                 all_potential_ids = set(potential_file_ids + url_file_ids)
                 for file_id in all_potential_ids:
+                    # Verify this ID exists in the file table to avoid false positives
                     if Files.get_file_by_id(file_id):
                         active_file_ids.add(file_id)
+                        log.debug(f"Chat {chat.id}: Found active file {file_id}")

             except Exception as e:
                 log.debug(f"Error processing chat {chat.id} for file references: {e}")

-        # Scan folders for file references
+        # 3. Get files referenced in folders (scan folder.items, folder.data, folder.meta)
         try:
             folders = Folders.get_all_folders()
+            log.debug(f"Found {len(folders)} folders to scan for file references")

             for folder in folders:
+                # Check folder.items JSON
                 if folder.items:
                     try:
                         items_str = json.dumps(folder.items)
-                        file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
-                        url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+                        # Look for file ID patterns in the JSON
+                        file_id_pattern = re.compile(
+                            r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+                        )
+                        url_pattern = re.compile(
+                            r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+                        )

-                        potential_ids = file_id_pattern.findall(items_str) + url_pattern.findall(items_str)
+                        potential_ids = file_id_pattern.findall(
+                            items_str
+                        ) + url_pattern.findall(items_str)
                         for file_id in potential_ids:
                             if Files.get_file_by_id(file_id):
                                 active_file_ids.add(file_id)
+                                log.debug(
+                                    f"Folder {folder.id}: Found file {file_id} in items"
+                                )
                     except Exception as e:
                         log.debug(f"Error processing folder {folder.id} items: {e}")

-                if hasattr(folder, 'data') and folder.data:
+                # Check folder.data JSON
+                if hasattr(folder, "data") and folder.data:
                     try:
                         data_str = json.dumps(folder.data)
-                        file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
-                        url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+                        file_id_pattern = re.compile(
+                            r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+                        )
+                        url_pattern = re.compile(
+                            r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+                        )

-                        potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str)
+                        potential_ids = file_id_pattern.findall(
+                            data_str
+                        ) + url_pattern.findall(data_str)
                         for file_id in potential_ids:
                             if Files.get_file_by_id(file_id):
                                 active_file_ids.add(file_id)
+                                log.debug(
+                                    f"Folder {folder.id}: Found file {file_id} in data"
+                                )
                     except Exception as e:
                         log.debug(f"Error processing folder {folder.id} data: {e}")

         except Exception as e:
             log.debug(f"Error scanning folders for file references: {e}")

-        # Scan standalone messages for file references
+        # 4. Get files referenced in standalone messages (message table)
         try:
+            # Query message table directly since we may not have a Messages model
             with get_db() as db:
-                message_results = db.execute(text("SELECT id, data FROM message WHERE data IS NOT NULL")).fetchall()
+                message_results = db.execute(
+                    text("SELECT id, data FROM message WHERE data IS NOT NULL")
+                ).fetchall()
+                log.debug(f"Found {len(message_results)} messages with data to scan")

                 for message_id, message_data_json in message_results:
                     if message_data_json:
                         try:
-                            data_str = json.dumps(message_data_json) if isinstance(message_data_json, dict) else str(message_data_json)
+                            # Convert JSON to string and scan for file patterns
+                            data_str = (
+                                json.dumps(message_data_json)
+                                if isinstance(message_data_json, dict)
+                                else str(message_data_json)
+                            )

-                            file_id_pattern = re.compile(r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"')
-                            url_pattern = re.compile(r'/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})')
+                            file_id_pattern = re.compile(
+                                r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+                            )
+                            url_pattern = re.compile(
+                                r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+                            )

-                            potential_ids = file_id_pattern.findall(data_str) + url_pattern.findall(data_str)
+                            potential_ids = file_id_pattern.findall(
+                                data_str
+                            ) + url_pattern.findall(data_str)
                             for file_id in potential_ids:
                                 if Files.get_file_by_id(file_id):
                                     active_file_ids.add(file_id)
+                                    log.debug(
+                                        f"Message {message_id}: Found file {file_id}"
+                                    )
                         except Exception as e:
-                            log.debug(f"Error processing message {message_id} data: {e}")
+                            log.debug(
+                                f"Error processing message {message_id} data: {e}"
+                            )
         except Exception as e:
             log.debug(f"Error scanning messages for file references: {e}")

     except Exception as e:
         log.error(f"Error determining active file IDs: {e}")
+        # Fail safe: return empty set, which will prevent deletion
         return set()

     log.info(f"Found {len(active_file_ids)} active file IDs")

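As a self-contained illustration of the two UUID patterns this hunk relies on, the snippet below runs them against an invented chat payload; only the regexes are taken from the diff, the sample data and names are made up.

    import json
    import re

    # Same two patterns the hunk compiles: a bare "id": "<uuid>" field and an /api/v1/files/<uuid> URL.
    FILE_ID_PATTERN = re.compile(
        r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
    )
    URL_PATTERN = re.compile(
        r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
    )

    # Invented sample chat payload, for demonstration only.
    chat = {
        "messages": [
            {"files": [{"id": "11111111-2222-3333-4444-555555555555", "type": "file"}]},
            {"content": "see /api/v1/files/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee/content"},
        ]
    }

    chat_json_str = json.dumps(chat)
    candidates = set(FILE_ID_PATTERN.findall(chat_json_str)) | set(URL_PATTERN.findall(chat_json_str))
    print(candidates)  # both UUIDs above, in arbitrary set order

In prune.py each candidate is then checked with Files.get_file_by_id() so that IDs which merely look like file UUIDs are not treated as active files.
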
@@ -177,15 +236,19 @@ def safe_delete_vector_collection(collection_name: str) -> bool:
     Safely delete a vector collection, handling both logical and physical cleanup.
     """
     try:
+        # First, try to delete the collection through the client
         try:
             VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
+            log.debug(f"Deleted collection from vector DB: {collection_name}")
         except Exception as e:
             log.debug(f"Collection {collection_name} may not exist in DB: {e}")

+        # Then, handle physical cleanup for ChromaDB
         if "chroma" in VECTOR_DB.lower():
             vector_dir = Path(CACHE_DIR).parent / "vector_db" / collection_name
             if vector_dir.exists() and vector_dir.is_dir():
                 shutil.rmtree(vector_dir)
+                log.debug(f"Deleted physical vector directory: {vector_dir}")
                 return True

         return True

@@ -200,14 +263,19 @@ def safe_delete_file_by_id(file_id: str) -> bool:
     Safely delete a file record and its associated vector collection.
     """
     try:
+        # Get file info before deletion
         file_record = Files.get_file_by_id(file_id)
         if not file_record:
-            return True
+            log.debug(f"File {file_id} not found in database")
+            return True  # Already gone

+        # Delete vector collection first
         collection_name = f"file-{file_id}"
         safe_delete_vector_collection(collection_name)

+        # Delete database record
         Files.delete_file_by_id(file_id)
+        log.debug(f"Deleted file record: {file_id}")

         return True

@@ -222,6 +290,7 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
     """
     upload_dir = Path(CACHE_DIR).parent / "uploads"
     if not upload_dir.exists():
+        log.debug("Uploads directory does not exist")
         return

     deleted_count = 0

@@ -232,27 +301,33 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
             continue

         filename = file_path.name

+        # Extract file ID from filename (common patterns)
         file_id = None

-        # Extract file ID from filename patterns
+        # Pattern 1: UUID_filename or UUID-filename
         if len(filename) > 36:
             potential_id = filename[:36]
-            if potential_id.count('-') == 4:
+            if potential_id.count("-") == 4:  # UUID format
                 file_id = potential_id

-        if not file_id and filename.count('-') == 4 and len(filename) == 36:
+        # Pattern 2: filename might be the file ID itself
+        if not file_id and filename.count("-") == 4 and len(filename) == 36:
             file_id = filename

+        # Pattern 3: Check if any part of filename matches active IDs
         if not file_id:
             for active_id in active_file_ids:
                 if active_id in filename:
                     file_id = active_id
                     break

+        # If we found a potential file ID and it's not active, delete it
         if file_id and file_id not in active_file_ids:
             try:
                 file_path.unlink()
                 deleted_count += 1
+                log.debug(f"Deleted orphaned upload file: {filename}")
             except Exception as e:
                 log.error(f"Failed to delete upload file {filename}: {e}")

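A toy restatement of the three filename heuristics this hunk documents; the helper name and the sample filenames are invented for illustration and are not part of prune.py.

    from typing import Optional, Set

    def guess_file_id(filename: str, active_file_ids: Set[str]) -> Optional[str]:
        # Pattern 1: "<uuid>_original-name.pdf" style prefixes
        if len(filename) > 36 and filename[:36].count("-") == 4:
            return filename[:36]
        # Pattern 2: the filename is the bare UUID itself
        if len(filename) == 36 and filename.count("-") == 4:
            return filename
        # Pattern 3: any known active ID embedded somewhere in the name
        for active_id in active_file_ids:
            if active_id in filename:
                return active_id
        return None

    active = {"11111111-2222-3333-4444-555555555555"}
    print(guess_file_id("11111111-2222-3333-4444-555555555555_report.pdf", active))  # the UUID prefix
    print(guess_file_id("notes.txt", active))  # None -> the upload is left in place
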
@@ -263,7 +338,9 @@ def cleanup_orphaned_uploads(active_file_ids: Set[str]) -> None:
         log.info(f"Deleted {deleted_count} orphaned upload files")


-def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids: Set[str]) -> None:
+def cleanup_orphaned_vector_collections(
+    active_file_ids: Set[str], active_kb_ids: Set[str]
+) -> None:
     """
     Clean up orphaned vector collections by querying ChromaDB metadata.
     """
@@ -272,46 +349,84 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids

     vector_dir = Path(CACHE_DIR).parent / "vector_db"
     if not vector_dir.exists():
+        log.debug("Vector DB directory does not exist")
         return

     chroma_db_path = vector_dir / "chroma.sqlite3"
     if not chroma_db_path.exists():
+        log.debug("ChromaDB metadata file does not exist")
         return

+    # Build expected collection names
     expected_collections = set()

+    # File collections: file-{file_id}
     for file_id in active_file_ids:
         expected_collections.add(f"file-{file_id}")

+    # Knowledge base collections: {kb_id}
     for kb_id in active_kb_ids:
         expected_collections.add(kb_id)

+    log.debug(f"Expected collections to preserve: {expected_collections}")

+    # Query ChromaDB metadata to get the complete mapping chain:
+    # Directory UUID -> Collection ID -> Collection Name
     uuid_to_collection = {}
     try:
         import sqlite3

+        log.debug(f"Attempting to connect to ChromaDB at: {chroma_db_path}")

         with sqlite3.connect(str(chroma_db_path)) as conn:
+            # First, check what tables exist
+            tables = conn.execute(
+                "SELECT name FROM sqlite_master WHERE type='table'"
+            ).fetchall()
+            log.debug(f"ChromaDB tables: {tables}")
+
+            # Check the schema of collections table
+            schema = conn.execute("PRAGMA table_info(collections)").fetchall()
+            log.debug(f"Collections table schema: {schema}")
+
+            # Get Collection ID -> Collection Name mapping
             collection_id_to_name = {}
             cursor = conn.execute("SELECT id, name FROM collections")
             rows = cursor.fetchall()
+            log.debug(f"Raw ChromaDB collections query results: {rows}")

             for row in rows:
                 collection_id, collection_name = row
                 collection_id_to_name[collection_id] = collection_name
+                log.debug(
+                    f"Mapped collection ID {collection_id} -> name {collection_name}"
+                )

-            cursor = conn.execute("SELECT id, collection FROM segments WHERE scope = 'VECTOR'")
+            # Get Directory UUID -> Collection ID mapping from segments table
+            # Only interested in VECTOR segments as those are the actual data directories
+            cursor = conn.execute(
+                "SELECT id, collection FROM segments WHERE scope = 'VECTOR'"
+            )
             segment_rows = cursor.fetchall()
+            log.debug(f"Raw ChromaDB segments query results: {segment_rows}")

             for row in segment_rows:
                 segment_id, collection_id = row
                 if collection_id in collection_id_to_name:
                     collection_name = collection_id_to_name[collection_id]
                     uuid_to_collection[segment_id] = collection_name
+                    log.debug(
+                        f"Mapped directory UUID {segment_id} -> collection {collection_name}"
+                    )

-        log.info(f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata")
+        log.debug(f"Final uuid_to_collection mapping: {uuid_to_collection}")
+        log.info(
+            f"Found {len(uuid_to_collection)} vector segments in ChromaDB metadata"
+        )

     except Exception as e:
         log.error(f"Error reading ChromaDB metadata: {e}")
+        # Fail safe: don't delete anything if we can't read metadata
         return

     deleted_count = 0

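The same directory-UUID-to-collection-name mapping can be inspected by hand with the queries this hunk uses; a minimal sketch, assuming a single-node ChromaDB layout where the path below points at the deployment's chroma.sqlite3 (in prune.py it is derived from Path(CACHE_DIR).parent / "vector_db").

    import sqlite3
    from pathlib import Path

    # Assumed path for illustration; adjust to your data directory.
    chroma_db_path = Path("data/vector_db/chroma.sqlite3")

    with sqlite3.connect(str(chroma_db_path)) as conn:
        # Collection ID -> human-readable collection name (e.g. "file-<uuid>" or a knowledge base ID)
        id_to_name = dict(conn.execute("SELECT id, name FROM collections").fetchall())
        # Directory UUID (segment id) -> owning collection, restricted to VECTOR segments,
        # since those correspond to the on-disk directories the pruner may delete.
        for segment_id, collection_id in conn.execute(
            "SELECT id, collection FROM segments WHERE scope = 'VECTOR'"
        ):
            print(segment_id, "->", id_to_name.get(collection_id, "<unknown>"))
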
@@ -323,12 +438,16 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids

             dir_uuid = collection_dir.name

-            if dir_uuid.startswith('.'):
+            # Skip system/metadata files
+            if dir_uuid.startswith("."):
                 continue

+            # Get the actual collection name from metadata
             collection_name = uuid_to_collection.get(dir_uuid)

             if collection_name is None:
+                # Directory exists but no metadata entry - it's orphaned
+                log.debug(f"Directory {dir_uuid} has no metadata entry, deleting")
                 try:
                     shutil.rmtree(collection_dir)
                     deleted_count += 1
@@ -336,12 +455,20 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
                     log.error(f"Failed to delete orphaned directory {dir_uuid}: {e}")

             elif collection_name not in expected_collections:
+                # Collection exists but should be deleted
+                log.debug(
+                    f"Collection {collection_name} (UUID: {dir_uuid}) is orphaned, deleting"
+                )
                 try:
                     shutil.rmtree(collection_dir)
                     deleted_count += 1
                 except Exception as e:
                     log.error(f"Failed to delete collection directory {dir_uuid}: {e}")

+            else:
+                # Collection should be preserved
+                log.debug(f"Preserving collection {collection_name} (UUID: {dir_uuid})")

     except Exception as e:
         log.error(f"Error cleaning vector collections: {e}")

@@ -349,50 +476,6 @@ def cleanup_orphaned_vector_collections(active_file_ids: Set[str], active_kb_ids
         log.info(f"Deleted {deleted_count} orphaned vector collections")


-def cleanup_audio_cache(max_age_days: Optional[int] = 30) -> None:
-    """
-    Clean up audio cache files older than specified days.
-    """
-    if max_age_days is None:
-        log.info("Skipping audio cache cleanup (max_age_days is None)")
-        return
-
-    cutoff_time = time.time() - (max_age_days * 86400)
-    deleted_count = 0
-    total_size_deleted = 0
-
-    audio_dirs = [
-        Path(CACHE_DIR) / "audio" / "speech",
-        Path(CACHE_DIR) / "audio" / "transcriptions"
-    ]
-
-    for audio_dir in audio_dirs:
-        if not audio_dir.exists():
-            continue
-
-        try:
-            for file_path in audio_dir.iterdir():
-                if not file_path.is_file():
-                    continue
-
-                file_mtime = file_path.stat().st_mtime
-                if file_mtime < cutoff_time:
-                    try:
-                        file_size = file_path.stat().st_size
-                        file_path.unlink()
-                        deleted_count += 1
-                        total_size_deleted += file_size
-                    except Exception as e:
-                        log.error(f"Failed to delete audio file {file_path}: {e}")
-
-        except Exception as e:
-            log.error(f"Error cleaning audio directory {audio_dir}: {e}")
-
-    if deleted_count > 0:
-        size_mb = total_size_deleted / (1024 * 1024)
-        log.info(f"Deleted {deleted_count} audio cache files ({size_mb:.1f} MB), older than {max_age_days} days")
-
-
 @router.post("/", response_model=bool)
 async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
     """
@@ -424,28 +507,44 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         - If True: Delete notes from deleted users
     - delete_orphaned_folders: bool = True
         - If True: Delete folders from deleted users
-    - audio_cache_max_age_days: Optional[int] = 30
-        - If None: Skip audio cache cleanup
-        - If >= 0: Delete audio cache files (TTS, STT) older than specified days
     """
     try:
         log.info("Starting data pruning process")

-        # Stage 1: Delete old chats based on user criteria
+        # Stage 1: Delete old chats based on user criteria (optional)
         if form_data.days is not None:
             cutoff_time = int(time.time()) - (form_data.days * 86400)
             chats_to_delete = []

             for chat in Chats.get_chats():
                 if chat.updated_at < cutoff_time:
+                    # Check exemption conditions
                     if form_data.exempt_archived_chats and chat.archived:
+                        log.debug(f"Exempting archived chat: {chat.id}")
                         continue
-                    if form_data.exempt_chats_in_folders and (getattr(chat, 'folder_id', None) is not None or getattr(chat, 'pinned', False)):
+                    if form_data.exempt_chats_in_folders and (
+                        getattr(chat, "folder_id", None) is not None
+                        or getattr(chat, "pinned", False)
+                    ):
+                        folder_status = (
+                            f"folder_id: {getattr(chat, 'folder_id', None)}"
+                            if getattr(chat, "folder_id", None)
+                            else "not in folder"
+                        )
+                        pinned_status = f"pinned: {getattr(chat, 'pinned', False)}"
+                        log.debug(
+                            f"Exempting chat in folder or pinned: {chat.id} ({folder_status}, {pinned_status})"
+                        )
                         continue
+                    log.debug(
+                        f"Chat {chat.id} will be deleted - archived: {getattr(chat, 'archived', False)}, folder_id: {getattr(chat, 'folder_id', None)}, pinned: {getattr(chat, 'pinned', False)}"
+                    )
                     chats_to_delete.append(chat)

             if chats_to_delete:
-                log.info(f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)")
+                log.info(
+                    f"Deleting {len(chats_to_delete)} old chats (older than {form_data.days} days)"
+                )
                 for chat in chats_to_delete:
                     Chats.delete_chat_by_id(chat.id)
             else:
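Condensed, illustrative restatement of the Stage 1 retention rule shown above (not the code the commit adds), mainly to make the cutoff arithmetic explicit.

    import time

    def chat_is_prunable(chat, days: int, exempt_archived: bool, exempt_in_folders: bool) -> bool:
        # A chat is pruned when it is older than the retention window
        # and is not covered by one of the two exemptions.
        cutoff_time = int(time.time()) - days * 86400  # e.g. days=90 -> 7,776,000 seconds
        if chat.updated_at >= cutoff_time:
            return False
        if exempt_archived and chat.archived:
            return False
        if exempt_in_folders and (
            getattr(chat, "folder_id", None) is not None or getattr(chat, "pinned", False)
        ):
            return False
        return True
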
@@ -453,12 +552,14 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         else:
             log.info("Skipping chat deletion (days parameter is None)")

-        # Stage 2: Build preservation set
+        # Stage 2: Build ground truth of what should be preserved
         log.info("Building preservation set")

+        # Get all active users
         active_user_ids = {user.id for user in Users.get_users()["users"]}
         log.info(f"Found {len(active_user_ids)} active users")

+        # Get all active knowledge bases and their file references
         active_kb_ids = set()
         knowledge_bases = Knowledges.get_knowledge_bases()

@@ -468,16 +569,18 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):

         log.info(f"Found {len(active_kb_ids)} active knowledge bases")

+        # Get all files that should be preserved (NOW COMPREHENSIVE!)
         active_file_ids = get_active_file_ids()

         # Stage 3: Delete orphaned database records
         log.info("Deleting orphaned database records")

+        # Delete files not referenced by any knowledge base or belonging to deleted users
         deleted_files = 0
         for file_record in Files.get_files():
             should_delete = (
-                file_record.id not in active_file_ids or
-                file_record.user_id not in active_user_ids
+                file_record.id not in active_file_ids
+                or file_record.user_id not in active_user_ids
             )

             if should_delete:
@@ -487,6 +590,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         if deleted_files > 0:
             log.info(f"Deleted {deleted_files} orphaned files")

+        # Delete knowledge bases from deleted users (if enabled)
         deleted_kbs = 0
         if form_data.delete_orphaned_knowledge_bases:
             for kb in knowledge_bases:
@@ -500,8 +604,10 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         else:
             log.info("Skipping knowledge base deletion (disabled)")

+        # Delete other user-owned resources from deleted users (conditional)
         deleted_others = 0

+        # Delete orphaned chats of deleted users (conditional)
         if form_data.delete_orphaned_chats:
             chats_deleted = 0
             for chat in Chats.get_chats():
@@ -514,6 +620,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         else:
             log.info("Skipping orphaned chat deletion (disabled)")

+        # Delete orphaned tools of deleted users (conditional)
         if form_data.delete_orphaned_tools:
             tools_deleted = 0
             for tool in Tools.get_tools():
@@ -526,6 +633,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         else:
             log.info("Skipping tool deletion (disabled)")

+        # Delete orphaned functions of deleted users (conditional)
         if form_data.delete_orphaned_functions:
             functions_deleted = 0
             for function in Functions.get_functions():
@@ -538,6 +646,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         else:
             log.info("Skipping function deletion (disabled)")

+        # Delete orphaned notes of deleted users (conditional)
         if form_data.delete_orphaned_notes:
             notes_deleted = 0
             for note in Notes.get_notes():
@@ -550,6 +659,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         else:
             log.info("Skipping note deletion (disabled)")

+        # Delete orphaned prompts of deleted users (conditional)
         if form_data.delete_orphaned_prompts:
             prompts_deleted = 0
             for prompt in Prompts.get_prompts():
@@ -562,6 +672,7 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         else:
             log.info("Skipping prompt deletion (disabled)")

+        # Delete orphaned models of deleted users (conditional)
         if form_data.delete_orphaned_models:
             models_deleted = 0
             for model in Models.get_all_models():
@@ -574,11 +685,14 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         else:
             log.info("Skipping model deletion (disabled)")

+        # Delete orphaned folders of deleted users (conditional)
         if form_data.delete_orphaned_folders:
             folders_deleted = 0
             for folder in Folders.get_all_folders():
                 if folder.user_id not in active_user_ids:
-                    Folders.delete_folder_by_id_and_user_id(folder.id, folder.user_id, delete_chats=False)
+                    Folders.delete_folder_by_id_and_user_id(
+                        folder.id, folder.user_id, delete_chats=False
+                    )
                     folders_deleted += 1
                     deleted_others += 1
             if folders_deleted > 0:
@@ -592,32 +706,37 @@ async def prune_data(form_data: PruneDataForm, user=Depends(get_admin_user)):
         # Stage 4: Clean up orphaned physical files
         log.info("Cleaning up orphaned physical files")

+        # Rebuild active sets after database cleanup
         final_active_file_ids = get_active_file_ids()
         final_active_kb_ids = {kb.id for kb in Knowledges.get_knowledge_bases()}

+        # Clean uploads directory
         cleanup_orphaned_uploads(final_active_file_ids)

+        # Clean vector collections
         cleanup_orphaned_vector_collections(final_active_file_ids, final_active_kb_ids)

-        # Stage 5: Audio cache cleanup
-        log.info("Cleaning audio cache")
-        cleanup_audio_cache(form_data.audio_cache_max_age_days)
-
-        # Stage 6: Database optimization
+        # Stage 5: Database optimization
         log.info("Optimizing database")

+        # Vacuum main database
         try:
             with get_db() as db:
                 db.execute(text("VACUUM"))
+                log.debug("Vacuumed main database")
         except Exception as e:
             log.error(f"Failed to vacuum main database: {e}")

+        # Vacuum ChromaDB database if it exists
         if "chroma" in VECTOR_DB.lower():
             chroma_db_path = Path(CACHE_DIR).parent / "vector_db" / "chroma.sqlite3"
             if chroma_db_path.exists():
                 try:
                     import sqlite3

                     with sqlite3.connect(str(chroma_db_path)) as conn:
                         conn.execute("VACUUM")
+                        log.debug("Vacuumed ChromaDB database")
                 except Exception as e:
                     log.error(f"Failed to vacuum ChromaDB database: {e}")