Update prune.py

Classic298 2025-08-22 17:02:36 +02:00 committed by GitHub
parent 4c7e6bd752
commit b5d93ae3db

@@ -35,6 +35,55 @@ log.setLevel(SRC_LOG_LEVELS["MODELS"])
 
 router = APIRouter()
 
+
+class JSONFileIDExtractor:
+    """
+    Utility for extracting and validating file IDs from JSON content.
+
+    Replaces duplicated regex compilation and validation logic used throughout
+    the file scanning functions. Compiles patterns once for better performance.
+    """
+
+    # Compile patterns once at class level for performance
+    _FILE_ID_PATTERN = re.compile(
+        r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
+    )
+    _URL_PATTERN = re.compile(
+        r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
+    )
+
+    @classmethod
+    def extract_and_validate_file_ids(cls, json_string: str) -> Set[str]:
+        """
+        Extract file IDs from JSON string and validate they exist in database.
+
+        Args:
+            json_string: JSON content as string (or any string to scan)
+
+        Returns:
+            Set of validated file IDs that exist in the Files table
+
+        Note:
+            This method replaces the repeated pattern of:
+            1. Compiling the same regex patterns
+            2. Extracting potential IDs
+            3. Validating each ID exists via Files.get_file_by_id()
+            4. Building a set of validated IDs
+        """
+        validated_ids = set()
+
+        # Extract potential IDs using both patterns
+        potential_ids = []
+        potential_ids.extend(cls._FILE_ID_PATTERN.findall(json_string))
+        potential_ids.extend(cls._URL_PATTERN.findall(json_string))
+
+        # Validate each ID exists in database
+        for file_id in potential_ids:
+            if Files.get_file_by_id(file_id):
+                validated_ids.add(file_id)
+
+        return validated_ids
+
+
 class PruneDataForm(BaseModel):
     days: Optional[int] = None
     exempt_archived_chats: bool = False
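
For context, here is a minimal standalone sketch of the behavior this hunk introduces. The Files stub, the KNOWN_IDS set, and the sample UUIDs are illustrative assumptions, not part of prune.py or its real database layer:

# Standalone sketch; the Files stub and sample UUIDs are assumptions for
# illustration only, not the real Files table accessor from prune.py.
import json
import re
from typing import Optional, Set

KNOWN_IDS = {"11111111-2222-3333-4444-555555555555"}


class Files:
    # Stub standing in for the real database accessor.
    @staticmethod
    def get_file_by_id(file_id: str) -> Optional[str]:
        return file_id if file_id in KNOWN_IDS else None


class JSONFileIDExtractor:
    _FILE_ID_PATTERN = re.compile(
        r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
    )
    _URL_PATTERN = re.compile(
        r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
    )

    @classmethod
    def extract_and_validate_file_ids(cls, json_string: str) -> Set[str]:
        potential_ids = cls._FILE_ID_PATTERN.findall(json_string)
        potential_ids += cls._URL_PATTERN.findall(json_string)
        # Only IDs that resolve in the (stubbed) Files table are kept.
        return {fid for fid in potential_ids if Files.get_file_by_id(fid)}


chat = {
    "messages": [
        {"files": [{"id": "11111111-2222-3333-4444-555555555555"}]},
        {"content": "see /api/v1/files/99999999-8888-7777-6666-555555555555"},
    ]
}
print(JSONFileIDExtractor.extract_and_validate_file_ids(json.dumps(chat)))
# -> {'11111111-2222-3333-4444-555555555555'}; the second ID fails validation.

Both patterns capture the same canonical 8-4-4-4-12 hex UUID shape, so an ID is picked up whether it appears as a JSON "id" field or embedded in a /api/v1/files/ URL.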
@@ -354,22 +403,9 @@ def get_active_file_ids() -> Set[str]:
         try:
             chat_json_str = json.dumps(chat.chat)
 
-            # Extract file IDs using regex patterns
-            file_id_pattern = re.compile(
-                r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
-            )
-            url_pattern = re.compile(
-                r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
-            )
-
-            potential_file_ids = file_id_pattern.findall(chat_json_str)
-            url_file_ids = url_pattern.findall(chat_json_str)
-            all_potential_ids = set(potential_file_ids + url_file_ids)
-
-            for file_id in all_potential_ids:
-                if Files.get_file_by_id(file_id):
-                    active_file_ids.add(file_id)
+            # Use utility to extract and validate file IDs
+            validated_ids = JSONFileIDExtractor.extract_and_validate_file_ids(chat_json_str)
+            active_file_ids.update(validated_ids)
 
         except Exception as e:
             log.debug(f"Error processing chat {chat.id} for file references: {e}")
@@ -382,38 +418,18 @@ def get_active_file_ids() -> Set[str]:
         if folder.items:
             try:
                 items_str = json.dumps(folder.items)
-                file_id_pattern = re.compile(
-                    r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
-                )
-                url_pattern = re.compile(
-                    r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
-                )
-                potential_ids = file_id_pattern.findall(
-                    items_str
-                ) + url_pattern.findall(items_str)
-
-                for file_id in potential_ids:
-                    if Files.get_file_by_id(file_id):
-                        active_file_ids.add(file_id)
+                # Use utility to extract and validate file IDs
+                validated_ids = JSONFileIDExtractor.extract_and_validate_file_ids(items_str)
+                active_file_ids.update(validated_ids)
             except Exception as e:
                 log.debug(f"Error processing folder {folder.id} items: {e}")
 
         if hasattr(folder, "data") and folder.data:
             try:
                 data_str = json.dumps(folder.data)
-                file_id_pattern = re.compile(
-                    r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
-                )
-                url_pattern = re.compile(
-                    r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
-                )
-                potential_ids = file_id_pattern.findall(
-                    data_str
-                ) + url_pattern.findall(data_str)
-
-                for file_id in potential_ids:
-                    if Files.get_file_by_id(file_id):
-                        active_file_ids.add(file_id)
+                # Use utility to extract and validate file IDs
+                validated_ids = JSONFileIDExtractor.extract_and_validate_file_ids(data_str)
+                active_file_ids.update(validated_ids)
             except Exception as e:
                 log.debug(f"Error processing folder {folder.id} data: {e}")
@@ -435,20 +451,9 @@ def get_active_file_ids() -> Set[str]:
                         if isinstance(message_data_json, dict)
                         else str(message_data_json)
                     )
-
-                    file_id_pattern = re.compile(
-                        r'"id":\s*"([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"'
-                    )
-                    url_pattern = re.compile(
-                        r"/api/v1/files/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
-                    )
-                    potential_ids = file_id_pattern.findall(
-                        data_str
-                    ) + url_pattern.findall(data_str)
-
-                    for file_id in potential_ids:
-                        if Files.get_file_by_id(file_id):
-                            active_file_ids.add(file_id)
+
+                    # Use utility to extract and validate file IDs
+                    validated_ids = JSONFileIDExtractor.extract_and_validate_file_ids(data_str)
+                    active_file_ids.update(validated_ids)
                 except Exception as e:
                     log.debug(
                         f"Error processing message {message_id} data: {e}"
                     )