mirror of
https://github.com/open-webui/open-webui.git
synced 2026-01-03 23:25:21 +00:00
fix: consolidate psql cleanup logic and fix web add with cleanup (#20072)
* sequential * consolidate logic and fix for web add * Update WebSearch.svelte * Update retrieval.py * Update retrieval.py * Update WebSearch.svelte
This commit is contained in:
parent
5077676d33
commit
48ccb1e170
3 changed files with 33 additions and 15 deletions
|
|
@ -7,6 +7,7 @@ from typing import Optional
|
|||
from open_webui.internal.db import Base, get_db
|
||||
from open_webui.models.tags import TagModel, Tag, Tags
|
||||
from open_webui.models.folders import Folders
|
||||
from open_webui.utils.misc import sanitize_data_for_db, sanitize_text_for_db
|
||||
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy import BigInteger, Boolean, Column, String, Text, JSON, Index
|
||||
|
|
@ -169,18 +170,8 @@ class ChatUsageStatsListResponse(BaseModel):
|
|||
|
||||
class ChatTable:
|
||||
def _clean_null_bytes(self, obj):
    """Recursively sanitize every string inside a dict/list structure.

    This is a thin wrapper around ``sanitize_data_for_db`` so the chat
    table shares a single cleanup implementation instead of carrying its
    own recursive copy. The previous inline version replaced ``"\\x00"``
    and ``"\\u0000"`` separately even though both literals denote the same
    character, and it returned before the shared helper was ever reached.

    Args:
        obj: A string, dict, list, or any other value. Values that are not
            one of those three types are returned unchanged by the helper.

    Returns:
        The sanitized counterpart of ``obj`` as produced by
        ``sanitize_data_for_db``.
    """
    return sanitize_data_for_db(obj)
|
||||
|
||||
def _sanitize_chat_row(self, chat_item):
|
||||
"""
|
||||
|
|
@ -351,7 +342,7 @@ class ChatTable:
|
|||
|
||||
# Sanitize message content for null characters before upserting
|
||||
if isinstance(message.get("content"), str):
|
||||
message["content"] = message["content"].replace("\x00", "")
|
||||
message["content"] = sanitize_text_for_db(message["content"])
|
||||
|
||||
chat = chat.chat
|
||||
history = chat.get("history", {})
|
||||
|
|
@ -771,7 +762,7 @@ class ChatTable:
|
|||
"""
|
||||
Filters chats based on a search query using Python, allowing pagination using skip and limit.
|
||||
"""
|
||||
search_text = search_text.replace("\u0000", "").lower().strip()
|
||||
search_text = sanitize_text_for_db(search_text).lower().strip()
|
||||
|
||||
if not search_text:
|
||||
return self.get_chat_list_by_user_id(
|
||||
|
|
|
|||
|
|
@ -87,6 +87,7 @@ from open_webui.retrieval.utils import (
|
|||
from open_webui.retrieval.vector.utils import filter_metadata
|
||||
from open_webui.utils.misc import (
|
||||
calculate_sha256_string,
|
||||
sanitize_text_for_db,
|
||||
)
|
||||
from open_webui.utils.auth import get_admin_user, get_verified_user
|
||||
|
||||
|
|
@ -1378,7 +1379,7 @@ def save_docs_to_vector_db(
|
|||
if len(docs) == 0:
|
||||
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
|
||||
|
||||
texts = [doc.page_content for doc in docs]
|
||||
texts = [sanitize_text_for_db(doc.page_content) for doc in docs]
|
||||
metadatas = [
|
||||
{
|
||||
**doc.metadata,
|
||||
|
|
|
|||
|
|
@ -373,6 +373,32 @@ def sanitize_filename(file_name):
|
|||
return final_file_name
|
||||
|
||||
|
||||
def sanitize_text_for_db(text: str) -> str:
    """Strip null bytes and lone surrogate code points from ``text``.

    The cleaned string is intended for PostgreSQL storage. Anything that is
    not a ``str`` is returned unchanged.
    """
    if isinstance(text, str):
        # PostgreSQL text fields cannot hold \x00, so drop it first.
        without_nul = text.replace("\x00", "")
        try:
            # Round-trip through UTF-8: "surrogatepass" lets surrogates be
            # encoded at all, and the lenient decode then discards the
            # resulting invalid byte sequences.
            encoded = without_nul.encode("utf-8", errors="surrogatepass")
            return encoded.decode("utf-8", errors="ignore")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Best effort: fall back to the NUL-stripped text.
            return without_nul
    return text
|
||||
|
||||
|
||||
def sanitize_data_for_db(obj):
    """Return ``obj`` with every contained string cleaned for database storage.

    Dict values and list items are walked recursively, and strings are
    passed through ``sanitize_text_for_db``. Dict keys and values of any
    other type are returned as they are.
    """
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            cleaned[key] = sanitize_data_for_db(value)
        return cleaned
    if isinstance(obj, list):
        return list(map(sanitize_data_for_db, obj))
    if isinstance(obj, str):
        return sanitize_text_for_db(obj)
    return obj
|
||||
|
||||
|
||||
def extract_folders_after_data_docs(path):
|
||||
# Convert the path to a Path object if it's not already
|
||||
path = Path(path)
|
||||
|
|
|
|||
Loading…
Reference in a new issue