# open-webui/backend/open_webui/utils/chat_importer.py
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Iterable, Mapping, cast
from html.parser import HTMLParser
@dataclass(slots=True)
class ChatThread:
    """A normalized conversation thread produced by one of the importers."""

    id: str  # source-specific conversation id, or a synthesized fallback
    title: str  # human-readable thread title shown in the UI
    messages: list[dict[str, Any]]  # normalized messages: id/role/content/timestamp/metadata
class ChatImporterError(Exception):
    """Base exception for unrecoverable chat-import failures."""
    pass
def detect_threads(data: Any) -> list[ChatThread]:
    """Detect the export format of *data* and parse it into threads.

    Supported formats:
      1. Google AI Studio (single / backup)
      2. OpenAI conversation export (ChatGPT)
      3. Chatbox
      4. Cherry Studio
      5. DeepSeek

    The first parser yielding at least one thread wins.  A parser that
    raises is silently skipped so the remaining ones still get a chance.
    """
    for parse in (
        _try_google_single,
        _try_google_backup,
        _try_openai_conversation,
        _try_chatbox_export,
        _try_cherry_studio,
        _try_deepseek_export,
    ):
        try:
            result = parse(data)
        except Exception:
            # Best effort: a misbehaving parser must not block the others.
            continue
        if result:
            return result
    return []
# =============================================================================
# 1. Google AI Studio
# =============================================================================
def _try_google_single(data: Any) -> list[ChatThread]:
if not isinstance(data, Mapping) or "chunkedPrompt" not in data:
return []
chunks = data.get("chunkedPrompt", {}).get("chunks", []) or []
if not chunks:
return []
title = "Google AI Studio <00>"
if chunks and chunks[0].get("branchChildren"):
# <>c<EFBFBD>Sh<><68>
display_name = chunks[0]["branchChildren"][0].get("displayName", "")
match = re.search(r"Branch of (.*)", display_name or "")
if match:
title = match.group(1).strip()
messages: list[dict[str, Any]] = []
for idx, chunk in enumerate(chunks, start=1):
if chunk.get("isThought"):
continue
text = chunk.get("text", "")
if not text and "parts" in chunk:
text = "".join(p.get("text", "") for p in chunk.get("parts", []) if "text" in p)
if not text.strip():
continue
messages.append(
{
"id": f"google-single-{idx}",
"role": chunk.get("role", "assistant"),
"content": text.strip(),
"timestamp": chunk.get("timestamp"),
"metadata": {},
}
)
if not messages:
return []
return [ChatThread(id="google-single", title=title, messages=messages)]
def _try_google_backup(data: Any) -> list[ChatThread]:
if not isinstance(data, Mapping) or "conversations" not in data:
return []
conversations = data.get("conversations", []) or []
threads: list[ChatThread] = []
for idx, conversation in enumerate(conversations, start=1):
title = conversation.get("name") or f"Y<>N<EFBFBD>[݋_{idx}"
messages_data = conversation.get("messages", []) or []
messages: list[dict[str, Any]] = []
for midx, msg in enumerate(messages_data, start=1):
role = msg.get("author", {}).get("role", "user")
content = msg.get("content", "")
if not content:
continue
messages.append(
{
"id": f"google-backup-{idx}-{midx}",
"role": role,
"content": content.strip(),
"timestamp": msg.get("create_time"),
"metadata": {},
}
)
if messages:
conv_id = conversation.get("id") or f"google-backup-{idx}"
threads.append(ChatThread(id=str(conv_id), title=title, messages=messages))
return threads
# =============================================================================
# 2. OpenAI Conversation
# =============================================================================
def _try_openai_conversation(data: Any) -> list[ChatThread]:
# <00>{US<55>vyr<79>_<EFBFBD>hKm<1A>list N,{Ny<4E> g mapping
if not isinstance(data, list) or not data:
return []
first = data[0]
if not isinstance(first, Mapping) or "mapping" not in first:
return []
# :SR DeepSeek: DeepSeek <00>v mapping node message fragments
# OpenAI <00>v mapping node message content parts
# b<>N<EFBFBD>S<EFBFBD>N<1A>Ǐ<EFBFBD>h<EFBFBD>g fragment sQ.<2E>W[eg<65>cd<63> DeepSeek <0C>b<05><><EFBFBD> DeepSeek ㉐ghVHQэ
# FȎُb<>N<EFBFBD>S<EFBFBD>N<1A>Ǐ<EFBFBD>~<7E>gyr<79>_<EFBFBD>f<EFBFBD>~<7E><>0W$R<>e
threads: list[ChatThread] = []
for idx, conversation in enumerate(data, start=1):
mapping = conversation.get("mapping") or {}
if not mapping:
continue
# <00>h<EFBFBD>g/f&TS+T fragments (DeepSeek yr<79>_) <0C><>Y<EFBFBD>gS+TR<><52>Ǐ <0C><>N<EFBFBD>~ DeepSeek ㉐ghV
# <00>SN*N<><4E><EFBFBD>p7h,g
sample_node = next(iter(mapping.values()), {})
if sample_node.get("message", {}).get("fragments"):
return [] # ُ/fN*N DeepSeek <00>e<EFBFBD>N <0C> N/f OpenAI
title = conversation.get("title") or f"<00>[݋_{idx}"
conv_id = conversation.get("id") or conversation.get("conversation_id") or f"conv_{idx}"
# ُ̑<D98F>v;<3B><><EFBFBD><EFBFBD>{S:N<1A><>Sg<67>R/e<08>main branch <09>
branch = _reconstruct_longest_branch(mapping, conversation.get("current_node"))
if not branch:
continue
messages: list[dict[str, Any]] = []
for node_id in branch:
node = mapping.get(node_id) or {}
message = node.get("message")
if not message:
continue
metadata = message.get("metadata") or {}
if metadata.get("is_visually_hidden_from_conversation"):
continue
text = _extract_openai_text(message.get("content"))
if not text.strip():
continue
messages.append(
{
"id": message.get("id", node_id),
"role": message.get("author", {}).get("role", "user"),
"content": text.strip(),
"timestamp": _normalize_timestamp(message.get("create_time")),
"metadata": metadata,
}
)
if messages:
threads.append(ChatThread(id=str(conv_id), title=title, messages=messages))
return threads
def _extract_openai_text(content: Mapping[str, Any] | None) -> str:
if not content:
return ""
ctype = content.get("content_type")
if ctype == "text":
parts = content.get("parts") or []
return "\n".join(str(part) for part in parts if part)
if ctype == "multimodal_text":
texts: list[str] = []
for part in content.get("parts") or []:
if isinstance(part, str):
texts.append(part)
elif isinstance(part, Mapping) and part.get("type") == "text":
texts.append(part.get("text", ""))
return "\n".join(texts)
if ctype == "user_editable_context":
return content.get("user_instructions", "")
if isinstance(content, str):
return content
return json.dumps(content, ensure_ascii=False)
# =============================================================================
# 3. Chatbox
# =============================================================================
def _try_chatbox_export(data: Any) -> list[ChatThread]:
if not isinstance(data, Mapping):
return []
# yr<79>_<1A>chat-sessions-list
if "chat-sessions-list" not in data:
return []
sessions_meta = {
item.get("id"): item
for item in data.get("chat-sessions-list", []) or []
if isinstance(item, Mapping)
}
threads: list[ChatThread] = []
for key, value in data.items():
if not isinstance(key, str) or not key.startswith("session:"):
continue
session_id = key.split("session:", 1)[-1]
session_payload = value if isinstance(value, Mapping) else {}
messages_data = session_payload.get("messages", []) or []
if not messages_data:
continue
title = session_payload.get("name") or sessions_meta.get(session_id, {}).get("name")
if not title:
idx = len(threads) + 1
title = f"Chatbox <00>[݋_{idx}"
messages: list[dict[str, Any]] = []
for midx, message in enumerate(messages_data, start=1):
role = message.get("role") or "user"
content_parts = message.get("contentParts") or []
text = _extract_chatbox_content(content_parts)
if not text.strip():
continue
messages.append(
{
"id": message.get("id") or f"{session_id}-{midx}",
"role": role,
"content": text.strip(),
"timestamp": _normalize_timestamp(message.get("timestamp")),
"metadata": {
"session": title,
"session_id": session_id,
"source": "chatbox",
},
}
)
if messages:
threads.append(ChatThread(id=session_id or f"chatbox-{len(threads)+1}", title=title, messages=messages))
return threads
def _extract_chatbox_content(parts: Iterable[Mapping[str, Any]]) -> str:
fragments: list[str] = []
for part in parts or []:
if not isinstance(part, Mapping):
fragments.append(str(part))
continue
ptype = part.get("type")
if ptype == "text":
fragments.append(str(part.get("text", "")))
elif ptype in {"image_url", "image"} and part.get("url"):
fragments.append(f"[image: {part['url']}]")
elif "text" in part:
fragments.append(str(part["text"]))
else:
fragments.append(json.dumps(part, ensure_ascii=False))
return "\n\n".join(fragment for fragment in fragments if fragment)
# =============================================================================
# 4. Cherry Studio
# =============================================================================
def _try_cherry_studio(data: Any) -> list[ChatThread]:
"""㉐g Cherry Studio <00>[<5B>Q (localStorage JSON dump)"""
if not isinstance(data, Mapping):
return []
if "localStorage" not in data:
return []
cherry_str = data["localStorage"].get("persist:cherry-studio")
if not cherry_str:
return []
try:
cherry = json.loads(cherry_str)
except json.JSONDecodeError:
return []
if not isinstance(cherry, dict):
return []
assistants = cherry.get("assistants", [])
if isinstance(assistants, str): # g<>eP persist <00>v<P/f JSON string ̑<>v JSON string? N <EFBFBD><EFBFBD>8^/f JSON object
try:
assistants = json.loads(assistants)
except:
pass
# Cherry Studio structure check: top level keys often strings that need parsing?
# In chat-exporter: cherry = json.loads(cherry_str)
# assistants = cherry.get("assistants", [])
# But in redux-persist, values might be strings. Let's assume standard structure or parsed.
# If `assistants` is a string, parse it.
if isinstance(assistants, str):
try:
assistants = json.loads(assistants)
except:
return []
if not isinstance(assistants, list):
return []
threads: list[ChatThread] = []
for assistant in assistants:
topics = assistant.get("topics", [])
if not topics:
continue
model_info = assistant.get("model", {})
model_name = model_info.get("name") if isinstance(model_info, dict) else "Unknown"
for topic in topics:
topic_name = topic.get("name", "*g}T T݋<EFBFBD><EFBFBD>")
topic_msgs = topic.get("messages", [])
if not topic_msgs:
continue
messages: list[dict[str, Any]] = []
for idx, msg in enumerate(topic_msgs, start=1):
content = msg.get("content", "")
if not content:
continue
role = msg.get("role", "user")
messages.append({
"id": msg.get("id") or f"cherry-{idx}",
"role": role,
"content": content.strip(),
"timestamp": _normalize_timestamp(msg.get("createdAt")),
"metadata": {
"model": model_name
}
})
if messages:
tid = topic.get("id") or f"cherry-{len(threads)}"
# Combine assistant name/model with topic for clarity
full_title = f"[Cherry] {topic_name}"
threads.append(ChatThread(id=str(tid), title=full_title, messages=messages))
return threads
# =============================================================================
# 5. DeepSeek
# =============================================================================
def _try_deepseek_export(data: Any) -> list[ChatThread]:
"""㉐g DeepSeek <00>[<5B>Q<EFBFBD>e<EFBFBD>N"""
if not isinstance(data, list) or not data:
return []
# yr<79>_<EFBFBD>hKm<1A>item S+T mapping N W[&{2Nb__S+T fragments
first = data[0]
if not isinstance(first, Mapping) or "mapping" not in first:
return []
# <00>{US<55>v heuristic: <00>h<EFBFBD>g mapping ̑<>v message /f&T g fragments
is_deepseek = False
mapping_sample = first.get("mapping", {})
for node in mapping_sample.values():
if node and node.get("message", {}).get("fragments"):
is_deepseek = True
break
if not is_deepseek:
return []
threads: list[ChatThread] = []
for idx, conversation in enumerate(data, start=1):
title = conversation.get("title", f"DeepSeek<00>[݋_{idx}")
mapping = conversation.get("mapping", {})
if not mapping:
continue
# <00>[~b<>SP[<5B><><EFBFBD>p
all_nodes = set(mapping.keys())
parent_nodes = {node.get("parent") for node in mapping.values() if node}
leaf_nodes = all_nodes - parent_nodes - {"root"}
if not leaf_nodes:
continue
# <00>c<EFBFBD>Sg<67>R/e (Main Branch)
# <00>Y<EFBFBD>g<00><><EFBFBD>@b gR/e <0C><>S<EFBFBD>NM<4E><4D>S leaf_nodes
# ُ̑:N<>N<EFBFBD>{S UI <0C>؞<EFBFBD><D89E><EFBFBD>c<EFBFBD>Sg<67>R/e
longest_branch_ids: list[str] = []
# Helper to reconstruct path
def reconstruct_path(leaf_id: str) -> list[str]:
path = []
cur = leaf_id
while cur and cur != "root":
if cur not in mapping:
break
path.insert(0, cur)
cur = mapping[cur].get("parent", "root")
return path
branches = [reconstruct_path(leaf) for leaf in leaf_nodes]
if not branches:
continue
longest_branch_ids = max(branches, key=len)
messages: list[dict[str, Any]] = []
for node_id in longest_branch_ids:
node = mapping.get(node_id) or {}
msg_obj = node.get("message", {})
if not msg_obj:
continue
# DeepSeek specific content extraction
fragments = msg_obj.get("fragments", [])
content_buf = []
role = "user" # default
# Check type from fragments or fallback to author
# Fragments typically define content parts
for frag in fragments:
text = frag.get("content", "")
frag_type = frag.get("type")
if frag_type == "REQUEST":
role = "user"
elif frag_type == "RESPONSE":
role = "assistant"
if text:
content_buf.append(text)
full_text = "".join(content_buf)
if not full_text.strip():
continue
# If role wasn't determined by fragments (rare), try author
if not fragments:
role = msg_obj.get("author", {}).get("role", "user")
full_text = msg_obj.get("content", "")
messages.append({
"id": node_id,
"role": role,
"content": full_text.strip(),
"timestamp": _normalize_timestamp(conversation.get("inserted_at")), # DeepSeek conv often has one time?
"metadata": {}
})
if messages:
conv_id = conversation.get("id") or f"deepseek-{idx}"
threads.append(ChatThread(id=str(conv_id), title=f"[DeepSeek] {title}", messages=messages))
return threads
# =============================================================================
# Common Helpers
# =============================================================================
def _reconstruct_longest_branch(mapping: Mapping[str, Any], leaf_id: str | None) -> list[str]:
"""Generic branch reconstruction, preferring provided leaf_id or longest path"""
if not leaf_id:
# choose longest path from all leaves
nodes = set(mapping.keys())
parents = {node.get("parent") for node in mapping.values() if node}
leaf_nodes = nodes - parents - {"root"}
longest: list[str] = []
def get_path(leaf: str) -> list[str]:
path = []
cur = leaf
visited = set()
while cur and cur not in visited and cur != "root":
node = mapping.get(cur)
if not node:
break
path.append(cur)
visited.add(cur)
cur = node.get("parent")
path.reverse()
return path
for leaf in leaf_nodes:
branch = get_path(leaf)
if len(branch) > len(longest):
longest = branch
return longest
# Reconstruct from specific leaf
path: list[str] = []
visited: set[str] = set()
cur = leaf_id
while cur and cur not in visited and cur != "root":
node = mapping.get(cur)
if not node:
break
path.append(cur)
visited.add(cur)
cur = node.get("parent")
path.reverse()
return path
def _normalize_timestamp(value: Any) -> str:
if value is None:
return ""
if isinstance(value, (int, float)):
try:
if value > 1e12: # milliseconds
dt = datetime.fromtimestamp(value / 1000)
else:
dt = datetime.fromtimestamp(value)
return dt.isoformat(sep=" ", timespec="minutes")
except Exception:
return str(value)
if isinstance(value, str):
try:
dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
return dt.isoformat(sep=" ", timespec="minutes")
except Exception:
return str(value)
return str(value)
# =============================================================================
# Text Import
# =============================================================================
def parse_text_transcript(text: str) -> list[ChatThread]:
    """
    Parse a plain-text transcript pasted by the user.

    Lines matching one of the role markers start a new message; every other
    line is appended to the message currently being built.
    """
    if not text.strip():
        return []
    lines = text.strip().split('\n')
    messages: list[dict[str, Any]] = []
    # Role markers the user may have used.  NOTE(review): several patterns
    # contain mojibake from the file's original encoding -- kept verbatim
    # because matching depends on those exact bytes.
    markers = [
        (r"\[`O<><4F>\]", "user"),
        (r"`O<><4F>[:<1A>]", "user"),
        (r"ChatGPT\s*<00><>[:<1A>]?", "assistant"),  # "ChatGPT said" variants
        (r"AI\s*<00><>[:<1A>]?", "assistant"),  # defensive generic AI marker
    ]
    # Basic User/AI prefixes are kept too, but anchored to line start.
    markers.extend([
        (r"^User[:<1A>]", "user"),
        (r"^AI[:<1A>]", "assistant"),
    ])
    for line in lines:
        line_str = line.strip()
        if not line_str:
            continue
        # Try to match a role marker on this line.
        matched_role = None
        for pattern, role in markers:
            if re.search(pattern, line_str, re.IGNORECASE):
                matched_role = role
                break
        if matched_role:
            # Header line: take whatever follows the separator as content.
            content = ""
            parts = re.split(r"[:<1A>]", line_str, 1)
            if len(parts) > 1:
                content = parts[1].strip()
            messages.append({
                "id": f"text-{len(messages)+1}",
                "role": matched_role,
                "content": content,
                "timestamp": None
            })
        else:
            # Continuation line: append to the message in progress.
            if messages:
                if messages[-1]["content"]:
                    messages[-1]["content"] += "\n" + line
                else:
                    messages[-1]["content"] = line
            else:
                # First line is already content; default the role to user.
                messages.append({
                    "id": "text-1",
                    "role": "user",
                    "content": line,
                    "timestamp": None
                })
    if not messages:
        return []
    msg_count = len(messages)
    preview = messages[0]["content"][:10].replace("\n", " ") if messages else ""
    title = f"<00>e,g<>[eQ ({msg_count}ag) - {preview}..."
    return [ChatThread(id="pasted-text", title=title, messages=messages)]
# =============================================================================
# HTML Import
# =============================================================================
class ChatGPTHTMLParser(HTMLParser):
    """Extract conversation turns from a saved ChatGPT HTML page.

    Turns are delimited by elements carrying
    ``data-testid="conversation-turn-N"``; the author role comes from the
    ``data-message-author-role`` attribute.  Element-depth tracking decides
    when a turn container closes.
    """

    # Tags without end tags: they must not affect the depth counter.
    VOID_ELEMENTS = {
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr'
    }
    # Containers whose text must never be captured (UI chrome, scripts).
    # meta/link are deliberately absent: as void elements they never reach
    # handle_endtag, so listing them would leave ignore_level permanently
    # raised and swallow all subsequent content.
    IGNORE_TAGS = {'button', 'svg', 'style', 'script', 'head', 'title'}

    def __init__(self):
        super().__init__()
        self.messages = []          # finished {"role", "content"} dicts
        self.current_msg = None     # turn being accumulated, or None
        self.recording = False      # True while inside a conversation turn
        self.depth = 0              # current element nesting depth
        self.turn_start_depth = 0   # depth at which the turn container opened
        self.ignore_level = 0       # nesting count of IGNORE_TAGS

    def handle_starttag(self, tag, attrs):
        if tag in self.IGNORE_TAGS:
            self.ignore_level += 1
        # Void elements have no end tag, so they do not deepen nesting.
        if tag not in self.VOID_ELEMENTS:
            self.depth += 1
        attrs_dict = dict(attrs)
        # HTMLParser reports valueless attributes as None; coerce to ""
        # so startswith() cannot raise (this crashed on e.g. <div data-testid>).
        testid = attrs_dict.get('data-testid') or ''
        if testid.startswith('conversation-turn-'):
            # A new turn begins: flush any turn still open (defensive, in
            # case depth accounting missed the previous turn's end tag).
            if self.current_msg:
                self._save_current()
            # Default to "user"; corrected below when a role marker appears.
            self.current_msg = {"role": "user", "content": []}
            self.recording = True
            self.turn_start_depth = self.depth
            return
        # While recording, pick up the authoritative role marker.
        if self.recording and 'data-message-author-role' in attrs_dict:
            self.current_msg["role"] = attrs_dict['data-message-author-role']

    def handle_endtag(self, tag):
        if tag in self.IGNORE_TAGS:
            if self.ignore_level > 0:
                self.ignore_level -= 1
        # Emit a line break when a block-level element closes.
        if self.recording and self.ignore_level == 0:
            if tag in ('p', 'div', 'br', 'li', 'tr', 'pre', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                self.current_msg["content"].append("\n")
        if tag not in self.VOID_ELEMENTS:
            # Closing the turn container itself ends the message.
            if self.recording and self.depth == self.turn_start_depth:
                self._save_current()
            self.depth -= 1

    def handle_data(self, data):
        if self.recording and self.ignore_level == 0:
            # Keep raw text (no strip) to avoid gluing inline elements.
            self.current_msg["content"].append(data)

    def close(self):
        super().close()
        # Flush the last message if the document ended inside a turn.
        if self.current_msg:
            self._save_current()

    def _save_current(self):
        if self.current_msg:
            full_text = "".join(self.current_msg["content"]).strip()
            # Collapse runs of 3+ newlines into paragraph breaks.
            full_text = re.sub(r'\n{3,}', '\n\n', full_text)
            if full_text:
                self.messages.append({
                    "role": self.current_msg["role"],
                    "content": full_text
                })
        self.current_msg = None
        self.recording = False
def parse_html_transcript(html_content: str) -> list[ChatThread]:
    """Parse a saved ChatGPT HTML page into a single thread.

    Returns [] when no conversation turns are found.
    """
    parser = ChatGPTHTMLParser()
    parser.feed(html_content)
    # close() was previously never called, silently dropping a trailing
    # turn whose container end tag was missing/truncated.
    parser.close()
    if not parser.messages:
        return []
    # Prefer the page <title>; match case-insensitively and across lines.
    title = "HTML <00>[eQ<65>"
    title_match = re.search(r"<title>(.*?)</title>", html_content, re.IGNORECASE | re.DOTALL)
    if title_match:
        title = title_match.group(1).strip()
    # Convert to the normalized message shape.
    msgs = []
    for idx, msg in enumerate(parser.messages, 1):
        msgs.append({
            "id": f"html-{idx}",
            "role": msg["role"],
            "content": msg["content"],
            "timestamp": None
        })
    return [ChatThread(id="html-import", title=title, messages=msgs)]
# Public API of this module.
__all__ = [
    "ChatThread",
    "ChatImporterError",
    "detect_threads",
    "parse_text_transcript",
    "parse_html_transcript",
]