open-webui/backend/open_webui/utils/chat_importer.py

from __future__ import annotations
import json
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Iterable, Mapping
from html.parser import HTMLParser
@dataclass(slots=True)
class ChatThread:
id: str
title: str
messages: list[dict[str, Any]]
class ChatImporterError(Exception):
pass
def detect_threads(data: Any) -> list[ChatThread]:
"""
尝试使用多种解析器解析导入的 JSON 数据。
支持:
1. Google AI Studio (Single / Backup)
2. OpenAI Conversation (ChatGPT export)
3. Chatbox
4. Cherry Studio
5. DeepSeek
"""
for parser in (
_try_google_single,
_try_google_backup,
_try_openai_conversation,
_try_chatbox_export,
_try_cherry_studio,
_try_deepseek_export,
):
try:
threads = parser(data)
if threads:
return threads
except Exception:
            # Ignore parse errors and fall through to the next parser.
continue
return []
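
# Illustrative call site (a sketch only; the upload handling and variable names here
# are assumptions, not part of this module):
#
#     raw = uploaded_file.read().decode("utf-8")
#     try:
#         threads = detect_threads(json.loads(raw))
#     except json.JSONDecodeError:
#         threads = parse_text_transcript(raw)
#     for thread in threads:
#         print(thread.title, len(thread.messages))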
# =============================================================================
# 1. Google AI Studio
# =============================================================================
def _try_google_single(data: Any) -> list[ChatThread]:
if not isinstance(data, Mapping) or "chunkedPrompt" not in data:
return []
chunks = data.get("chunkedPrompt", {}).get("chunks", []) or []
if not chunks:
return []
title = "Google AI Studio 对话"
if chunks and chunks[0].get("branchChildren"):
        # Try to extract a title from the first branch child's display name.
display_name = chunks[0]["branchChildren"][0].get("displayName", "")
match = re.search(r"Branch of (.*)", display_name or "")
if match:
title = match.group(1).strip()
messages: list[dict[str, Any]] = []
for idx, chunk in enumerate(chunks, start=1):
if chunk.get("isThought"):
continue
text = chunk.get("text", "")
if not text and "parts" in chunk:
text = "".join(p.get("text", "") for p in chunk.get("parts", []) if "text" in p)
if not text.strip():
continue
messages.append(
{
"id": f"google-single-{idx}",
"role": chunk.get("role", "assistant"),
"content": text.strip(),
"timestamp": chunk.get("timestamp"),
"metadata": {},
}
)
if not messages:
return []
return [ChatThread(id="google-single", title=title, messages=messages)]
def _try_google_backup(data: Any) -> list[ChatThread]:
if not isinstance(data, Mapping) or "conversations" not in data:
return []
conversations = data.get("conversations", []) or []
threads: list[ChatThread] = []
for idx, conversation in enumerate(conversations, start=1):
title = conversation.get("name") or f"备份对话_{idx}"
messages_data = conversation.get("messages", []) or []
messages: list[dict[str, Any]] = []
for midx, msg in enumerate(messages_data, start=1):
role = msg.get("author", {}).get("role", "user")
content = msg.get("content", "")
if not content:
continue
messages.append(
{
"id": f"google-backup-{idx}-{midx}",
"role": role,
"content": content.strip(),
"timestamp": msg.get("create_time"),
"metadata": {},
}
)
if messages:
conv_id = conversation.get("id") or f"google-backup-{idx}"
threads.append(ChatThread(id=str(conv_id), title=title, messages=messages))
return threads
# =============================================================================
# 2. OpenAI Conversation
# =============================================================================
def _try_openai_conversation(data: Any) -> list[ChatThread]:
    # Cheap shape check: a list whose first item carries a "mapping" key.
if not isinstance(data, list) or not data:
return []
first = data[0]
if not isinstance(first, Mapping) or "mapping" not in first:
return []
    # Distinguish DeepSeek from OpenAI: DeepSeek mapping nodes store message "fragments",
    # while OpenAI mapping nodes store message "content" with "parts". Checking for
    # "fragments" lets us reject DeepSeek files here rather than relying on parser order.
threads: list[ChatThread] = []
for idx, conversation in enumerate(data, start=1):
mapping = conversation.get("mapping") or {}
if not mapping:
continue
        # Sample one node: if its message carries "fragments", this is a DeepSeek
        # export, so hand the whole file over to the DeepSeek parser instead.
        sample_node = next(iter(mapping.values()), {}) or {}
        if (sample_node.get("message") or {}).get("fragments"):
            return []  # DeepSeek file, not an OpenAI export
title = conversation.get("title") or f"对话_{idx}"
conv_id = conversation.get("id") or conversation.get("conversation_id") or f"conv_{idx}"
        # Simplified: follow current_node when present, otherwise take the longest branch (main branch).
branch = _reconstruct_longest_branch(mapping, conversation.get("current_node"))
if not branch:
continue
messages: list[dict[str, Any]] = []
for node_id in branch:
node = mapping.get(node_id) or {}
message = node.get("message")
if not message:
continue
metadata = message.get("metadata") or {}
if metadata.get("is_visually_hidden_from_conversation"):
continue
text = _extract_openai_text(message.get("content"))
if not text.strip():
continue
messages.append(
{
"id": message.get("id", node_id),
"role": message.get("author", {}).get("role", "user"),
"content": text.strip(),
"timestamp": _normalize_timestamp(message.get("create_time")),
"metadata": metadata,
}
)
if messages:
threads.append(ChatThread(id=str(conv_id), title=title, messages=messages))
return threads
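
# Abridged shape of a ChatGPT conversations.json entry, reduced to the keys read above
# (real exports carry many more fields):
#
#     {
#       "title": "...",
#       "id": "...",
#       "current_node": "node-2",
#       "mapping": {
#         "node-1": {"parent": "root", "message": {"author": {"role": "user"},
#                     "content": {"content_type": "text", "parts": ["Hi"]},
#                     "create_time": 1700000000.0, "metadata": {}}},
#         "node-2": {"parent": "node-1", "message": {"...": "..."}}
#       }
#     }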
def _extract_openai_text(content: Mapping[str, Any] | str | None) -> str:
    if not content:
        return ""
    # A plain string can appear for some message types; handle it before any dict access.
    if isinstance(content, str):
        return content
    ctype = content.get("content_type")
    if ctype == "text":
parts = content.get("parts") or []
return "\n".join(str(part) for part in parts if part)
if ctype == "multimodal_text":
texts: list[str] = []
for part in content.get("parts") or []:
if isinstance(part, str):
texts.append(part)
elif isinstance(part, Mapping) and part.get("type") == "text":
texts.append(part.get("text", ""))
return "\n".join(texts)
if ctype == "user_editable_context":
return content.get("user_instructions", "")
return json.dumps(content, ensure_ascii=False)
# =============================================================================
# 3. Chatbox
# =============================================================================
def _try_chatbox_export(data: Any) -> list[ChatThread]:
if not isinstance(data, Mapping):
return []
    # Signature key: "chat-sessions-list".
if "chat-sessions-list" not in data:
return []
sessions_meta = {
item.get("id"): item
for item in data.get("chat-sessions-list", []) or []
if isinstance(item, Mapping)
}
threads: list[ChatThread] = []
for key, value in data.items():
if not isinstance(key, str) or not key.startswith("session:"):
continue
session_id = key.split("session:", 1)[-1]
session_payload = value if isinstance(value, Mapping) else {}
messages_data = session_payload.get("messages", []) or []
if not messages_data:
continue
title = session_payload.get("name") or sessions_meta.get(session_id, {}).get("name")
if not title:
idx = len(threads) + 1
title = f"Chatbox 对话_{idx}"
messages: list[dict[str, Any]] = []
for midx, message in enumerate(messages_data, start=1):
role = message.get("role") or "user"
content_parts = message.get("contentParts") or []
text = _extract_chatbox_content(content_parts)
if not text.strip():
continue
messages.append(
{
"id": message.get("id") or f"{session_id}-{midx}",
"role": role,
"content": text.strip(),
"timestamp": _normalize_timestamp(message.get("timestamp")),
"metadata": {
"session": title,
"session_id": session_id,
"source": "chatbox",
},
}
)
if messages:
threads.append(ChatThread(id=session_id or f"chatbox-{len(threads)+1}", title=title, messages=messages))
return threads
def _extract_chatbox_content(parts: Iterable[Mapping[str, Any]]) -> str:
fragments: list[str] = []
for part in parts or []:
if not isinstance(part, Mapping):
fragments.append(str(part))
continue
ptype = part.get("type")
if ptype == "text":
fragments.append(str(part.get("text", "")))
elif ptype in {"image_url", "image"} and part.get("url"):
fragments.append(f"[image: {part['url']}]")
elif "text" in part:
fragments.append(str(part["text"]))
else:
fragments.append(json.dumps(part, ensure_ascii=False))
return "\n\n".join(fragment for fragment in fragments if fragment)
# =============================================================================
# 4. Cherry Studio
# =============================================================================
def _try_cherry_studio(data: Any) -> list[ChatThread]:
"""解析 Cherry Studio 导出 (localStorage JSON dump)"""
if not isinstance(data, Mapping):
return []
if "localStorage" not in data:
return []
cherry_str = data["localStorage"].get("persist:cherry-studio")
if not cherry_str:
return []
try:
cherry = json.loads(cherry_str)
except json.JSONDecodeError:
return []
if not isinstance(cherry, dict):
return []
    assistants = cherry.get("assistants", [])
    # redux-persist stores each slice of state as a JSON string inside the persisted
    # blob, so "assistants" may arrive double-encoded. Decode it once if needed.
    if isinstance(assistants, str):
        try:
            assistants = json.loads(assistants)
        except json.JSONDecodeError:
            return []
    if not isinstance(assistants, list):
        return []
threads: list[ChatThread] = []
for assistant in assistants:
topics = assistant.get("topics", [])
if not topics:
continue
model_info = assistant.get("model", {})
model_name = model_info.get("name") if isinstance(model_info, dict) else "Unknown"
for topic in topics:
topic_name = topic.get("name", "未命名话题")
topic_msgs = topic.get("messages", [])
if not topic_msgs:
continue
messages: list[dict[str, Any]] = []
for idx, msg in enumerate(topic_msgs, start=1):
content = msg.get("content", "")
if not content:
continue
role = msg.get("role", "user")
messages.append({
"id": msg.get("id") or f"cherry-{idx}",
"role": role,
"content": content.strip(),
"timestamp": _normalize_timestamp(msg.get("createdAt")),
"metadata": {
"model": model_name
}
})
if messages:
tid = topic.get("id") or f"cherry-{len(threads)}"
                # Prefix the topic name with the source app so imported chats are easy to spot.
full_title = f"[Cherry] {topic_name}"
threads.append(ChatThread(id=str(tid), title=full_title, messages=messages))
return threads
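
# Abridged Cherry Studio export shape (keys read above). The payload sits behind two
# layers of JSON encoding via redux-persist, hence the repeated json.loads in the parser:
#
#     data["localStorage"]["persist:cherry-studio"]            -> JSON string
#       -> {"assistants": "<JSON string or list>"}
#         -> [{"model": {"name": "..."},
#              "topics": [{"id": "...", "name": "...", "messages": [
#                  {"id": "...", "role": "user", "content": "...", "createdAt": "..."}]}]}]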
# =============================================================================
# 5. DeepSeek
# =============================================================================
def _try_deepseek_export(data: Any) -> list[ChatThread]:
"""解析 DeepSeek 导出文件"""
if not isinstance(data, list) or not data:
return []
    # Shape check: items carry a "mapping" whose message nodes contain "fragments".
first = data[0]
if not isinstance(first, Mapping) or "mapping" not in first:
return []
    # Heuristic: a DeepSeek export has mapping nodes whose message carries "fragments".
    is_deepseek = False
    mapping_sample = first.get("mapping", {})
    for node in mapping_sample.values():
        if node and (node.get("message") or {}).get("fragments"):
            is_deepseek = True
            break
if not is_deepseek:
return []
threads: list[ChatThread] = []
for idx, conversation in enumerate(data, start=1):
title = conversation.get("title", f"DeepSeek对话_{idx}")
mapping = conversation.get("mapping", {})
if not mapping:
continue
        # Find the leaf nodes (nodes that are not anyone's parent).
all_nodes = set(mapping.keys())
parent_nodes = {node.get("parent") for node in mapping.values() if node}
leaf_nodes = all_nodes - parent_nodes - {"root"}
if not leaf_nodes:
continue
        # Extract the longest branch (main branch). Iterating over all leaf_nodes would
        # yield every branch; to keep the UI simple we keep only the longest path.
longest_branch_ids: list[str] = []
# Helper to reconstruct path
def reconstruct_path(leaf_id: str) -> list[str]:
path = []
cur = leaf_id
while cur and cur != "root":
if cur not in mapping:
break
path.insert(0, cur)
cur = mapping[cur].get("parent", "root")
return path
branches = [reconstruct_path(leaf) for leaf in leaf_nodes]
if not branches:
continue
longest_branch_ids = max(branches, key=len)
messages: list[dict[str, Any]] = []
for node_id in longest_branch_ids:
node = mapping.get(node_id) or {}
msg_obj = node.get("message", {})
if not msg_obj:
continue
# DeepSeek specific content extraction
fragments = msg_obj.get("fragments", [])
content_buf = []
role = "user" # default
# Check type from fragments or fallback to author
# Fragments typically define content parts
for frag in fragments:
text = frag.get("content", "")
frag_type = frag.get("type")
if frag_type == "REQUEST":
role = "user"
elif frag_type == "RESPONSE":
role = "assistant"
if text:
content_buf.append(text)
full_text = "".join(content_buf)
if not full_text.strip():
continue
# If role wasn't determined by fragments (rare), try author
if not fragments:
role = msg_obj.get("author", {}).get("role", "user")
full_text = msg_obj.get("content", "")
messages.append({
"id": node_id,
"role": role,
"content": full_text.strip(),
"timestamp": _normalize_timestamp(conversation.get("inserted_at")), # DeepSeek conv often has one time?
"metadata": {}
})
if messages:
conv_id = conversation.get("id") or f"deepseek-{idx}"
threads.append(ChatThread(id=str(conv_id), title=f"[DeepSeek] {title}", messages=messages))
return threads
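
# Abridged DeepSeek export shape (keys read above). Each mapping node's message holds
# "fragments" whose type marks the speaker:
#
#     {"id": "...", "title": "...", "inserted_at": "2024-01-01T00:00:00Z",
#      "mapping": {
#        "node-1": {"parent": "root", "message": {"fragments": [
#            {"type": "REQUEST", "content": "Hello"}]}},
#        "node-2": {"parent": "node-1", "message": {"fragments": [
#            {"type": "RESPONSE", "content": "Hi!"}]}}
#      }}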
# =============================================================================
# Common Helpers
# =============================================================================
def _reconstruct_longest_branch(mapping: Mapping[str, Any], leaf_id: str | None) -> list[str]:
"""Generic branch reconstruction, preferring provided leaf_id or longest path"""
if not leaf_id:
# choose longest path from all leaves
nodes = set(mapping.keys())
parents = {node.get("parent") for node in mapping.values() if node}
leaf_nodes = nodes - parents - {"root"}
longest: list[str] = []
def get_path(leaf: str) -> list[str]:
path = []
cur = leaf
visited = set()
while cur and cur not in visited and cur != "root":
node = mapping.get(cur)
if not node:
break
path.append(cur)
visited.add(cur)
cur = node.get("parent")
path.reverse()
return path
for leaf in leaf_nodes:
branch = get_path(leaf)
if len(branch) > len(longest):
longest = branch
return longest
# Reconstruct from specific leaf
path: list[str] = []
visited: set[str] = set()
cur = leaf_id
while cur and cur not in visited and cur != "root":
node = mapping.get(cur)
if not node:
break
path.append(cur)
visited.add(cur)
cur = node.get("parent")
path.reverse()
return path
def _normalize_timestamp(value: Any) -> str:
if value is None:
return ""
if isinstance(value, (int, float)):
try:
if value > 1e12: # milliseconds
dt = datetime.fromtimestamp(value / 1000)
else:
dt = datetime.fromtimestamp(value)
return dt.isoformat(sep=" ", timespec="minutes")
except Exception:
return str(value)
if isinstance(value, str):
try:
dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
return dt.isoformat(sep=" ", timespec="minutes")
except Exception:
return str(value)
return str(value)
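
# Behaviour of _normalize_timestamp (numeric results depend on the local timezone):
#     _normalize_timestamp(1700000000)              -> "2023-11-14 ..." (seconds)
#     _normalize_timestamp(1700000000000)           -> same instant, treated as milliseconds
#     _normalize_timestamp("2023-11-14T22:13:20Z")  -> "2023-11-14 22:13+00:00"
#     _normalize_timestamp(None)                    -> ""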
# =============================================================================
# Text Import
# =============================================================================
def parse_text_transcript(text: str) -> list[ChatThread]:
"""
极简文本解析。
仅基于用户指定的强分隔符进行切分。
"""
if not text.strip():
return []
lines = text.strip().split('\n')
messages: list[dict[str, Any]] = []
    # User-specified strong markers (colon may be ASCII ":" or fullwidth "："):
    # 1. [你说]
    # 2. 你说:
    # 3. ChatGPT说:
    # 4. ChatGPT 说:
    markers = [
        (r"\[你说\]", "user"),
        (r"你说[:：]", "user"),
        (r"ChatGPT\s*说[:：]?", "assistant"),  # matches "ChatGPT说:" or "ChatGPT 说:"
        (r"AI\s*说[:：]?", "assistant"),  # defensive: also accept "AI 说:"
    ]
    # Keep the basic "User:" / "AI:" colon formats, but only when anchored at line start.
    markers.extend([
        (r"^User[:：]", "user"),
        (r"^AI[:：]", "assistant"),
    ])
for line in lines:
line_str = line.strip()
if not line_str:
continue
        # Match a speaker marker.
matched_role = None
for pattern, role in markers:
if re.search(pattern, line_str, re.IGNORECASE):
matched_role = role
break
        if matched_role:
            # Header line: pull any content that follows the colon on the same line.
            content = ""
            parts = re.split(r"[:：]", line_str, maxsplit=1)
            if len(parts) > 1:
                content = parts[1].strip()
messages.append({
"id": f"text-{len(messages)+1}",
"role": matched_role,
"content": content,
"timestamp": None
})
        else:
            # Continuation line: append to the previous message.
            if messages:
                if messages[-1]["content"]:
                    messages[-1]["content"] += "\n" + line
                else:
                    messages[-1]["content"] = line
            else:
                # The very first line has no marker; treat it as user content.
messages.append({
"id": "text-1",
"role": "user",
"content": line,
"timestamp": None
})
if not messages:
return []
msg_count = len(messages)
preview = messages[0]["content"][:10].replace("\n", " ") if messages else ""
title = f"文本导入 ({msg_count}条) - {preview}..."
return [ChatThread(id="pasted-text", title=title, messages=messages)]
# =============================================================================
# HTML Import
# =============================================================================
class ChatGPTHTMLParser(HTMLParser):
VOID_ELEMENTS = {
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
'link', 'meta', 'param', 'source', 'track', 'wbr'
}
    # "meta" and "link" are deliberately excluded: as void elements they never trigger
    # handle_endtag, so ignore_level would never reset and everything that follows
    # would be wrongly ignored.
IGNORE_TAGS = {'button', 'svg', 'style', 'script', 'head', 'title'}
def __init__(self):
super().__init__()
self.messages = []
self.current_msg = None
self.recording = False
self.depth = 0
self.turn_start_depth = 0
self.ignore_level = 0
def handle_starttag(self, tag, attrs):
if tag in self.IGNORE_TAGS:
self.ignore_level += 1
# Void elements do not increase depth because they don't have end tags
if tag not in self.VOID_ELEMENTS:
self.depth += 1
attrs_dict = dict(attrs)
        # Detect a turn container (data-testid="conversation-turn-X").
if 'data-testid' in attrs_dict and attrs_dict['data-testid'].startswith('conversation-turn-'):
            # A new turn forces a save of the previous one (guards against a missed
            # save if depth tracking went wrong).
if self.current_msg:
self._save_current()
            # Default to "user"; updated below if a role attribute is found.
self.current_msg = {"role": "user", "content": []}
self.recording = True
self.turn_start_depth = self.depth
return
        # While recording, look for the author-role attribute to correct the role.
if self.recording and 'data-message-author-role' in attrs_dict:
self.current_msg["role"] = attrs_dict['data-message-author-role']
def handle_endtag(self, tag):
if tag in self.IGNORE_TAGS:
if self.ignore_level > 0:
self.ignore_level -= 1
        # Emit newlines when block-level elements close.
if self.recording and self.ignore_level == 0:
if tag in ('p', 'div', 'br', 'li', 'tr', 'pre', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
self.current_msg["content"].append("\n")
if tag not in self.VOID_ELEMENTS:
            # Closing tag of the turn container: save the message.
if self.recording and self.depth == self.turn_start_depth:
self._save_current()
self.depth -= 1
def handle_data(self, data):
if self.recording and self.ignore_level == 0:
            # Keep data verbatim (no strip) so inline elements don't run together.
self.current_msg["content"].append(data)
def close(self):
super().close()
        # Make sure the trailing message is saved.
if self.current_msg:
self._save_current()
def _save_current(self):
if self.current_msg:
full_text = "".join(self.current_msg["content"]).strip()
            # Basic whitespace cleanup: collapse runs of blank lines.
full_text = re.sub(r'\n{3,}', '\n\n', full_text)
if full_text:
self.messages.append({
"role": self.current_msg["role"],
"content": full_text
})
self.current_msg = None
self.recording = False
def parse_html_transcript(html_content: str) -> list[ChatThread]:
"""解析 ChatGPT HTML 导出文件"""
parser = ChatGPTHTMLParser()
parser.feed(html_content)
if not parser.messages:
return []
    # Build the ChatThread.
title = "HTML 导入对话"
title_match = re.search(r"<title>(.*?)</title>", html_content)
if title_match:
title = title_match.group(1)
    # Convert to the standard message format.
msgs = []
for idx, msg in enumerate(parser.messages, 1):
msgs.append({
"id": f"html-{idx}",
"role": msg["role"],
"content": msg["content"],
"timestamp": None
})
return [ChatThread(id="html-import", title=title, messages=msgs)]
__all__ = [
"ChatThread",
"ChatImporterError",
"detect_threads",
"parse_text_transcript",
"parse_html_transcript",
]
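
if __name__ == "__main__":
    # Minimal smoke test (illustrative only, not part of the import pipeline): run a
    # tiny hand-written transcript through the plain-text parser and print the result.
    _sample = "你说:你好\nChatGPT 说:\n很高兴见到你!"
    for _thread in parse_text_transcript(_sample):
        print(_thread.title)
        for _msg in _thread.messages:
            print(f"  [{_msg['role']}] {_msg['content']!r}")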