from __future__ import annotations

import json
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Iterable, Mapping
from html.parser import HTMLParser


@dataclass(slots=True)
class ChatThread:
    id: str
    title: str
    messages: list[dict[str, Any]]


class ChatImporterError(Exception):
    pass


def detect_threads(data: Any) -> list[ChatThread]:
    """
    Try to parse imported JSON data with a series of format-specific parsers.

    Supported formats:
    1. Google AI Studio (single conversation / backup)
    2. OpenAI conversation (ChatGPT export)
    3. Chatbox
    4. Cherry Studio
    5. DeepSeek
    """
    for parser in (
        _try_google_single,
        _try_google_backup,
        _try_openai_conversation,
        _try_chatbox_export,
        _try_cherry_studio,
        _try_deepseek_export,
    ):
        try:
            threads = parser(data)
            if threads:
                return threads
        except Exception:
            # Ignore parse errors and try the next parser
            continue
    return []


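# Example usage of detect_threads (an illustrative sketch only; the file name
# "export.json" is hypothetical):
#
#     with open("export.json", encoding="utf-8") as fh:
#         data = json.load(fh)
#     for thread in detect_threads(data):
#         print(thread.id, thread.title, len(thread.messages))
#
# An empty list means none of the parsers recognised the structure.

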
# =============================================================================
# 1. Google AI Studio
# =============================================================================

def _try_google_single(data: Any) -> list[ChatThread]:
    if not isinstance(data, Mapping) or "chunkedPrompt" not in data:
        return []
    chunks = data.get("chunkedPrompt", {}).get("chunks", []) or []
    if not chunks:
        return []
    title = "Google AI Studio Conversation"
    if chunks and chunks[0].get("branchChildren"):
        # Try to derive a title from the first branch child
        display_name = chunks[0]["branchChildren"][0].get("displayName", "")
        match = re.search(r"Branch of (.*)", display_name or "")
        if match:
            title = match.group(1).strip()

    messages: list[dict[str, Any]] = []
    for idx, chunk in enumerate(chunks, start=1):
        if chunk.get("isThought"):
            continue
        text = chunk.get("text", "")
        if not text and "parts" in chunk:
            text = "".join(p.get("text", "") for p in chunk.get("parts", []) if "text" in p)
        if not text.strip():
            continue
        messages.append(
            {
                "id": f"google-single-{idx}",
                "role": chunk.get("role", "assistant"),
                "content": text.strip(),
                "timestamp": chunk.get("timestamp"),
                "metadata": {},
            }
        )
    if not messages:
        return []
    return [ChatThread(id="google-single", title=title, messages=messages)]


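# Minimal illustrative input for _try_google_single, inferred from the keys the
# parser reads above (role values are passed through unchanged; real AI Studio
# exports carry many more fields):
#
#     {
#         "chunkedPrompt": {
#             "chunks": [
#                 {"role": "user", "text": "Hello"},
#                 {"role": "model", "isThought": True, "text": "(skipped)"},
#                 {"role": "model", "parts": [{"text": "Hi there"}]},
#             ]
#         }
#     }

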
def _try_google_backup(data: Any) -> list[ChatThread]:
    if not isinstance(data, Mapping) or "conversations" not in data:
        return []
    conversations = data.get("conversations", []) or []
    threads: list[ChatThread] = []
    for idx, conversation in enumerate(conversations, start=1):
        title = conversation.get("name") or f"Backup Conversation_{idx}"
        messages_data = conversation.get("messages", []) or []
        messages: list[dict[str, Any]] = []
        for midx, msg in enumerate(messages_data, start=1):
            role = msg.get("author", {}).get("role", "user")
            content = msg.get("content", "")
            if not content:
                continue
            messages.append(
                {
                    "id": f"google-backup-{idx}-{midx}",
                    "role": role,
                    "content": content.strip(),
                    "timestamp": msg.get("create_time"),
                    "metadata": {},
                }
            )
        if messages:
            conv_id = conversation.get("id") or f"google-backup-{idx}"
            threads.append(ChatThread(id=str(conv_id), title=title, messages=messages))
    return threads


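# Minimal illustrative input for _try_google_backup (only the keys read above):
#
#     {
#         "conversations": [
#             {
#                 "id": "c1",
#                 "name": "Trip planning",
#                 "messages": [
#                     {"author": {"role": "user"}, "content": "Hi", "create_time": 1700000000},
#                     {"author": {"role": "assistant"}, "content": "Hello!"},
#                 ],
#             }
#         ]
#     }

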
# =============================================================================
# 2. OpenAI Conversation
# =============================================================================

def _try_openai_conversation(data: Any) -> list[ChatThread]:
    # Simple feature detection: a list whose first item carries a "mapping"
    if not isinstance(data, list) or not data:
        return []
    first = data[0]
    if not isinstance(first, Mapping) or "mapping" not in first:
        return []
    # Distinguish from DeepSeek exports: DeepSeek mapping nodes store message
    # "fragments", while OpenAI mapping nodes store message content "parts".
    # A sampled node is checked below so DeepSeek files are left to the
    # DeepSeek parser.

    threads: list[ChatThread] = []
    for idx, conversation in enumerate(data, start=1):
        mapping = conversation.get("mapping") or {}
        if not mapping:
            continue

        # If a sampled node carries fragments (a DeepSeek trait), bail out and
        # hand the file to the DeepSeek parser.
        sample_node = next(iter(mapping.values()), {})
        if (sample_node.get("message") or {}).get("fragments"):
            return []  # This is a DeepSeek file, not an OpenAI one

        title = conversation.get("title") or f"Conversation_{idx}"
        conv_id = conversation.get("id") or conversation.get("conversation_id") or f"conv_{idx}"

        # Simplified logic: keep only the longest branch (the main branch)
        branch = _reconstruct_longest_branch(mapping, conversation.get("current_node"))
        if not branch:
            continue

        messages: list[dict[str, Any]] = []
        for node_id in branch:
            node = mapping.get(node_id) or {}
            message = node.get("message")
            if not message:
                continue
            metadata = message.get("metadata") or {}
            if metadata.get("is_visually_hidden_from_conversation"):
                continue
            text = _extract_openai_text(message.get("content"))
            if not text.strip():
                continue
            messages.append(
                {
                    "id": message.get("id", node_id),
                    "role": message.get("author", {}).get("role", "user"),
                    "content": text.strip(),
                    "timestamp": _normalize_timestamp(message.get("create_time")),
                    "metadata": metadata,
                }
            )
        if messages:
            threads.append(ChatThread(id=str(conv_id), title=title, messages=messages))
    return threads


def _extract_openai_text(content: Mapping[str, Any] | str | None) -> str:
    if not content:
        return ""
    # Some nodes carry a plain string instead of a content object
    if isinstance(content, str):
        return content
    ctype = content.get("content_type")
    if ctype == "text":
        parts = content.get("parts") or []
        return "\n".join(str(part) for part in parts if part)
    if ctype == "multimodal_text":
        texts: list[str] = []
        for part in content.get("parts") or []:
            if isinstance(part, str):
                texts.append(part)
            elif isinstance(part, Mapping) and part.get("type") == "text":
                texts.append(part.get("text", ""))
        return "\n".join(texts)
    if ctype == "user_editable_context":
        return content.get("user_instructions", "")
    return json.dumps(content, ensure_ascii=False)


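# Minimal illustrative input for _try_openai_conversation / _extract_openai_text
# (only the keys read above; real ChatGPT exports include many more fields):
#
#     [
#         {
#             "id": "conv-1",
#             "title": "Greetings",
#             "current_node": "n2",
#             "mapping": {
#                 "root": {"message": None, "parent": None},
#                 "n1": {
#                     "parent": "root",
#                     "message": {
#                         "id": "m1",
#                         "author": {"role": "user"},
#                         "create_time": 1700000000.0,
#                         "content": {"content_type": "text", "parts": ["Hello"]},
#                         "metadata": {},
#                     },
#                 },
#                 "n2": {
#                     "parent": "n1",
#                     "message": {
#                         "id": "m2",
#                         "author": {"role": "assistant"},
#                         "content": {"content_type": "text", "parts": ["Hi!"]},
#                         "metadata": {},
#                     },
#                 },
#             },
#         }
#     ]

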
# =============================================================================
# 3. Chatbox
# =============================================================================

def _try_chatbox_export(data: Any) -> list[ChatThread]:
    if not isinstance(data, Mapping):
        return []
    # Signature key: "chat-sessions-list"
    if "chat-sessions-list" not in data:
        return []

    sessions_meta = {
        item.get("id"): item
        for item in data.get("chat-sessions-list", []) or []
        if isinstance(item, Mapping)
    }
    threads: list[ChatThread] = []
    for key, value in data.items():
        if not isinstance(key, str) or not key.startswith("session:"):
            continue
        session_id = key.split("session:", 1)[-1]
        session_payload = value if isinstance(value, Mapping) else {}
        messages_data = session_payload.get("messages", []) or []
        if not messages_data:
            continue
        title = session_payload.get("name") or sessions_meta.get(session_id, {}).get("name")
        if not title:
            idx = len(threads) + 1
            title = f"Chatbox Conversation_{idx}"
        messages: list[dict[str, Any]] = []
        for midx, message in enumerate(messages_data, start=1):
            role = message.get("role") or "user"
            content_parts = message.get("contentParts") or []
            text = _extract_chatbox_content(content_parts)
            if not text.strip():
                continue
            messages.append(
                {
                    "id": message.get("id") or f"{session_id}-{midx}",
                    "role": role,
                    "content": text.strip(),
                    "timestamp": _normalize_timestamp(message.get("timestamp")),
                    "metadata": {
                        "session": title,
                        "session_id": session_id,
                        "source": "chatbox",
                    },
                }
            )
        if messages:
            threads.append(ChatThread(id=session_id or f"chatbox-{len(threads)+1}", title=title, messages=messages))
    return threads


def _extract_chatbox_content(parts: Iterable[Mapping[str, Any]]) -> str:
    fragments: list[str] = []
    for part in parts or []:
        if not isinstance(part, Mapping):
            fragments.append(str(part))
            continue
        ptype = part.get("type")
        if ptype == "text":
            fragments.append(str(part.get("text", "")))
        elif ptype in {"image_url", "image"} and part.get("url"):
            fragments.append(f"[image: {part['url']}]")
        elif "text" in part:
            fragments.append(str(part["text"]))
        else:
            fragments.append(json.dumps(part, ensure_ascii=False))
    return "\n\n".join(fragment for fragment in fragments if fragment)


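# Minimal illustrative input for _try_chatbox_export (keys taken from the code
# above; a real Chatbox export stores many more settings keys):
#
#     {
#         "chat-sessions-list": [{"id": "abc", "name": "My chat"}],
#         "session:abc": {
#             "messages": [
#                 {"id": "m1", "role": "user", "contentParts": [{"type": "text", "text": "Hi"}]},
#                 {"id": "m2", "role": "assistant", "contentParts": [{"type": "text", "text": "Hello"}]},
#             ]
#         },
#     }

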
# =============================================================================
# 4. Cherry Studio
# =============================================================================

def _try_cherry_studio(data: Any) -> list[ChatThread]:
    """Parse a Cherry Studio export (a localStorage JSON dump)."""
    if not isinstance(data, Mapping):
        return []
    if "localStorage" not in data:
        return []
    cherry_str = data["localStorage"].get("persist:cherry-studio")
    if not cherry_str:
        return []

    try:
        cherry = json.loads(cherry_str)
    except json.JSONDecodeError:
        return []

    if not isinstance(cherry, dict):
        return []

    assistants = cherry.get("assistants", [])
    # redux-persist may store the slice as a nested JSON string, so decode it a
    # second time when needed.
    if isinstance(assistants, str):
        try:
            assistants = json.loads(assistants)
        except json.JSONDecodeError:
            return []

    if not isinstance(assistants, list):
        return []

    threads: list[ChatThread] = []
    for assistant in assistants:
        topics = assistant.get("topics", [])
        if not topics:
            continue

        model_info = assistant.get("model", {})
        model_name = model_info.get("name") if isinstance(model_info, dict) else "Unknown"

        for topic in topics:
            topic_name = topic.get("name", "Untitled Topic")
            topic_msgs = topic.get("messages", [])
            if not topic_msgs:
                continue

            messages: list[dict[str, Any]] = []
            for idx, msg in enumerate(topic_msgs, start=1):
                content = msg.get("content", "")
                if not content:
                    continue
                role = msg.get("role", "user")

                messages.append({
                    "id": msg.get("id") or f"cherry-{idx}",
                    "role": role,
                    "content": content.strip(),
                    "timestamp": _normalize_timestamp(msg.get("createdAt")),
                    "metadata": {
                        "model": model_name
                    }
                })

            if messages:
                tid = topic.get("id") or f"cherry-{len(threads)}"
                # Prefix the topic name so imported threads are easy to recognise
                full_title = f"[Cherry] {topic_name}"
                threads.append(ChatThread(id=str(tid), title=full_title, messages=messages))

    return threads


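# Minimal illustrative input for _try_cherry_studio (only the keys read above).
# As the code allows, the "assistants" slice may also arrive as a JSON-encoded
# string rather than a list:
#
#     {
#         "localStorage": {
#             "persist:cherry-studio": json.dumps({
#                 "assistants": [
#                     {
#                         "model": {"name": "example-model"},
#                         "topics": [
#                             {
#                                 "id": "t1",
#                                 "name": "Notes",
#                                 "messages": [
#                                     {"id": "m1", "role": "user", "content": "Hi",
#                                      "createdAt": "2024-01-02T03:04:05Z"},
#                                     {"id": "m2", "role": "assistant", "content": "Hello"},
#                                 ],
#                             }
#                         ],
#                     }
#                 ]
#             })
#         }
#     }

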
# =============================================================================
# 5. DeepSeek
# =============================================================================

def _try_deepseek_export(data: Any) -> list[ChatThread]:
    """Parse a DeepSeek export file."""
    if not isinstance(data, list) or not data:
        return []

    # Feature detection: items carry a "mapping" whose message nodes use "fragments"
    first = data[0]
    if not isinstance(first, Mapping) or "mapping" not in first:
        return []
    # Simple heuristic: check whether any message in the mapping has fragments
    is_deepseek = False
    mapping_sample = first.get("mapping", {})
    for node in mapping_sample.values():
        if node and (node.get("message") or {}).get("fragments"):
            is_deepseek = True
            break

    if not is_deepseek:
        return []

    threads: list[ChatThread] = []

    for idx, conversation in enumerate(data, start=1):
        title = conversation.get("title", f"DeepSeek Conversation_{idx}")
        mapping = conversation.get("mapping", {})
        if not mapping:
            continue

        # Find the leaf nodes
        all_nodes = set(mapping.keys())
        parent_nodes = {node.get("parent") for node in mapping.values() if node}
        leaf_nodes = all_nodes - parent_nodes - {"root"}

        if not leaf_nodes:
            continue

        # Extract the longest branch (the main branch). To keep every branch,
        # iterate over leaf_nodes instead; the longest branch is used here to
        # keep the UI simple.
        longest_branch_ids: list[str] = []

        # Helper to reconstruct the path from a leaf back to the root
        def reconstruct_path(leaf_id: str) -> list[str]:
            path: list[str] = []
            cur = leaf_id
            while cur and cur != "root":
                if cur not in mapping:
                    break
                path.insert(0, cur)
                cur = mapping[cur].get("parent", "root")
            return path

        branches = [reconstruct_path(leaf) for leaf in leaf_nodes]
        if not branches:
            continue

        longest_branch_ids = max(branches, key=len)

        messages: list[dict[str, Any]] = []
        for node_id in longest_branch_ids:
            node = mapping.get(node_id) or {}
            msg_obj = node.get("message", {})
            if not msg_obj:
                continue

            # DeepSeek-specific content extraction: fragments carry the content
            # parts and identify the speaker.
            fragments = msg_obj.get("fragments", [])
            content_buf: list[str] = []
            role = "user"  # default

            for frag in fragments:
                text = frag.get("content", "")
                frag_type = frag.get("type")
                if frag_type == "REQUEST":
                    role = "user"
                elif frag_type == "RESPONSE":
                    role = "assistant"

                if text:
                    content_buf.append(text)

            full_text = "".join(content_buf)

            # If fragments are missing (rare), fall back to the author/content fields
            if not fragments:
                role = msg_obj.get("author", {}).get("role", "user")
                full_text = msg_obj.get("content", "")

            if not full_text.strip():
                continue

            messages.append({
                "id": node_id,
                "role": role,
                "content": full_text.strip(),
                # DeepSeek exports carry a single conversation-level timestamp
                "timestamp": _normalize_timestamp(conversation.get("inserted_at")),
                "metadata": {}
            })

        if messages:
            conv_id = conversation.get("id") or f"deepseek-{idx}"
            threads.append(ChatThread(id=str(conv_id), title=f"[DeepSeek] {title}", messages=messages))

    return threads


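# Minimal illustrative input for _try_deepseek_export (keys taken from the code
# above; real DeepSeek exports carry more metadata):
#
#     [
#         {
#             "id": "ds-1",
#             "title": "Hello thread",
#             "inserted_at": 1700000000,
#             "mapping": {
#                 "root": {"parent": None, "message": None},
#                 "n1": {
#                     "parent": "root",
#                     "message": {"fragments": [{"type": "REQUEST", "content": "Hi"}]},
#                 },
#                 "n2": {
#                     "parent": "n1",
#                     "message": {"fragments": [{"type": "RESPONSE", "content": "Hello!"}]},
#                 },
#             },
#         }
#     ]

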
# =============================================================================
# Common Helpers
# =============================================================================

def _reconstruct_longest_branch(mapping: Mapping[str, Any], leaf_id: str | None) -> list[str]:
    """Generic branch reconstruction, preferring the provided leaf_id or the longest path."""
    if not leaf_id:
        # Choose the longest path over all leaves
        nodes = set(mapping.keys())
        parents = {node.get("parent") for node in mapping.values() if node}
        leaf_nodes = nodes - parents - {"root"}
        longest: list[str] = []

        def get_path(leaf: str) -> list[str]:
            path = []
            cur = leaf
            visited = set()
            while cur and cur not in visited and cur != "root":
                node = mapping.get(cur)
                if not node:
                    break
                path.append(cur)
                visited.add(cur)
                cur = node.get("parent")
            path.reverse()
            return path

        for leaf in leaf_nodes:
            branch = get_path(leaf)
            if len(branch) > len(longest):
                longest = branch
        return longest

    # Reconstruct from the specific leaf
    path: list[str] = []
    visited: set[str] = set()
    cur = leaf_id
    while cur and cur not in visited and cur != "root":
        node = mapping.get(cur)
        if not node:
            break
        path.append(cur)
        visited.add(cur)
        cur = node.get("parent")
    path.reverse()
    return path


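# Worked example for _reconstruct_longest_branch (illustrative only):
#
#     mapping = {
#         "a": {"parent": "root"},
#         "b": {"parent": "a"},
#         "c": {"parent": "b"},
#         "d": {"parent": "a"},   # a shorter side branch
#     }
#     _reconstruct_longest_branch(mapping, "c")   -> ["a", "b", "c"]
#     _reconstruct_longest_branch(mapping, None)  -> ["a", "b", "c"]  (longest leaf path wins)

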
def _normalize_timestamp(value: Any) -> str:
    if value is None:
        return ""
    if isinstance(value, (int, float)):
        try:
            if value > 1e12:  # milliseconds
                dt = datetime.fromtimestamp(value / 1000)
            else:
                dt = datetime.fromtimestamp(value)
            return dt.isoformat(sep=" ", timespec="minutes")
        except Exception:
            return str(value)
    if isinstance(value, str):
        try:
            dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
            return dt.isoformat(sep=" ", timespec="minutes")
        except Exception:
            return str(value)
    return str(value)


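# Illustrative behaviour of _normalize_timestamp (a sketch, not a doctest):
#
#     _normalize_timestamp(None)                    -> ""
#     _normalize_timestamp(1700000000)              -> e.g. "2023-11-14 22:13" (local time,
#                                                      so the exact string depends on the host timezone)
#     _normalize_timestamp(1700000000000)           -> same instant, detected as milliseconds
#     _normalize_timestamp("2024-01-02T03:04:05Z")  -> "2024-01-02 03:04+00:00"
#     _normalize_timestamp("not a date")            -> "not a date"

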
# =============================================================================
# Text Import
# =============================================================================

def parse_text_transcript(text: str) -> list[ChatThread]:
    """
    Minimal plain-text parsing.
    Splits only on the user-specified strong separators.
    """
    if not text.strip():
        return []

    lines = text.strip().split('\n')
    messages: list[dict[str, Any]] = []

    # User-specified strong separators:
    # 1. [你说]
    # 2. 你说:
    # 3. ChatGPT说:
    # 4. ChatGPT 说:

    markers = [
        (r"\[你说\]", "user"),
        (r"你说[::]", "user"),
        (r"ChatGPT\s*说[::]?", "assistant"),  # matches "ChatGPT说" with or without a colon
        (r"AI\s*说[::]?", "assistant"),  # defensive addition for "AI 说"
    ]

    # Also keep the basic User/AI colon format, anchored to the start of the line
    markers.extend([
        (r"^User[::]", "user"),
        (r"^AI[::]", "assistant"),
    ])

    for line in lines:
        line_str = line.strip()
        if not line_str:
            continue

        # Match a role marker
        matched_role = None
        for pattern, role in markers:
            if re.search(pattern, line_str, re.IGNORECASE):
                matched_role = role
                break

        if matched_role:
            # Header line: try to extract the content after the colon
            content = ""
            parts = re.split(r"[::]", line_str, maxsplit=1)
            if len(parts) > 1:
                content = parts[1].strip()

            messages.append({
                "id": f"text-{len(messages)+1}",
                "role": matched_role,
                "content": content,
                "timestamp": None
            })
        else:
            # Content line: append to the previous message
            if messages:
                if messages[-1]["content"]:
                    messages[-1]["content"] += "\n" + line
                else:
                    messages[-1]["content"] = line
            else:
                # The very first line is content with no marker; default to user
                messages.append({
                    "id": "text-1",
                    "role": "user",
                    "content": line,
                    "timestamp": None
                })

    if not messages:
        return []

    msg_count = len(messages)
    preview = messages[0]["content"][:10].replace("\n", " ")
    title = f"Text Import ({msg_count} messages) - {preview}..."

    return [ChatThread(id="pasted-text", title=title, messages=messages)]


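# Illustrative input for parse_text_transcript, using the Chinese / English
# markers defined above (a sketch, not a doctest):
#
#     transcript = (
#         "你说: 今天天气怎么样?\n"
#         "ChatGPT 说: 我无法获取实时天气。\n"
#         "不过你可以查看天气预报。\n"
#         "User: thanks"
#     )
#     parse_text_transcript(transcript)
#     # -> one ChatThread with roles user, assistant (the unmarked third line is
#     #    appended to the assistant message), user

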
# =============================================================================
# HTML Import
# =============================================================================

class ChatGPTHTMLParser(HTMLParser):
    VOID_ELEMENTS = {
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr'
    }
    # meta and link are deliberately left out of IGNORE_TAGS: they are void
    # elements and never trigger handle_endtag, so including them would leave
    # ignore_level stuck and wrongly suppress everything that follows.
    IGNORE_TAGS = {'button', 'svg', 'style', 'script', 'head', 'title'}

    def __init__(self):
        super().__init__()
        self.messages = []
        self.current_msg = None
        self.recording = False
        self.depth = 0
        self.turn_start_depth = 0
        self.ignore_level = 0

    def handle_starttag(self, tag, attrs):
        if tag in self.IGNORE_TAGS:
            self.ignore_level += 1

        # Void elements do not increase depth because they have no end tags
        if tag not in self.VOID_ELEMENTS:
            self.depth += 1

        attrs_dict = dict(attrs)

        # Detect a turn container (data-testid="conversation-turn-X")
        if 'data-testid' in attrs_dict and attrs_dict['data-testid'].startswith('conversation-turn-'):
            # A new turn forces the previous one to be saved (guards against a
            # depth miscount leaving a turn unsaved)
            if self.current_msg:
                self._save_current()

            # Default to user; updated if a role attribute is found later
            self.current_msg = {"role": "user", "content": []}
            self.recording = True
            self.turn_start_depth = self.depth
            return

        # While recording, look for the author-role attribute to correct the role
        if self.recording and 'data-message-author-role' in attrs_dict:
            self.current_msg["role"] = attrs_dict['data-message-author-role']

    def handle_endtag(self, tag):
        if tag in self.IGNORE_TAGS:
            if self.ignore_level > 0:
                self.ignore_level -= 1

        # Insert a line break when a block-level element closes
        if self.recording and self.ignore_level == 0:
            if tag in ('p', 'div', 'br', 'li', 'tr', 'pre', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                self.current_msg["content"].append("\n")

        if tag not in self.VOID_ELEMENTS:
            # Closing the turn container
            if self.recording and self.depth == self.turn_start_depth:
                self._save_current()
            self.depth -= 1

    def handle_data(self, data):
        if self.recording and self.ignore_level == 0:
            # Keep the raw text (no strip) so inline elements don't run together
            self.current_msg["content"].append(data)

    def close(self):
        super().close()
        # Make sure the final message is saved
        if self.current_msg:
            self._save_current()

    def _save_current(self):
        if self.current_msg:
            full_text = "".join(self.current_msg["content"]).strip()
            # Collapse runs of blank lines
            full_text = re.sub(r'\n{3,}', '\n\n', full_text)

            if full_text:
                self.messages.append({
                    "role": self.current_msg["role"],
                    "content": full_text
                })
        self.current_msg = None
        self.recording = False


def parse_html_transcript(html_content: str) -> list[ChatThread]:
    """Parse a ChatGPT HTML export file."""
    parser = ChatGPTHTMLParser()
    parser.feed(html_content)
    # close() flushes a turn that is still open when the document ends
    parser.close()

    if not parser.messages:
        return []

    # Build the ChatThread
    title = "HTML Imported Conversation"
    title_match = re.search(r"<title>(.*?)</title>", html_content)
    if title_match:
        title = title_match.group(1)

    # Convert to the standard message format
    msgs = []
    for idx, msg in enumerate(parser.messages, 1):
        msgs.append({
            "id": f"html-{idx}",
            "role": msg["role"],
            "content": msg["content"],
            "timestamp": None
        })

    return [ChatThread(id="html-import", title=title, messages=msgs)]


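# Illustrative usage of parse_html_transcript on a stripped-down snippet shaped
# like a ChatGPT HTML export (real exports wrap each turn in far more markup):
#
#     html = (
#         "<html><head><title>Demo chat</title></head><body>"
#         '<div data-testid="conversation-turn-1">'
#         '<div data-message-author-role="user">Hello</div></div>'
#         '<div data-testid="conversation-turn-2">'
#         '<div data-message-author-role="assistant">Hi there</div></div>'
#         "</body></html>"
#     )
#     parse_html_transcript(html)
#     # -> one ChatThread titled "Demo chat" with a user and an assistant message

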
__all__ = [
    "ChatThread",
    "ChatImporterError",
    "detect_threads",
    "parse_text_transcript",
    "parse_html_transcript",
]