mirror of
https://github.com/hellodigua/ChatLab.git
synced 2026-05-06 13:06:09 +08:00
feat: 兼容weflow导出的json格式
This commit is contained in:
@@ -114,7 +114,7 @@ export const feature: FormatFeature = {
|
||||
id: 'chatlab-jsonl',
|
||||
name: 'ChatLab JSONL',
|
||||
platform: KNOWN_PLATFORMS.UNKNOWN,
|
||||
priority: 2, // 仅次于 ChatLab JSON
|
||||
priority: 51, // 低优先级,让其他格式先匹配
|
||||
extensions: ['.jsonl'],
|
||||
signatures: {
|
||||
// 第一行必须是 header 类型,包含 chatlab 信息
|
||||
|
||||
@@ -46,11 +46,10 @@ export const feature: FormatFeature = {
|
||||
id: 'chatlab',
|
||||
name: 'ChatLab JSON',
|
||||
platform: KNOWN_PLATFORMS.UNKNOWN, // ChatLab 格式可能包含多平台数据
|
||||
priority: 1, // 最高优先级
|
||||
priority: 50, // 低优先级,让其他格式先匹配
|
||||
extensions: ['.json'],
|
||||
signatures: {
|
||||
// 只要求 chatlab 字段在文件头(8KB),其他字段在解析时验证
|
||||
// 移除过于宽松的 version 签名,只保留 chatlab 对象签名
|
||||
// 只要求 chatlab 字段在文件头,其他字段在解析时验证
|
||||
head: [/"chatlab"\s*:\s*\{/],
|
||||
requiredFields: ['chatlab'],
|
||||
},
|
||||
|
||||
@@ -10,7 +10,7 @@ import chatlab from './chatlab'
|
||||
import chatlabJsonl from './chatlab-jsonl'
|
||||
import shuakamiQqExporter from './shuakami-qq-exporter'
|
||||
import shuakamiQqExporterChunked from './shuakami-qq-exporter-chunked'
|
||||
import yccccccyEchotrace from './ycccccccy-echotrace'
|
||||
import weflow from './weflow'
|
||||
import tyrrrzDiscordExporter from './tyrrrz-discord-exporter'
|
||||
import whatsappNativeTxt from './whatsapp-native-txt'
|
||||
import qqNativeTxt from './qq-native-txt'
|
||||
@@ -18,17 +18,18 @@ import instagramNative from './instagram-native'
|
||||
|
||||
/**
|
||||
* 所有支持的格式模块(按优先级排序)
|
||||
* 注意:注册时会自动按 priority 字段排序
|
||||
*/
|
||||
export const formats: FormatModule[] = [
|
||||
chatlab, // 优先级 1 - ChatLab JSON
|
||||
chatlabJsonl, // 优先级 2 - ChatLab JSONL(流式格式,支持超大文件)
|
||||
shuakamiQqExporterChunked, // 优先级 5 - shuakami/qq-chat-exporter chunked-jsonl
|
||||
shuakamiQqExporter, // 优先级 10 - shuakami/qq-chat-exporter
|
||||
yccccccyEchotrace, // 优先级 15 - ycccccccy/echotrace
|
||||
weflow, // 优先级 15 - WeFlow 微信导出
|
||||
tyrrrzDiscordExporter, // 优先级 20 - Tyrrrz/DiscordChatExporter
|
||||
instagramNative, // 优先级 25 - Instagram 官方导出
|
||||
whatsappNativeTxt, // 优先级 26 - WhatsApp 官方导出 TXT
|
||||
qqNativeTxt, // 优先级 30 - QQ 官方导出 TXT
|
||||
chatlab, // 优先级 50 - ChatLab JSON
|
||||
chatlabJsonl, // 优先级 51 - ChatLab JSONL(流式格式,支持超大文件)
|
||||
]
|
||||
|
||||
// 按名称导出,方便单独使用
|
||||
@@ -37,7 +38,7 @@ export {
|
||||
chatlabJsonl,
|
||||
shuakamiQqExporter,
|
||||
shuakamiQqExporterChunked,
|
||||
yccccccyEchotrace,
|
||||
weflow,
|
||||
tyrrrzDiscordExporter,
|
||||
instagramNative,
|
||||
whatsappNativeTxt,
|
||||
|
||||
+5
-6
@@ -1,8 +1,8 @@
|
||||
/**
|
||||
* echotrace 格式预处理器
|
||||
* WeFlow 格式预处理器
|
||||
* 用于大文件预处理,移除冗余字段
|
||||
*
|
||||
* 当前为预留实现,echotrace 格式的字段结构较为简洁,
|
||||
* 当前为预留实现,WeFlow 格式的字段结构较为简洁,
|
||||
* 暂不需要复杂的预处理逻辑。
|
||||
*
|
||||
* 如果未来发现性能问题,可在此添加:
|
||||
@@ -14,10 +14,10 @@
|
||||
import type { Preprocessor, ParseProgress } from '../types'
|
||||
|
||||
/**
|
||||
* echotrace 预处理器
|
||||
* WeFlow 预处理器
|
||||
* 当前为预留实现,返回不需要预处理
|
||||
*/
|
||||
export const echotracePreprocessor: Preprocessor = {
|
||||
export const weflowPreprocessor: Preprocessor = {
|
||||
/**
|
||||
* 判断是否需要预处理
|
||||
* 当前策略:暂不需要预处理
|
||||
@@ -54,5 +54,4 @@ export const echotracePreprocessor: Preprocessor = {
|
||||
},
|
||||
}
|
||||
|
||||
export default echotracePreprocessor
|
||||
|
||||
export default weflowPreprocessor
|
||||
+122
-53
@@ -1,12 +1,14 @@
|
||||
/**
|
||||
* ycccccccy/echotrace 导出格式解析器
|
||||
* 适配项目: https://github.com/ycccccccy/echotrace
|
||||
* WeFlow 导出格式解析器
|
||||
* 适配项目: WeFlow 聊天记录导出工具
|
||||
*
|
||||
* 特征:
|
||||
* - 顶层包含 session 和 messages 字段
|
||||
* - 顶层包含 weflow、session 和 messages 字段
|
||||
* - weflow 对象包含版本信息和导出时间
|
||||
* - session.wxid: ID(群聊以 @chatroom 结尾)
|
||||
* - session.type: "群聊" 或 "私聊"
|
||||
* - messages[].type: 中文消息类型字符串
|
||||
* - session.avatar: 群/用户头像(base64 Data URL)
|
||||
* - messages[].isSend: 1=发送者本人, 0=接收, null=系统
|
||||
* - messages[].senderUsername: 发送者ID
|
||||
* - messages[].senderDisplayName: 发送者显示名
|
||||
*
|
||||
@@ -46,21 +48,23 @@ function extractNameFromFilePath(filePath: string): string {
|
||||
// ==================== 特征定义 ====================
|
||||
|
||||
export const feature: FormatFeature = {
|
||||
id: 'ycccccccy-echotrace',
|
||||
name: 'ycccccccy/echotrace 导出',
|
||||
id: 'weflow',
|
||||
name: 'WeFlow 导出',
|
||||
platform: KNOWN_PLATFORMS.WECHAT,
|
||||
priority: 15,
|
||||
extensions: ['.json'],
|
||||
signatures: {
|
||||
// 检测顶层字段和特征
|
||||
head: [/"session"\s*:/, /"senderUsername"\s*:/, /"senderDisplayName"\s*:/],
|
||||
requiredFields: ['session', 'messages'],
|
||||
// weflow 对象是唯一识别特征
|
||||
// 注意:session.avatar 包含 base64 图片,可能很大,所以 messages 字段可能不在 8KB 文件头中
|
||||
// 只检测 weflow 和 session(它们在文件开头)
|
||||
head: [/"weflow"\s*:\s*\{/],
|
||||
requiredFields: ['weflow', 'session'],
|
||||
},
|
||||
}
|
||||
|
||||
// ==================== 消息结构 ====================
|
||||
// ==================== 数据结构 ====================
|
||||
|
||||
interface EchotraceSession {
|
||||
interface WeFlowSession {
|
||||
wxid: string
|
||||
nickname: string
|
||||
remark: string
|
||||
@@ -68,9 +72,10 @@ interface EchotraceSession {
|
||||
type: '群聊' | '私聊'
|
||||
lastTimestamp: number
|
||||
messageCount: number
|
||||
avatar?: string // 群/用户头像(base64 Data URL)
|
||||
}
|
||||
|
||||
interface EchotraceMessage {
|
||||
interface WeFlowMessage {
|
||||
localId: number
|
||||
createTime: number // Unix 时间戳(秒)
|
||||
formattedTime: string
|
||||
@@ -84,17 +89,10 @@ interface EchotraceMessage {
|
||||
source: string
|
||||
}
|
||||
|
||||
// ==================== 头像信息结构 ====================
|
||||
|
||||
interface EchotraceAvatarInfo {
|
||||
displayName: string
|
||||
base64: string // 原始 base64,不包含 Data URL 前缀
|
||||
}
|
||||
|
||||
// ==================== 消息类型映射 ====================
|
||||
|
||||
/**
|
||||
* 将 echotrace 中文消息类型转换为标准 MessageType
|
||||
* 将 WeFlow 中文消息类型转换为标准 MessageType
|
||||
*/
|
||||
function convertMessageType(typeStr: string): MessageType {
|
||||
switch (typeStr) {
|
||||
@@ -136,6 +134,10 @@ function convertMessageType(typeStr: string): MessageType {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== 头像信息结构 ====================
|
||||
// WeFlow 的 avatars 对象直接存储 base64 Data URL 字符串
|
||||
// 格式:{ "wxid": "data:image/jpeg;base64,..." }
|
||||
|
||||
// ==================== 成员信息追踪 ====================
|
||||
|
||||
interface MemberInfo {
|
||||
@@ -146,7 +148,7 @@ interface MemberInfo {
|
||||
|
||||
// ==================== 解析器实现 ====================
|
||||
|
||||
async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
|
||||
async function* parseWeFlow(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
|
||||
const { filePath, batchSize = 5000, onProgress, onLog } = options
|
||||
|
||||
const totalBytes = getFileSize(filePath)
|
||||
@@ -159,18 +161,82 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
onProgress?.(initialProgress)
|
||||
|
||||
// 记录解析开始
|
||||
onLog?.('info', `开始解析 Echotrace 导出文件,大小: ${(totalBytes / 1024 / 1024).toFixed(2)} MB`)
|
||||
onLog?.('info', `开始解析 WeFlow 导出文件,大小: ${(totalBytes / 1024 / 1024).toFixed(2)} MB`)
|
||||
|
||||
// 读取文件头获取 session 信息
|
||||
const headContent = readFileHeadBytes(filePath, 2000)
|
||||
// 读取文件头获取基本信息
|
||||
const headContent = readFileHeadBytes(filePath, 5000)
|
||||
|
||||
// 解析 session
|
||||
let session: EchotraceSession | null = null
|
||||
// 使用流式读取获取完整的 session 对象(因为 session.avatar 可能很大)
|
||||
let session: WeFlowSession | null = null
|
||||
try {
|
||||
const sessionMatch = headContent.match(/"session"\s*:\s*(\{[^}]+\})/)
|
||||
if (sessionMatch) {
|
||||
session = JSON.parse(sessionMatch[1])
|
||||
}
|
||||
await new Promise<void>((resolve) => {
|
||||
const sessionStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
|
||||
|
||||
let sessionContent = ''
|
||||
let inSession = false
|
||||
let braceDepth = 0
|
||||
let inString = false
|
||||
let escape = false
|
||||
|
||||
sessionStream.on('data', (chunk: string | Buffer) => {
|
||||
const str = typeof chunk === 'string' ? chunk : chunk.toString()
|
||||
|
||||
for (let i = 0; i < str.length; i++) {
|
||||
const char = str[i]
|
||||
|
||||
if (!inSession) {
|
||||
// 查找 "session": 的位置
|
||||
const searchStr = '"session":'
|
||||
if (str.slice(i, i + searchStr.length) === searchStr) {
|
||||
inSession = true
|
||||
i += searchStr.length - 1
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
sessionContent += char
|
||||
|
||||
if (escape) {
|
||||
escape = false
|
||||
continue
|
||||
}
|
||||
|
||||
if (char === '\\' && inString) {
|
||||
escape = true
|
||||
continue
|
||||
}
|
||||
|
||||
if (char === '"') {
|
||||
inString = !inString
|
||||
continue
|
||||
}
|
||||
|
||||
if (!inString) {
|
||||
if (char === '{') braceDepth++
|
||||
if (char === '}') {
|
||||
braceDepth--
|
||||
if (braceDepth === 0) {
|
||||
sessionStream.destroy()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
sessionStream.on('close', () => {
|
||||
if (sessionContent) {
|
||||
try {
|
||||
session = JSON.parse(sessionContent) as WeFlowSession
|
||||
} catch {
|
||||
// 解析失败:保持 session 为 null
|
||||
}
|
||||
}
|
||||
resolve()
|
||||
})
|
||||
|
||||
sessionStream.on('error', () => resolve())
|
||||
})
|
||||
} catch {
|
||||
// 使用默认值
|
||||
}
|
||||
@@ -258,11 +324,11 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
// 先尝试从文件头解析(适用于成员较少的聊天)
|
||||
const avatarsContent = extractAvatarsObject(headContent)
|
||||
if (avatarsContent) {
|
||||
const avatarsObj = JSON.parse(avatarsContent) as Record<string, EchotraceAvatarInfo>
|
||||
for (const [wxid, avatarInfo] of Object.entries(avatarsObj)) {
|
||||
if (avatarInfo && typeof avatarInfo === 'object' && avatarInfo.base64) {
|
||||
// 添加 Data URL 前缀
|
||||
avatarsMap.set(wxid, `data:image/jpeg;base64,${avatarInfo.base64}`)
|
||||
// WeFlow 的 avatars 值直接是 base64 Data URL 字符串
|
||||
const avatarsObj = JSON.parse(avatarsContent) as Record<string, string>
|
||||
for (const [wxid, avatarDataUrl] of Object.entries(avatarsObj)) {
|
||||
if (avatarDataUrl && typeof avatarDataUrl === 'string') {
|
||||
avatarsMap.set(wxid, avatarDataUrl)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -270,7 +336,7 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
// avatars 解析失败,继续不带头像
|
||||
}
|
||||
|
||||
// 如果文件头没有完整的 avatars(可能超出 2000 字节),尝试流式读取
|
||||
// 如果文件头没有完整的 avatars(可能超出 5000 字节),尝试流式读取
|
||||
if (avatarsMap.size === 0) {
|
||||
try {
|
||||
await new Promise<void>((resolve) => {
|
||||
@@ -334,10 +400,11 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
avatarStream.on('close', () => {
|
||||
if (avatarsContent) {
|
||||
try {
|
||||
const avatarsObj = JSON.parse(avatarsContent) as Record<string, EchotraceAvatarInfo>
|
||||
for (const [wxid, avatarInfo] of Object.entries(avatarsObj)) {
|
||||
if (avatarInfo && typeof avatarInfo === 'object' && avatarInfo.base64) {
|
||||
avatarsMap.set(wxid, `data:image/jpeg;base64,${avatarInfo.base64}`)
|
||||
// WeFlow 的 avatars 值直接是 base64 Data URL 字符串
|
||||
const avatarsObj = JSON.parse(avatarsContent) as Record<string, string>
|
||||
for (const [wxid, avatarDataUrl] of Object.entries(avatarsObj)) {
|
||||
if (avatarDataUrl && typeof avatarDataUrl === 'string') {
|
||||
avatarsMap.set(wxid, avatarDataUrl)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
@@ -354,8 +421,8 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
}
|
||||
}
|
||||
|
||||
// 提取群头像(从 avatars 中获取群ID对应的头像)
|
||||
const groupAvatar = groupId ? avatarsMap.get(groupId) : undefined
|
||||
// 提取群头像(优先从 session.avatar,其次从 avatars 中获取群ID对应的头像)
|
||||
const groupAvatar = session?.avatar || (groupId ? avatarsMap.get(groupId) : undefined)
|
||||
|
||||
// 快速扫描获取 ownerId(通过 isSend === 1 推断)
|
||||
let ownerId: string | undefined
|
||||
@@ -364,7 +431,7 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
const scanStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
|
||||
const scanPipeline = chain([scanStream, parser(), pick({ filter: /^messages\.\d+$/ }), streamValues()])
|
||||
|
||||
scanPipeline.on('data', ({ value }: { value: EchotraceMessage }) => {
|
||||
scanPipeline.on('data', ({ value }: { value: WeFlowMessage }) => {
|
||||
if (value.isSend === 1 && value.senderUsername && !value.senderUsername.endsWith('@chatroom')) {
|
||||
ownerId = value.senderUsername
|
||||
scanStream.destroy() // 找到后立即停止扫描
|
||||
@@ -404,7 +471,7 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
|
||||
const pipeline = chain([readStream, parser(), pick({ filter: /^messages\.\d+$/ }), streamValues()])
|
||||
|
||||
const processMessage = (msg: EchotraceMessage): ParsedMessage | null => {
|
||||
const processMessage = (msg: WeFlowMessage): ParsedMessage | null => {
|
||||
// 验证必要字段
|
||||
if (!msg.senderUsername || msg.createTime === undefined) {
|
||||
return null
|
||||
@@ -420,7 +487,7 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
|
||||
const accountName = msg.senderDisplayName || platformId
|
||||
|
||||
// 获取头像(优先使用 senderAvatarKey,fallback 到 senderUsername)
|
||||
// 获取头像(优先通过 senderAvatarKey 从 avatarsMap 查找,缺失时回退到 senderUsername)
|
||||
const avatarKey = msg.senderAvatarKey || msg.senderUsername
|
||||
const avatar = avatarsMap.get(avatarKey)
|
||||
|
||||
@@ -445,28 +512,30 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
const type = convertMessageType(msg.type)
|
||||
|
||||
// 确保 content 是字符串类型(防止某些消息类型的 content 是对象)
|
||||
// 同时去除开头和结尾的空白字符
|
||||
let content: string | null = null
|
||||
if (msg.content != null) {
|
||||
content = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)
|
||||
const rawContent = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)
|
||||
content = rawContent.trim() || null
|
||||
}
|
||||
|
||||
return {
|
||||
platformMessageId: String(msg.localId), // 消息的平台原始 ID(用于回复关联查询)
|
||||
senderPlatformId: platformId,
|
||||
senderAccountName: accountName,
|
||||
// echotrace 格式没有单独的群昵称字段,使用 null 而非 undefined(SQLite 兼容)
|
||||
// WeFlow 格式没有单独的群昵称字段,使用 null 而非 undefined(SQLite 兼容)
|
||||
senderGroupNickname: null,
|
||||
timestamp: msg.createTime,
|
||||
type,
|
||||
content,
|
||||
// 注意:echotrace 导出格式不包含被引用消息的 ID,所以 replyToMessageId 为空
|
||||
// 注意:WeFlow 导出格式不包含被引用消息的 ID,所以 replyToMessageId 为空
|
||||
}
|
||||
}
|
||||
|
||||
// 用于收集批次的临时数组
|
||||
const batchCollector: ParsedMessage[] = []
|
||||
|
||||
pipeline.on('data', ({ value }: { value: EchotraceMessage }) => {
|
||||
pipeline.on('data', ({ value }: { value: WeFlowMessage }) => {
|
||||
const parsed = processMessage(value)
|
||||
if (parsed) {
|
||||
batchCollector.push(parsed)
|
||||
@@ -532,20 +601,20 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
|
||||
|
||||
export const parser_: Parser = {
|
||||
feature,
|
||||
parse: parseEchotrace,
|
||||
parse: parseWeFlow,
|
||||
}
|
||||
|
||||
// ==================== 预处理器(预留) ====================
|
||||
|
||||
import { echotracePreprocessor } from './ycccccccy-echotrace-preprocessor'
|
||||
export const preprocessor = echotracePreprocessor
|
||||
import { weflowPreprocessor } from './weflow-preprocessor'
|
||||
export const preprocessor = weflowPreprocessor
|
||||
|
||||
// ==================== 导出格式模块 ====================
|
||||
|
||||
const module_: FormatModule = {
|
||||
feature,
|
||||
parser: parser_,
|
||||
preprocessor: echotracePreprocessor,
|
||||
preprocessor: weflowPreprocessor,
|
||||
}
|
||||
|
||||
export default module_
|
||||
Reference in New Issue
Block a user