feat: 兼容weflow导出的json格式

This commit is contained in:
digua
2026-01-30 22:34:46 +08:00
committed by digua
parent 9d84c35b7a
commit fc3143563d
6 changed files with 138 additions and 70 deletions
@@ -114,7 +114,7 @@ export const feature: FormatFeature = {
id: 'chatlab-jsonl',
name: 'ChatLab JSONL',
platform: KNOWN_PLATFORMS.UNKNOWN,
priority: 2, // 仅次于 ChatLab JSON
priority: 51, // 低优先级,让其他格式先匹配
extensions: ['.jsonl'],
signatures: {
// 第一行必须是 header 类型,包含 chatlab 信息
+2 -3
View File
@@ -46,11 +46,10 @@ export const feature: FormatFeature = {
id: 'chatlab',
name: 'ChatLab JSON',
platform: KNOWN_PLATFORMS.UNKNOWN, // ChatLab 格式可能包含多平台数据
priority: 1, // 最高优先级
priority: 50, // 优先级,让其他格式先匹配
extensions: ['.json'],
signatures: {
// 只要求 chatlab 字段在文件头8KB,其他字段在解析时验证
// 移除过于宽松的 version 签名,只保留 chatlab 对象签名
// 只要求 chatlab 字段在文件头,其他字段在解析时验证
head: [/"chatlab"\s*:\s*\{/],
requiredFields: ['chatlab'],
},
+6 -5
View File
@@ -10,7 +10,7 @@ import chatlab from './chatlab'
import chatlabJsonl from './chatlab-jsonl'
import shuakamiQqExporter from './shuakami-qq-exporter'
import shuakamiQqExporterChunked from './shuakami-qq-exporter-chunked'
import yccccccyEchotrace from './ycccccccy-echotrace'
import weflow from './weflow'
import tyrrrzDiscordExporter from './tyrrrz-discord-exporter'
import whatsappNativeTxt from './whatsapp-native-txt'
import qqNativeTxt from './qq-native-txt'
@@ -18,17 +18,18 @@ import instagramNative from './instagram-native'
/**
* 所有支持的格式模块(按优先级排序)
* 注意:注册时会自动按 priority 字段排序
*/
export const formats: FormatModule[] = [
chatlab, // 优先级 1 - ChatLab JSON
chatlabJsonl, // 优先级 2 - ChatLab JSONL(流式格式,支持超大文件)
shuakamiQqExporterChunked, // 优先级 5 - shuakami/qq-chat-exporter chunked-jsonl
shuakamiQqExporter, // 优先级 10 - shuakami/qq-chat-exporter
yccccccyEchotrace, // 优先级 15 - ycccccccy/echotrace
weflow, // 优先级 15 - WeFlow 微信导出
tyrrrzDiscordExporter, // 优先级 20 - Tyrrrz/DiscordChatExporter
instagramNative, // 优先级 25 - Instagram 官方导出
whatsappNativeTxt, // 优先级 26 - WhatsApp 官方导出 TXT
qqNativeTxt, // 优先级 30 - QQ 官方导出 TXT
chatlab, // 优先级 50 - ChatLab JSON
chatlabJsonl, // 优先级 51 - ChatLab JSONL(流式格式,支持超大文件)
]
// 按名称导出,方便单独使用
@@ -37,7 +38,7 @@ export {
chatlabJsonl,
shuakamiQqExporter,
shuakamiQqExporterChunked,
yccccccyEchotrace,
weflow,
tyrrrzDiscordExporter,
instagramNative,
whatsappNativeTxt,
@@ -1,8 +1,8 @@
/**
* echotrace
* WeFlow
*
*
* echotrace
* WeFlow
*
*
*
@@ -14,10 +14,10 @@
import type { Preprocessor, ParseProgress } from '../types'
/**
* echotrace
* WeFlow
*
*/
export const echotracePreprocessor: Preprocessor = {
export const weflowPreprocessor: Preprocessor = {
/**
*
*
@@ -54,5 +54,4 @@ export const echotracePreprocessor: Preprocessor = {
},
}
export default echotracePreprocessor
export default weflowPreprocessor
@@ -1,12 +1,14 @@
/**
* ycccccccy/echotrace
* 适配项目: https://github.com/ycccccccy/echotrace
* WeFlow
* 适配项目: WeFlow
*
*
* - session messages
* - weflowsession messages
* - weflow
* - session.wxid: ID @chatroom
* - session.type: "群聊" "私聊"
* - messages[].type:
* - session.avatar: /base64 Data URL
* - messages[].isSend: 1=, 0=, null=
* - messages[].senderUsername: 发送者ID
* - messages[].senderDisplayName: 发送者显示名
*
@@ -46,21 +48,23 @@ function extractNameFromFilePath(filePath: string): string {
// ==================== 特征定义 ====================
export const feature: FormatFeature = {
id: 'ycccccccy-echotrace',
name: 'ycccccccy/echotrace 导出',
id: 'weflow',
name: 'WeFlow 导出',
platform: KNOWN_PLATFORMS.WECHAT,
priority: 15,
extensions: ['.json'],
signatures: {
// 检测顶层字段和特征
head: [/"session"\s*:/, /"senderUsername"\s*:/, /"senderDisplayName"\s*:/],
requiredFields: ['session', 'messages'],
// weflow 对象是唯一识别特征
// 注意:session.avatar 包含 base64 图片,可能很大,所以 messages 字段可能不在 8KB 文件头中
// 只检测 weflow 和 session(它们在文件开头)
head: [/"weflow"\s*:\s*\{/],
requiredFields: ['weflow', 'session'],
},
}
// ==================== 消息结构 ====================
// ==================== 数据结构 ====================
interface EchotraceSession {
interface WeFlowSession {
wxid: string
nickname: string
remark: string
@@ -68,9 +72,10 @@ interface EchotraceSession {
type: '群聊' | '私聊'
lastTimestamp: number
messageCount: number
avatar?: string // 群/用户头像(base64 Data URL
}
interface EchotraceMessage {
interface WeFlowMessage {
localId: number
createTime: number // Unix 时间戳(秒)
formattedTime: string
@@ -84,17 +89,10 @@ interface EchotraceMessage {
source: string
}
// ==================== 头像信息结构 ====================
interface EchotraceAvatarInfo {
displayName: string
base64: string // 原始 base64,不包含 Data URL 前缀
}
// ==================== 消息类型映射 ====================
/**
* echotrace MessageType
* WeFlow MessageType
*/
function convertMessageType(typeStr: string): MessageType {
switch (typeStr) {
@@ -136,6 +134,10 @@ function convertMessageType(typeStr: string): MessageType {
}
}
// ==================== 头像信息结构 ====================
// WeFlow 的 avatars 对象直接存储 base64 Data URL 字符串
// 格式:{ "wxid": "data:image/jpeg;base64,..." }
// ==================== 成员信息追踪 ====================
interface MemberInfo {
@@ -146,7 +148,7 @@ interface MemberInfo {
// ==================== 解析器实现 ====================
async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
async function* parseWeFlow(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
const { filePath, batchSize = 5000, onProgress, onLog } = options
const totalBytes = getFileSize(filePath)
@@ -159,18 +161,82 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
onProgress?.(initialProgress)
// 记录解析开始
onLog?.('info', `开始解析 Echotrace 导出文件,大小: ${(totalBytes / 1024 / 1024).toFixed(2)} MB`)
onLog?.('info', `开始解析 WeFlow 导出文件,大小: ${(totalBytes / 1024 / 1024).toFixed(2)} MB`)
// 读取文件头获取 session 信息
const headContent = readFileHeadBytes(filePath, 2000)
// 读取文件头获取基本信息
const headContent = readFileHeadBytes(filePath, 5000)
// 解析 session
let session: EchotraceSession | null = null
// 使用流式读取获取完整的 session 对象(因为 session.avatar 可能很大)
let session: WeFlowSession | null = null
try {
const sessionMatch = headContent.match(/"session"\s*:\s*(\{[^}]+\})/)
if (sessionMatch) {
session = JSON.parse(sessionMatch[1])
}
await new Promise<void>((resolve) => {
const sessionStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
let sessionContent = ''
let inSession = false
let braceDepth = 0
let inString = false
let escape = false
sessionStream.on('data', (chunk: string | Buffer) => {
const str = typeof chunk === 'string' ? chunk : chunk.toString()
for (let i = 0; i < str.length; i++) {
const char = str[i]
if (!inSession) {
// 查找 "session": 的位置
const searchStr = '"session":'
if (str.slice(i, i + searchStr.length) === searchStr) {
inSession = true
i += searchStr.length - 1
continue
}
} else {
sessionContent += char
if (escape) {
escape = false
continue
}
if (char === '\\' && inString) {
escape = true
continue
}
if (char === '"') {
inString = !inString
continue
}
if (!inString) {
if (char === '{') braceDepth++
if (char === '}') {
braceDepth--
if (braceDepth === 0) {
sessionStream.destroy()
return
}
}
}
}
}
})
sessionStream.on('close', () => {
if (sessionContent) {
try {
session = JSON.parse(sessionContent) as WeFlowSession
} catch {
// 解析失败
}
}
resolve()
})
sessionStream.on('error', () => resolve())
})
} catch {
// 使用默认值
}
@@ -258,11 +324,11 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
// 先尝试从文件头解析(适用于成员较少的聊天)
const avatarsContent = extractAvatarsObject(headContent)
if (avatarsContent) {
const avatarsObj = JSON.parse(avatarsContent) as Record<string, EchotraceAvatarInfo>
for (const [wxid, avatarInfo] of Object.entries(avatarsObj)) {
if (avatarInfo && typeof avatarInfo === 'object' && avatarInfo.base64) {
// 添加 Data URL 前缀
avatarsMap.set(wxid, `data:image/jpeg;base64,${avatarInfo.base64}`)
// WeFlow 的 avatars 值直接是 base64 Data URL 字符串
const avatarsObj = JSON.parse(avatarsContent) as Record<string, string>
for (const [wxid, avatarDataUrl] of Object.entries(avatarsObj)) {
if (avatarDataUrl && typeof avatarDataUrl === 'string') {
avatarsMap.set(wxid, avatarDataUrl)
}
}
}
@@ -270,7 +336,7 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
// avatars 解析失败,继续不带头像
}
// 如果文件头没有完整的 avatars(可能超出 2000 字节),尝试流式读取
// 如果文件头没有完整的 avatars(可能超出 5000 字节),尝试流式读取
if (avatarsMap.size === 0) {
try {
await new Promise<void>((resolve) => {
@@ -334,10 +400,11 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
avatarStream.on('close', () => {
if (avatarsContent) {
try {
const avatarsObj = JSON.parse(avatarsContent) as Record<string, EchotraceAvatarInfo>
for (const [wxid, avatarInfo] of Object.entries(avatarsObj)) {
if (avatarInfo && typeof avatarInfo === 'object' && avatarInfo.base64) {
avatarsMap.set(wxid, `data:image/jpeg;base64,${avatarInfo.base64}`)
// WeFlow 的 avatars 值直接是 base64 Data URL 字符串
const avatarsObj = JSON.parse(avatarsContent) as Record<string, string>
for (const [wxid, avatarDataUrl] of Object.entries(avatarsObj)) {
if (avatarDataUrl && typeof avatarDataUrl === 'string') {
avatarsMap.set(wxid, avatarDataUrl)
}
}
} catch {
@@ -354,8 +421,8 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
}
}
// 提取群头像(从 avatars 中获取群ID对应的头像)
const groupAvatar = groupId ? avatarsMap.get(groupId) : undefined
// 提取群头像(优先从 session.avatar,其次从 avatars 中获取群ID对应的头像)
const groupAvatar = session?.avatar || (groupId ? avatarsMap.get(groupId) : undefined)
// 快速扫描获取 ownerId(通过 isSend === 1 推断)
let ownerId: string | undefined
@@ -364,7 +431,7 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
const scanStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
const scanPipeline = chain([scanStream, parser(), pick({ filter: /^messages\.\d+$/ }), streamValues()])
scanPipeline.on('data', ({ value }: { value: EchotraceMessage }) => {
scanPipeline.on('data', ({ value }: { value: WeFlowMessage }) => {
if (value.isSend === 1 && value.senderUsername && !value.senderUsername.endsWith('@chatroom')) {
ownerId = value.senderUsername
scanStream.destroy() // 找到后立即停止扫描
@@ -404,7 +471,7 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
const pipeline = chain([readStream, parser(), pick({ filter: /^messages\.\d+$/ }), streamValues()])
const processMessage = (msg: EchotraceMessage): ParsedMessage | null => {
const processMessage = (msg: WeFlowMessage): ParsedMessage | null => {
// 验证必要字段
if (!msg.senderUsername || msg.createTime === undefined) {
return null
@@ -420,7 +487,7 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
const accountName = msg.senderDisplayName || platformId
// 获取头像(优先使用 senderAvatarKeyfallback 到 senderUsername
// 获取头像(通过 senderAvatarKey 从 avatarsMap 查找
const avatarKey = msg.senderAvatarKey || msg.senderUsername
const avatar = avatarsMap.get(avatarKey)
@@ -445,28 +512,30 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
const type = convertMessageType(msg.type)
// 确保 content 是字符串类型(防止某些消息类型的 content 是对象)
// 同时去除开头和结尾的空白字符
let content: string | null = null
if (msg.content != null) {
content = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)
const rawContent = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)
content = rawContent.trim() || null
}
return {
platformMessageId: String(msg.localId), // 消息的平台原始 ID(用于回复关联查询)
senderPlatformId: platformId,
senderAccountName: accountName,
// echotrace 格式没有单独的群昵称字段,使用 null 而非 undefinedSQLite 兼容)
// WeFlow 格式没有单独的群昵称字段,使用 null 而非 undefinedSQLite 兼容)
senderGroupNickname: null,
timestamp: msg.createTime,
type,
content,
// 注意:echotrace 导出格式不包含被引用消息的 ID,所以 replyToMessageId 为空
// 注意:WeFlow 导出格式不包含被引用消息的 ID,所以 replyToMessageId 为空
}
}
// 用于收集批次的临时数组
const batchCollector: ParsedMessage[] = []
pipeline.on('data', ({ value }: { value: EchotraceMessage }) => {
pipeline.on('data', ({ value }: { value: WeFlowMessage }) => {
const parsed = processMessage(value)
if (parsed) {
batchCollector.push(parsed)
@@ -532,20 +601,20 @@ async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent
export const parser_: Parser = {
feature,
parse: parseEchotrace,
parse: parseWeFlow,
}
// ==================== 预处理器(预留) ====================
import { echotracePreprocessor } from './ycccccccy-echotrace-preprocessor'
export const preprocessor = echotracePreprocessor
import { weflowPreprocessor } from './weflow-preprocessor'
export const preprocessor = weflowPreprocessor
// ==================== 导出格式模块 ====================
const module_: FormatModule = {
feature,
parser: parser_,
preprocessor: echotracePreprocessor,
preprocessor: weflowPreprocessor,
}
export default module_