Files
ChatLab/electron/main/parser/formats/ycccccccy-echotrace.ts
2025-12-21 17:20:06 +08:00

516 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* ycccccccy/echotrace 微信导出格式解析器
* 适配项目: https://github.com/ycccccccy/echotrace
*
* 特征:
* - 顶层包含 session 和 messages 字段
* - session.wxid: 微信ID(群聊以 @chatroom 结尾)
* - session.type: "群聊" 或 "私聊"
* - messages[].type: 中文消息类型字符串
* - messages[].senderUsername: 发送者微信ID
* - messages[].senderDisplayName: 发送者显示名
*
* 注意: localType 字段不可信,不使用
*/
import * as fs from 'fs'
import * as path from 'path'
import { parser } from 'stream-json'
import { pick } from 'stream-json/filters/Pick'
import { streamValues } from 'stream-json/streamers/StreamValues'
import { chain } from 'stream-chain'
import { KNOWN_PLATFORMS, ChatType, MessageType } from '../../../../src/types/base'
import type {
FormatFeature,
FormatModule,
Parser,
ParseOptions,
ParseEvent,
ParsedMeta,
ParsedMember,
ParsedMessage,
} from '../types'
import { getFileSize, createProgress, readFileHeadBytes } from '../utils'
// ==================== 辅助函数 ====================
/**
 * Derives a chat name from the export file's path.
 *
 * Takes the last path segment and drops a trailing `.json` extension
 * (matched case-insensitively). When nothing is left, the placeholder
 * name '未知聊天' is returned instead.
 *
 * @param filePath Path of the exported chat file.
 * @returns The file name without its `.json` suffix, or the placeholder.
 */
function extractNameFromFilePath(filePath: string): string {
  const jsonSuffix = '.json'
  const fileName = path.basename(filePath)
  const hasJsonSuffix = fileName.toLowerCase().endsWith(jsonSuffix)
  const stem = hasJsonSuffix ? fileName.slice(0, fileName.length - jsonSuffix.length) : fileName
  if (stem.length === 0) {
    return '未知聊天'
  }
  return stem
}
// ==================== 特征定义 ====================
/**
 * Format descriptor for ycccccccy/echotrace WeChat exports.
 *
 * Declares the format id, display name, platform (WeChat), priority, the
 * accepted file extension and the signatures used to recognise the format.
 * NOTE(review): how `priority`, `head` and `requiredFields` are evaluated is
 * decided by the format-detection code outside this file — confirm there.
 */
export const feature: FormatFeature = {
id: 'ycccccccy-echotrace',
name: 'ycccccccy/echotrace 微信导出',
platform: KNOWN_PLATFORMS.WECHAT,
priority: 15,
extensions: ['.json'],
signatures: {
// Detect the top-level `session` key and the per-message sender fields
head: [/"session"\s*:/, /"senderUsername"\s*:/, /"senderDisplayName"\s*:/],
// Top-level keys an echotrace export is expected to contain
requiredFields: ['session', 'messages'],
},
}
// ==================== 消息结构 ====================
/**
 * Shape of the top-level `session` object of an echotrace export.
 *
 * The parser in this file reads `wxid`, `nickname`, `displayName` and `type`;
 * the remaining fields are declared but not read here.
 */
interface EchotraceSession {
// WeChat ID of the conversation; for group chats it ends with "@chatroom"
wxid: string
nickname: string
remark: string
// Preferred over `nickname` when choosing the chat name
displayName: string
// '群聊' = group chat, '私聊' = private chat
type: '群聊' | '私聊'
// presumably the timestamp of the latest message — TODO confirm unit against the exporter
lastTimestamp: number
messageCount: number
}
/**
 * Shape of one element of the top-level `messages` array of an echotrace export.
 */
interface EchotraceMessage {
localId: number
createTime: number // Unix timestamp (seconds)
formattedTime: string
type: string // Message type as a Chinese label
localType: number // Unreliable; not used
content: string
isSend: number | null // 0 = received, 1 = sent, null = system
senderUsername: string // Sender's WeChat ID
senderDisplayName: string // Sender's display name
senderAvatarKey: string // Avatar lookup key (usually the same as senderUsername)
source: string
}
// ==================== 头像信息结构 ====================
/**
 * Shape of one value in the top-level `avatars` object, which is keyed by WeChat ID.
 */
interface EchotraceAvatarInfo {
displayName: string
base64: string // Raw base64 data, without the Data URL prefix
}
// ==================== 消息类型映射 ====================
/**
 * Lookup table from echotrace's Chinese type labels to MessageType.
 * Populated on the first call to convertMessageType.
 */
let echotraceTypeTable: Map<string, MessageType> | undefined

/**
 * Converts an echotrace message-type label (a Chinese string) into the
 * standard MessageType.
 *
 * @param typeStr Value of `messages[].type` in the export.
 * @returns The matching MessageType, or MessageType.OTHER for any label that
 *          is not listed (including the exporter's "未知类型(xxxxx)" labels).
 */
function convertMessageType(typeStr: string): MessageType {
  if (echotraceTypeTable === undefined) {
    echotraceTypeTable = new Map<string, MessageType>([
      ['文本消息', MessageType.TEXT],
      ['图片消息', MessageType.IMAGE],
      ['语音消息', MessageType.VOICE],
      ['视频消息', MessageType.VIDEO],
      ['文件消息', MessageType.FILE],
      ['动画表情', MessageType.EMOJI],
      ['名片消息', MessageType.CONTACT],
      // Two labels share the LINK type
      ['卡片式链接', MessageType.LINK],
      ['图文消息', MessageType.LINK],
      ['位置消息', MessageType.LOCATION],
      ['红包卡片', MessageType.RED_PACKET],
      ['转账卡片', MessageType.TRANSFER],
      // Two labels share the SHARE type
      ['小程序分享', MessageType.SHARE],
      ['视频号直播卡片', MessageType.SHARE],
      ['引用消息', MessageType.REPLY],
      ['聊天记录合并转发', MessageType.FORWARD],
      ['系统消息', MessageType.SYSTEM],
    ])
  }
  const mapped = echotraceTypeTable.get(typeStr)
  if (mapped === undefined) {
    // Unknown or unlisted label
    return MessageType.OTHER
  }
  return mapped
}
// ==================== 成员信息追踪 ====================
/**
 * Per-sender record accumulated while messages are parsed, keyed by
 * `platformId` in the parser's member map.
 */
interface MemberInfo {
platformId: string
accountName: string
avatar: string | undefined // Avatar as a base64 Data URL
}
// ==================== 解析器实现 ====================
/**
 * Creates an incremental scanner that extracts the JSON object stored under
 * `key` — the `{ ... }` of `"key": { ... }` — from text delivered in
 * arbitrary chunks.
 *
 * The returned function is fed consecutive pieces of the document. It returns
 * `null` until the object's closing brace has been seen, and the complete
 * object text from then on. Braces inside JSON strings (including escaped
 * quotes) are ignored, and a key that is split across two chunks is still
 * recognised because the tail of each chunk is carried over to the next one.
 *
 * @param key Property name to look for (used verbatim; callers pass literals).
 */
function createJsonObjectScanner(key: string): (chunk: string) => string | null {
  const quotedKey = `"${key}"`
  const pieces: string[] = []
  let phase: 'key' | 'colon' | 'brace' | 'body' | 'done' = 'key'
  let carry = ''
  let depth = 0
  let inString = false
  let escaped = false
  let result: string | null = null

  return (chunk: string): string | null => {
    if (phase === 'done') return result

    const text = carry + chunk
    carry = ''
    // Index in `text` where the object's text starts (0 when it began in an earlier chunk)
    let bodyStart = 0
    let i = 0

    while (i < text.length) {
      if (phase === 'key') {
        const hit = text.indexOf(quotedKey, i)
        if (hit === -1) {
          // Keep just enough of the tail to complete a key split across chunks
          carry = text.slice(Math.max(i, text.length - (quotedKey.length - 1)))
          return null
        }
        i = hit + quotedKey.length
        phase = 'colon'
        continue
      }

      const ch = text[i]

      if (phase !== 'body') {
        // Between the key and the object: optional whitespace, ':', optional whitespace, '{'
        if (ch === ' ' || ch === '\n' || ch === '\r' || ch === '\t') {
          i++
        } else if (phase === 'colon' && ch === ':') {
          phase = 'brace'
          i++
        } else if (phase === 'brace' && ch === '{') {
          phase = 'body'
          depth = 1
          bodyStart = i
          i++
        } else {
          // Not `"key": {` (e.g. the text was a string value) — resume the search here
          phase = 'key'
        }
        continue
      }

      if (escaped) {
        escaped = false
      } else if (inString) {
        if (ch === '\\') escaped = true
        else if (ch === '"') inString = false
      } else if (ch === '"') {
        inString = true
      } else if (ch === '{') {
        depth++
      } else if (ch === '}' && --depth === 0) {
        pieces.push(text.slice(bodyStart, i + 1))
        result = pieces.join('')
        phase = 'done'
        return result
      }
      i++
    }

    if (phase === 'body') pieces.push(text.slice(bodyStart))
    return null
  }
}

/**
 * Extracts the complete JSON object stored under `key` from `content`.
 * Returns null when the key is absent or the object is cut off.
 */
function extractJsonObject(content: string, key: string): string | null {
  return createJsonObjectScanner(key)(content)
}

/**
 * Streams the file and returns the text of the first complete JSON object
 * stored under `key`, or null when none is found or the file cannot be read.
 * Reading stops as soon as the object is complete.
 */
function scanFileForJsonObject(filePath: string, key: string): Promise<string | null> {
  return new Promise<string | null>((resolve) => {
    const feed = createJsonObjectScanner(key)
    const stream = fs.createReadStream(filePath, { encoding: 'utf-8' })
    let found: string | null = null

    stream.on('data', (chunk: string | Buffer) => {
      if (found !== null) return
      found = feed(typeof chunk === 'string' ? chunk : chunk.toString())
      if (found !== null) stream.destroy()
    })
    stream.on('close', () => resolve(found))
    // Avatars are optional: a read error just means "no avatars"
    stream.on('error', () => resolve(null))
  })
}

/**
 * Parses the text of an `avatars` object (`{ wxid: { displayName, base64 } }`)
 * and stores each entry that has base64 data into `target` as a JPEG Data URL.
 *
 * @throws SyntaxError when `json` is not valid JSON.
 */
function addAvatarsFromJson(json: string, target: Map<string, string>): void {
  const parsed: unknown = JSON.parse(json)
  if (parsed === null || typeof parsed !== 'object') return
  const entries = Object.entries(parsed as Record<string, Partial<EchotraceAvatarInfo> | null>)
  for (const [wxid, info] of entries) {
    if (info && typeof info === 'object' && info.base64) {
      // The export stores raw base64; add the Data URL prefix
      target.set(wxid, `data:image/jpeg;base64,${info.base64}`)
    }
  }
}

/**
 * Builds the wxid → avatar Data URL map.
 *
 * The file head is tried first (enough for chats with few members). When the
 * head does not contain a complete, parseable `avatars` object with at least
 * one avatar, the whole file is streamed instead. Any failure results in an
 * empty or partial map; avatars never abort the import.
 */
async function loadEchotraceAvatars(filePath: string, headContent: string): Promise<Map<string, string>> {
  const avatars = new Map<string, string>()

  const fromHead = extractJsonObject(headContent, 'avatars')
  if (fromHead !== null) {
    try {
      addAvatarsFromJson(fromHead, avatars)
    } catch {
      // Unparseable head copy: fall through to the streaming scan
    }
  }
  if (avatars.size > 0) return avatars

  const fromFile = await scanFileForJsonObject(filePath, 'avatars')
  if (fromFile !== null) {
    try {
      addAvatarsFromJson(fromFile, avatars)
    } catch {
      // Continue without avatars
    }
  }
  return avatars
}

/**
 * Parses a ycccccccy/echotrace WeChat export.
 *
 * Events are yielded in this order: an initial `progress`, `meta`, `members`,
 * zero or more `messages` batches, a final `progress`, then `done`.
 * Because members are emitted before messages, all messages are collected in
 * memory first; while the file is being read, progress is reported only
 * through `options.onProgress`.
 *
 * Messages are skipped when they are not objects, lack a string
 * `senderUsername` or a `createTime`, or when the sender is the group itself
 * (an ID ending in "@chatroom").
 *
 * @param options `filePath` of the export, optional `batchSize` (default 5000;
 *                values that are not a finite number >= 1 fall back to the
 *                default) and optional `onProgress` callback.
 * @throws Rejects with the stream/JSON error if reading or parsing the
 *         `messages` array fails.
 */
async function* parseEchotrace(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
  const { filePath, onProgress } = options
  const requestedBatchSize = options.batchSize ?? 5000
  // A non-positive or NaN batch size would stall or truncate the batching loop below
  const batchSize =
    Number.isFinite(requestedBatchSize) && requestedBatchSize >= 1 ? Math.floor(requestedBatchSize) : 5000

  const totalBytes = getFileSize(filePath)
  let bytesRead = 0

  // Initial progress
  const initialProgress = createProgress('parsing', 0, totalBytes, 0, '开始解析...')
  yield { type: 'progress', data: initialProgress }
  onProgress?.(initialProgress)

  // The session object is expected near the start of the file
  const headContent = readFileHeadBytes(filePath, 2000)

  let session: EchotraceSession | null = null
  try {
    const sessionText = extractJsonObject(headContent, 'session')
    if (sessionText !== null) {
      session = JSON.parse(sessionText)
    }
  } catch {
    // Fall back to defaults
  }

  // Chat type: prefer session.type, otherwise infer it from the wxid suffix
  let chatType = ChatType.GROUP
  if (session) {
    if (session.type === '私聊') {
      chatType = ChatType.PRIVATE
    } else if (session.type === '群聊') {
      chatType = ChatType.GROUP
    } else if (typeof session.wxid === 'string' && session.wxid && !session.wxid.endsWith('@chatroom')) {
      chatType = ChatType.PRIVATE
    }
  }

  const chatName = session?.displayName || session?.nickname || extractNameFromFilePath(filePath)

  // Group ID (an ID ending in "@chatroom"); only set for group chats
  const groupId = chatType === ChatType.GROUP && session?.wxid ? session.wxid : undefined

  const avatarsMap = await loadEchotraceAvatars(filePath, headContent)

  // The group's own avatar is stored under the group ID
  const groupAvatar = groupId ? avatarsMap.get(groupId) : undefined

  const meta: ParsedMeta = {
    name: chatName,
    platform: KNOWN_PLATFORMS.WECHAT,
    type: chatType,
    groupId,
    groupAvatar,
  }
  yield { type: 'meta', data: meta }

  const memberMap = new Map<string, MemberInfo>()
  const messages: ParsedMessage[] = []

  // Validates one raw element of `messages`, records its sender and converts it
  const toParsedMessage = (raw: unknown): ParsedMessage | null => {
    if (raw === null || typeof raw !== 'object') return null
    const msg = raw as Partial<EchotraceMessage>

    const platformId = msg.senderUsername
    if (typeof platformId !== 'string' || platformId === '' || msg.createTime == null) {
      return null
    }
    // Messages "sent" by the group itself are usually system notices, not a real member
    if (platformId.endsWith('@chatroom')) {
      return null
    }

    const accountName = msg.senderDisplayName || platformId
    // Prefer senderAvatarKey, fall back to senderUsername
    const avatar = avatarsMap.get(msg.senderAvatarKey || platformId)

    const known = memberMap.get(platformId)
    if (known) {
      // Keep the most recent display name; only replace the avatar when one is available
      known.accountName = accountName
      if (avatar) {
        known.avatar = avatar
      }
    } else {
      memberMap.set(platformId, { platformId, accountName, avatar })
    }

    return {
      senderPlatformId: platformId,
      senderAccountName: accountName,
      // The echotrace format has no separate group-nickname field
      senderGroupNickname: undefined,
      timestamp: msg.createTime,
      type: convertMessageType(msg.type ?? ''),
      content: msg.content || null,
    }
  }

  // Stream the `messages` array element by element
  await new Promise<void>((resolve, reject) => {
    const readStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
    readStream.on('data', (chunk: string | Buffer) => {
      bytesRead += typeof chunk === 'string' ? Buffer.byteLength(chunk) : chunk.length
    })

    const pipeline = chain([readStream, parser(), pick({ filter: /^messages\.\d+$/ }), streamValues()])

    pipeline.on('data', (item: { value: unknown }) => {
      const parsed = toParsedMessage(item.value)
      if (!parsed) return
      messages.push(parsed)

      // Report progress once per full batch
      if (messages.length % batchSize === 0) {
        const progress = createProgress(
          'parsing',
          bytesRead,
          totalBytes,
          messages.length,
          `已处理 ${messages.length} 条消息...`
        )
        onProgress?.(progress)
      }
    })
    pipeline.on('end', () => resolve())
    pipeline.on('error', (err: Error) => {
      // Release the file handle before surfacing the failure
      readStream.destroy()
      reject(err)
    })
  })

  // Members first
  const members: ParsedMember[] = Array.from(memberMap.values(), (m) => ({
    platformId: m.platformId,
    accountName: m.accountName,
    avatar: m.avatar,
  }))
  yield { type: 'members', data: members }

  // Then the messages, in batches
  for (let start = 0; start < messages.length; start += batchSize) {
    yield { type: 'messages', data: messages.slice(start, start + batchSize) }
  }

  // Done
  const doneProgress = createProgress('done', totalBytes, totalBytes, messages.length, '解析完成')
  yield { type: 'progress', data: doneProgress }
  onProgress?.(doneProgress)
  yield {
    type: 'done',
    data: { messageCount: messages.length, memberCount: memberMap.size },
  }
}
// ==================== 导出解析器 ====================
/**
 * Parser entry for the echotrace format: pairs the format descriptor with
 * the `parseEchotrace` async generator.
 */
export const parser_: Parser = {
feature,
parse: parseEchotrace,
}
// ==================== 预处理器(预留) ====================
import { echotracePreprocessor } from './echotrace-preprocessor'
// Re-export of the preprocessor implemented in ./echotrace-preprocessor
export const preprocessor = echotracePreprocessor
// ==================== 导出格式模块 ====================
/**
 * Format module for echotrace exports: bundles the format descriptor, the
 * parser and the preprocessor. Exported as this file's default export.
 */
const module_: FormatModule = {
feature,
parser: parser_,
preprocessor: echotracePreprocessor,
}
export default module_