From c3e28d39dda1ad233b791f4b7732d633b8f753ec Mon Sep 17 00:00:00 2001 From: digua Date: Mon, 12 Jan 2026 01:01:35 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=85=BC=E5=AE=B9shuakami-jsonl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- electron/main/parser/formats/index.ts | 3 + .../formats/shuakami-qq-exporter-chunked.ts | 494 ++++++++++++++++++ 2 files changed, 497 insertions(+) create mode 100644 electron/main/parser/formats/shuakami-qq-exporter-chunked.ts diff --git a/electron/main/parser/formats/index.ts b/electron/main/parser/formats/index.ts index 80af7f0..13a7058 100644 --- a/electron/main/parser/formats/index.ts +++ b/electron/main/parser/formats/index.ts @@ -9,6 +9,7 @@ import type { FormatModule } from '../types' import chatlab from './chatlab' import chatlabJsonl from './chatlab-jsonl' import shuakamiQqExporter from './shuakami-qq-exporter' +import shuakamiQqExporterChunked from './shuakami-qq-exporter-chunked' import yccccccyEchotrace from './ycccccccy-echotrace' import tyrrrzDiscordExporter from './tyrrrz-discord-exporter' import whatsappNativeTxt from './whatsapp-native-txt' @@ -20,6 +21,7 @@ import qqNativeTxt from './qq-native-txt' export const formats: FormatModule[] = [ chatlab, // 优先级 1 - ChatLab JSON chatlabJsonl, // 优先级 2 - ChatLab JSONL(流式格式,支持超大文件) + shuakamiQqExporterChunked, // 优先级 5 - shuakami/qq-chat-exporter chunked-jsonl shuakamiQqExporter, // 优先级 10 - shuakami/qq-chat-exporter yccccccyEchotrace, // 优先级 15 - ycccccccy/echotrace tyrrrzDiscordExporter, // 优先级 20 - Tyrrrz/DiscordChatExporter @@ -32,6 +34,7 @@ export { chatlab, chatlabJsonl, shuakamiQqExporter, + shuakamiQqExporterChunked, yccccccyEchotrace, tyrrrzDiscordExporter, qqNativeTxt, diff --git a/electron/main/parser/formats/shuakami-qq-exporter-chunked.ts b/electron/main/parser/formats/shuakami-qq-exporter-chunked.ts new file mode 100644 index 0000000..d34d1da --- /dev/null +++ 
// electron/main/parser/formats/shuakami-qq-exporter-chunked.ts
/**
 * Parser for the shuakami/qq-chat-exporter "chunked-jsonl" export format.
 * Upstream project: https://github.com/shuakami/qq-chat-exporter
 * Version: 5.x (chunked-jsonl format)
 *
 * File layout:
 * - manifest.json: metadata and chunk index
 *   - metadata: export metadata
 *   - chatInfo: chat information
 *   - chunked.chunks[]: list of chunk files
 *   - avatars?: avatar file info (V5.5+)
 * - chunks/: chunk directory
 *   - chunk_0001.jsonl: one message per line
 *   - chunk_0002.jsonl: ...
 * - avatars.json: avatar data (V5.5+, optional)
 *
 * Message format (`sender` field):
 * - uid: user UID
 * - uin: QQ number
 * - name: display name (group card first, otherwise QQ nickname)
 * - nickname: QQ nickname
 * - groupCard: group nickname (present in group chats)
 * - remark: friend remark
 */

import * as fs from 'fs'
import * as path from 'path'
import * as readline from 'readline'
import { KNOWN_PLATFORMS, ChatType, MessageType } from '../../../../src/types/base'
import type {
  FormatFeature,
  FormatModule,
  Parser,
  ParseOptions,
  ParseEvent,
  ParsedMeta,
  ParsedMember,
  ParsedMessage,
} from '../types'
import { createProgress, parseTimestamp, isValidYear } from '../utils'

// ==================== Feature definition ====================

/** Detection signature and identity of the chunked-jsonl format. */
export const feature: FormatFeature = {
  id: 'shuakami-qq-exporter-chunked',
  name: 'shuakami/qq-chat-exporter (chunked)',
  platform: KNOWN_PLATFORMS.QQ,
  priority: 5, // takes precedence over the single-file variant
  extensions: ['.json'],
  signatures: {
    head: [/"format"\s*:\s*"chunked-jsonl"/, /"chunked"\s*:/],
    requiredFields: ['metadata', 'chatInfo', 'chunked'],
  },
}

// ==================== Type definitions ====================

/** One entry of `manifest.chunked.chunks`; two manifest generations are supported. */
interface ChunkInfo {
  // V5.0 layout
  file?: string
  messages?: number
  bytes?: number
  // V5.5+ layout
  index?: number
  fileName?: string
  relativePath?: string
  count?: number
  start?: string
  end?: string
}

/** Shape of manifest.json as consumed by this parser. */
interface Manifest {
  metadata: {
    name?: string
    copyright?: string
    exportTime: string
    version: string
    format: string
  }
  chatInfo: {
    name: string
    type: string
    selfUid?: string
    selfUin?: string
    selfName?: string
  }
  // Not part of the detection `requiredFields`, so it may be absent.
  statistics?: {
    totalMessages: number
    chunkCount?: number
    timeRange?: {
      start: string
      end: string
      durationDays: number
    }
    messageTypes?: Record<string, number>
    senders?: Array<{
      uid: string
      name: string
      messageCount: number
      percentage: number
    }>
  }
  chunked: {
    format: string
    chunksDir: string
    chunkFileExt: string
    maxMessagesPerChunk: number
    maxBytesPerChunk: number
    chunks: ChunkInfo[]
  }
  avatars?: {
    file: string
    count: number
  }
}

/**
 * One JSONL line. Lines come from an external file, so `sender` and `content`
 * are declared optional and every consumer guards against their absence.
 */
interface ChunkedMessage {
  id?: string
  seq?: string
  timestamp: number
  time?: string
  sender?: {
    uid?: string
    uin?: string
    name: string
    nickname?: string // QQ nickname
    groupCard?: string // group nickname
    remark?: string // friend remark
  }
  type: string
  content?: {
    text: string
    html?: string
    elements?: Array<{ type: string; data?: Record<string, unknown> }>
    resources?: Array<{ type: string }>
    mentions?: Array<{ uid: string; name: string }>
  }
  recalled?: boolean
  system?: boolean
}

/** Member record accumulated while scanning messages. */
interface MemberInfo {
  platformId: string
  accountName: string
  groupNickname: string | undefined
  avatar: string | undefined
}

// ==================== Message type conversion ====================

/**
 * Maps an exporter message to the application's `MessageType`.
 *
 * Precedence: recalled flag, then the `system` type, then the first attached
 * resource, then sticker/face elements, then keyword heuristics on the text.
 * Anything unmatched is `TEXT`.
 *
 * @param msgType - The exporter's `type` field.
 * @param content - The message content; may be missing on malformed lines.
 * @param isRecalled - Whether the message was recalled.
 * @returns The mapped message type.
 */
function convertMessageType(
  msgType: string,
  content: ChunkedMessage['content'],
  isRecalled?: boolean
): MessageType {
  if (isRecalled) return MessageType.RECALL

  // System message
  if (msgType === 'system') return MessageType.SYSTEM

  // Resource type of the first attachment; an unknown type falls through.
  switch (content?.resources?.[0]?.type) {
    case 'image':
      return MessageType.IMAGE
    case 'video':
      return MessageType.VIDEO
    case 'voice':
    case 'audio':
      return MessageType.VOICE
    case 'file':
      return MessageType.FILE
    case 'location':
      return MessageType.LOCATION
  }

  // Faces / stickers
  const elements = content?.elements
  if (
    Array.isArray(elements) &&
    elements.some((e) => e?.type === 'face' || e?.type === 'market_face' || e?.type === 'marketFace')
  ) {
    return MessageType.EMOJI
  }

  // Keyword heuristics on the text
  const rawText = content?.text
  const text = typeof rawText === 'string' ? rawText.trim() : ''
  if (text.includes('QQ红包') || text.includes('发出了红包') || text === '[红包]') return MessageType.RED_PACKET
  if (text.includes('转账') || text === '[转账]') return MessageType.TRANSFER
  if (text.includes('拍了拍') || text.includes('戳了戳') || text === '[拍一拍]') return MessageType.POKE
  if (text.includes('语音通话') || text.includes('视频通话') || text.includes('通话时长')) return MessageType.CALL
  if (text === '[分享]' || text === '[音乐]' || text === '[小程序]') return MessageType.SHARE
  if (text === '[链接]' || text === '[卡片消息]') return MessageType.LINK
  if (text === '[位置]' || text === '[地理位置]') return MessageType.LOCATION
  if (text === '[转发]' || text === '[聊天记录]') return MessageType.FORWARD

  return MessageType.TEXT
}

// ==================== Helpers ====================

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/**
 * Checks the parts of the manifest this parser dereferences unconditionally:
 * `metadata`, `chatInfo` and `chunked` must be objects and `chunked.chunks`
 * must be an array of objects. Leaf fields are not validated.
 */
function hasManifestShape(value: unknown): value is Manifest {
  if (!isRecord(value)) return false
  const { metadata, chatInfo, chunked } = value
  return (
    isRecord(metadata) &&
    isRecord(chatInfo) &&
    isRecord(chunked) &&
    Array.isArray(chunked.chunks) &&
    chunked.chunks.every(isRecord)
  )
}

/**
 * Reads and parses manifest.json.
 *
 * @param manifestPath - Path of the manifest file.
 * @returns The parsed manifest.
 * @throws If the file cannot be read, is not valid JSON, or lacks the
 *   structure described in {@link hasManifestShape}.
 */
function readManifest(manifestPath: string): Manifest {
  const parsed: unknown = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'))
  if (!hasManifestShape(parsed)) {
    throw new Error('manifest.json 结构无效 (需要 metadata / chatInfo / chunked.chunks)')
  }
  return parsed
}

/**
 * Joins `relativePath` onto `baseDir`, refusing paths that leave `baseDir`.
 * The manifest is external input, so entries such as `../../x` are rejected
 * rather than opened.
 *
 * @returns The joined path, or `null` when it resolves to `baseDir` itself or
 *   to a location outside of it.
 */
function resolveInsideBaseDir(baseDir: string, relativePath: string): string | null {
  const root = path.resolve(baseDir)
  const offset = path.relative(root, path.resolve(root, relativePath))
  const escapes =
    offset === '' || offset === '..' || offset.startsWith(`..${path.sep}`) || path.isAbsolute(offset)
  return escapes ? null : path.join(baseDir, relativePath)
}

/**
 * Returns the chunk's path relative to the manifest directory, covering both
 * manifest generations: `relativePath` (V5.5+), then `file` (V5.0), then
 * `chunks/<fileName>` as a fallback.
 *
 * @returns The relative path, or `undefined` when the entry names no file.
 */
function getChunkRelativePath(chunk: ChunkInfo): string | undefined {
  const candidates: unknown[] = [
    chunk.relativePath,
    chunk.file,
    chunk.fileName ? `chunks/${chunk.fileName}` : undefined,
  ]
  return candidates.find((c): c is string => typeof c === 'string' && c.length > 0)
}

/**
 * Returns the message count declared for a chunk: `count` (V5.5+), then
 * `messages` (V5.0), otherwise 0.
 */
function getChunkMessageCount(chunk: ChunkInfo): number {
  return chunk.count ?? chunk.messages ?? 0
}

/**
 * Sums the on-disk size of every chunk file that exists. Entries without a
 * usable path, or whose path leaves `baseDir`, contribute nothing.
 */
function calculateTotalBytes(manifest: Manifest, baseDir: string): number {
  let total = 0
  for (const chunk of manifest.chunked.chunks) {
    const relativePath = getChunkRelativePath(chunk)
    if (!relativePath) continue
    const chunkPath = resolveInsideBaseDir(baseDir, relativePath)
    if (chunkPath && fs.existsSync(chunkPath)) {
      total += fs.statSync(chunkPath).size
    }
  }
  return total
}

/**
 * Loads avatars.json into a map keyed by the JSON object's keys.
 * Only string values starting with `data:image/` are kept. Loading is
 * best-effort: any failure yields the avatars collected so far (possibly none)
 * and is reported through `onLog` when provided.
 *
 * @param baseDir - Directory containing manifest.json.
 * @param avatarsInfo - The manifest's `avatars` entry, if any.
 * @param onLog - Optional logger for load failures.
 */
function readAvatars(
  baseDir: string,
  avatarsInfo?: { file: string; count: number },
  onLog?: ParseOptions['onLog']
): Map<string, string> {
  const avatarsMap = new Map<string, string>()
  if (!avatarsInfo?.file || typeof avatarsInfo.file !== 'string') return avatarsMap

  const avatarsPath = resolveInsideBaseDir(baseDir, avatarsInfo.file)
  if (!avatarsPath || !fs.existsSync(avatarsPath)) return avatarsMap

  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(avatarsPath, 'utf-8'))
    if (isRecord(parsed)) {
      for (const [uin, avatar] of Object.entries(parsed)) {
        if (typeof avatar === 'string' && avatar.startsWith('data:image/')) {
          avatarsMap.set(uin, avatar)
        }
      }
    }
  } catch (error) {
    // Avatars are optional: continue without them, but leave a trace.
    onLog?.('info', `头像文件读取失败,已忽略: ${error instanceof Error ? error.message : String(error)}`)
  }

  return avatarsMap
}

/**
 * Streams a JSONL file line by line.
 *
 * Blank lines, lines that are not valid JSON, and lines whose JSON value is
 * not an object are skipped. The file stream is always destroyed when
 * iteration ends, including when the consumer stops early. A stream failure
 * ends the iteration and is passed to `onReadError` at most once.
 *
 * @param filePath - Path of the JSONL file.
 * @param onReadError - Receives the error if opening or reading the file fails.
 */
async function* readJsonlFile(
  filePath: string,
  onReadError: (error: Error) => void
): AsyncGenerator<ChunkedMessage, void, unknown> {
  const fileStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity,
  })

  let reported = false
  const report = (error: unknown): void => {
    if (reported) return
    reported = true
    onReadError(error instanceof Error ? error : new Error(String(error)))
  }

  // Listen on the stream itself so a read failure is always handled; closing
  // the interface ends the `for await` below.
  fileStream.on('error', (error) => {
    report(error)
    rl.close()
  })

  try {
    for await (const line of rl) {
      const trimmed = line.trim()
      if (!trimmed) continue

      let parsed: unknown
      try {
        parsed = JSON.parse(trimmed)
      } catch {
        // Skip invalid JSON lines
        continue
      }
      // `null`, arrays and scalars are valid JSON but not messages.
      if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
        yield parsed as ChunkedMessage
      }
    }
  } catch (error) {
    report(error)
  } finally {
    rl.close()
    fileStream.destroy()
  }
}

// ==================== Parser implementation ====================

/**
 * Parses a chunked-jsonl export starting from its manifest.json.
 *
 * Events are yielded in this order: an initial `progress`, `meta`, periodic
 * `progress` while chunks are read, `members`, `messages` in batches, a final
 * `progress`, then `done`. A manifest that cannot be read, is structurally
 * invalid, or declares another format yields a single `error` event instead.
 *
 * Messages are buffered until every chunk has been read, because `members` is
 * emitted before the first `messages` batch.
 *
 * Unusable chunks (no path, path outside the export directory, missing file,
 * read failure) are logged and skipped. Messages without a sender id or with
 * an invalid timestamp are skipped and counted.
 *
 * @param options - `filePath` is the manifest path. `batchSize` (default 5000)
 *   controls both the progress cadence and the size of `messages` batches;
 *   values that are not finite numbers >= 1 fall back to the default.
 */
async function* parseChunkedJsonl(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
  const { filePath, onProgress, onLog } = options

  // A batch size below 1 would make the final batching loop never advance.
  const requestedBatchSize = options.batchSize ?? 5000
  const batchSize =
    Number.isFinite(requestedBatchSize) && requestedBatchSize >= 1 ? Math.floor(requestedBatchSize) : 5000

  // The selected file is the manifest; chunk paths are relative to its directory.
  const manifestPath = filePath
  const baseDir = path.dirname(manifestPath)

  // Read manifest
  let manifest: Manifest
  try {
    manifest = readManifest(manifestPath)
  } catch (error) {
    yield { type: 'error', data: new Error(`无法读取 manifest.json: ${error}`) }
    return
  }

  // Validate format
  if (manifest.metadata.format !== 'chunked-jsonl') {
    yield { type: 'error', data: new Error(`不支持的格式: ${manifest.metadata.format}`) }
    return
  }

  const totalBytes = calculateTotalBytes(manifest, baseDir)
  let bytesRead = 0
  let messagesProcessed = 0
  let skippedMessages = 0

  // Initial progress
  const initialProgress = createProgress('parsing', 0, totalBytes, 0, '')
  yield { type: 'progress', data: initialProgress }
  onProgress?.(initialProgress)

  onLog?.(
    'info',
    `开始解析 chunked-jsonl 格式 (V${manifest.metadata.version}),共 ${manifest.chunked.chunks.length} 个分块,预计 ${manifest.statistics?.totalMessages ?? '未知'} 条消息`
  )

  // Load avatars, if present
  const avatarsMap = readAvatars(baseDir, manifest.avatars, onLog)
  if (avatarsMap.size > 0) {
    onLog?.('info', `已加载 ${avatarsMap.size} 个用户头像`)
  }

  // Emit meta
  const chatType = manifest.chatInfo.type === 'group' ? ChatType.GROUP : ChatType.PRIVATE
  const meta: ParsedMeta = {
    name: manifest.chatInfo.name || '未知群聊',
    platform: KNOWN_PLATFORMS.QQ,
    type: chatType,
    ownerId: manifest.chatInfo.selfUin || manifest.chatInfo.selfUid,
  }
  yield { type: 'meta', data: meta }

  // Collected members and messages
  const memberMap = new Map<string, MemberInfo>()
  const messageBatch: ParsedMessage[] = []

  // Walk every chunk file
  for (const chunkInfo of manifest.chunked.chunks) {
    const relativePath = getChunkRelativePath(chunkInfo)
    if (!relativePath) {
      onLog?.('error', '分块缺少文件路径,已跳过')
      continue
    }

    const chunkPath = resolveInsideBaseDir(baseDir, relativePath)
    if (!chunkPath) {
      onLog?.('error', `分块路径超出导出目录,已跳过: ${relativePath}`)
      continue
    }

    if (!fs.existsSync(chunkPath)) {
      onLog?.('error', `分块文件不存在: ${chunkPath}`)
      continue
    }

    const chunkMessageCount = getChunkMessageCount(chunkInfo)
    const chunkSize = fs.statSync(chunkPath).size
    let chunkMessagesRead = 0

    onLog?.('info', `正在解析分块: ${relativePath} (${chunkMessageCount} 条消息)`)

    const reportReadError = (error: Error): void => {
      onLog?.('error', `读取分块失败: ${relativePath}: ${error.message}`)
    }

    // Stream the JSONL file
    for await (const msg of readJsonlFile(chunkPath, reportReadError)) {
      chunkMessagesRead++

      // Resolve platformId; a missing sender counts as a skipped message
      const sender = msg.sender
      const platformId = sender?.uin || sender?.uid
      if (!sender || !platformId || platformId === '0' || platformId === '未知') {
        skippedMessages++
        continue
      }

      // Names:
      //   nickname  - QQ nickname (original nickname)
      //   groupCard - group nickname
      //   name      - display name (usually groupCard || nickname)
      const displayName = sender.nickname || sender.name
      const accountName = displayName || platformId
      const groupNickname = sender.groupCard || undefined

      // Update member info
      const knownMember = memberMap.get(platformId)
      if (!knownMember) {
        memberMap.set(platformId, {
          platformId,
          accountName,
          groupNickname,
          avatar: avatarsMap.get(platformId),
        })
      } else {
        // Only a real name may replace the stored one; the platformId
        // fallback must not overwrite a name learned from an earlier message.
        if (displayName) knownMember.accountName = displayName
        if (groupNickname) knownMember.groupNickname = groupNickname
        if (!knownMember.avatar) knownMember.avatar = avatarsMap.get(platformId)
      }

      // Timestamps in the chunked format are milliseconds
      const timestamp =
        typeof msg.timestamp === 'number' ? Math.floor(msg.timestamp / 1000) : parseTimestamp(msg.timestamp)

      if (timestamp === null || !isValidYear(timestamp)) {
        skippedMessages++
        continue
      }

      // Message type
      const type = msg.system ? MessageType.SYSTEM : convertMessageType(msg.type, msg.content, msg.recalled)

      // Text content
      let textContent = msg.content?.text || ''
      if (msg.recalled) textContent = '[已撤回] ' + textContent

      messageBatch.push({
        platformMessageId: msg.id,
        senderPlatformId: platformId,
        senderAccountName: accountName,
        senderGroupNickname: groupNickname,
        timestamp,
        type,
        content: textContent || null,
      })

      messagesProcessed++

      // Periodic progress
      if (messagesProcessed % batchSize === 0) {
        // Estimate bytes read from the share of declared messages seen so
        // far, clamped so an under-reported count cannot exceed the chunk size.
        const chunkProgress = chunkMessageCount > 0 ? Math.min(1, chunkMessagesRead / chunkMessageCount) : 0
        const chunkBytesRead = Math.floor(chunkProgress * chunkSize)
        const currentBytesRead = bytesRead + chunkBytesRead
        const progress = createProgress(
          'parsing',
          currentBytesRead,
          totalBytes,
          messagesProcessed,
          `已处理 ${messagesProcessed} 条消息...`
        )
        yield { type: 'progress', data: progress }
        onProgress?.(progress)
      }
    }

    // Account for the whole chunk once it is finished
    bytesRead += chunkSize
  }

  // Emit members (with avatars)
  const members: ParsedMember[] = Array.from(memberMap.values()).map((m) => ({
    platformId: m.platformId,
    accountName: m.accountName,
    groupNickname: m.groupNickname,
    avatar: m.avatar,
  }))
  yield { type: 'members', data: members }

  // Emit messages in batches
  for (let i = 0; i < messageBatch.length; i += batchSize) {
    const batch = messageBatch.slice(i, i + batchSize)
    yield { type: 'messages', data: batch }
  }

  // Done
  const doneProgress = createProgress('done', totalBytes, totalBytes, messagesProcessed, '')
  yield { type: 'progress', data: doneProgress }
  onProgress?.(doneProgress)

  onLog?.('info', `解析完成: ${messagesProcessed} 条消息, ${memberMap.size} 个成员`)
  if (skippedMessages > 0) {
    onLog?.('info', `跳过 ${skippedMessages} 条无效消息(缺少发送者ID或时间戳无效)`)
  }

  yield {
    type: 'done',
    data: { messageCount: messagesProcessed, memberCount: memberMap.size },
  }
}

// ==================== Exports ====================

/** Parser entry for the chunked-jsonl format. */
export const parser_: Parser = {
  feature,
  parse: parseChunkedJsonl,
}

/** Format module registered in the formats index. */
const module_: FormatModule = {
  feature,
  parser: parser_,
}

export default module_