From c3e28d39dda1ad233b791f4b7732d633b8f753ec Mon Sep 17 00:00:00 2001
From: digua <i@digua.me>
Date: Mon, 12 Jan 2026 01:01:35 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=85=BC=E5=AE=B9shuakami-jsonl?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 electron/main/parser/formats/index.ts         |   3 +
 .../formats/shuakami-qq-exporter-chunked.ts   | 494 ++++++++++++++++++
 2 files changed, 497 insertions(+)
 create mode 100644 electron/main/parser/formats/shuakami-qq-exporter-chunked.ts

diff --git a/electron/main/parser/formats/index.ts b/electron/main/parser/formats/index.ts
index 80af7f0..13a7058 100644
--- a/electron/main/parser/formats/index.ts
+++ b/electron/main/parser/formats/index.ts
@@ -9,6 +9,7 @@ import type { FormatModule } from '../types'
 import chatlab from './chatlab'
 import chatlabJsonl from './chatlab-jsonl'
 import shuakamiQqExporter from './shuakami-qq-exporter'
+import shuakamiQqExporterChunked from './shuakami-qq-exporter-chunked'
 import yccccccyEchotrace from './ycccccccy-echotrace'
 import tyrrrzDiscordExporter from './tyrrrz-discord-exporter'
 import whatsappNativeTxt from './whatsapp-native-txt'
@@ -20,6 +21,7 @@ import qqNativeTxt from './qq-native-txt'
 export const formats: FormatModule[] = [
   chatlab, // 优先级 1 - ChatLab JSON
   chatlabJsonl, // 优先级 2 - ChatLab JSONL（流式格式，支持超大文件）
+  shuakamiQqExporterChunked, // 优先级 5 - shuakami/qq-chat-exporter chunked-jsonl
   shuakamiQqExporter, // 优先级 10 - shuakami/qq-chat-exporter
   yccccccyEchotrace, // 优先级 15 - ycccccccy/echotrace
   tyrrrzDiscordExporter, // 优先级 20 - Tyrrrz/DiscordChatExporter
@@ -32,6 +34,7 @@ export {
   chatlab,
   chatlabJsonl,
   shuakamiQqExporter,
+  shuakamiQqExporterChunked,
   yccccccyEchotrace,
   tyrrrzDiscordExporter,
   qqNativeTxt,
diff --git a/electron/main/parser/formats/shuakami-qq-exporter-chunked.ts b/electron/main/parser/formats/shuakami-qq-exporter-chunked.ts
new file mode 100644
index 0000000..d34d1da
--- /dev/null
+++ b/electron/main/parser/formats/shuakami-qq-exporter-chunked.ts
@@ -0,0 +1,494 @@
+/**
+ * shuakami/qq-chat-exporter chunked-jsonl 格式解析器
+ * 适配项目: https://github.com/shuakami/qq-chat-exporter
+ * 版本: 5.x（chunked-jsonl 分块格式）
+ *
+ * 文件结构：
+ * - manifest.json: 元数据和分块信息
+ *   - metadata: 导出元数据
+ *   - chatInfo: 聊天信息
+ *   - chunked.chunks[]: 分块文件列表
+ *   - avatars?: 头像文件信息（V5.5+）
+ * - chunks/: 分块目录
+ *   - chunk_0001.jsonl: 每行一条消息
+ *   - chunk_0002.jsonl: ...
+ * - avatars.json: 头像数据（V5.5+，可选）
+ *
+ * 消息格式（sender 字段）：
+ * - uid: 用户 UID
+ * - uin: QQ 号
+ * - name: 展示名（优先群昵称，否则QQ昵称）
+ * - nickname: QQ 昵称
+ * - groupCard: 群昵称（群聊时存在）
+ * - remark: 好友备注
+ */
+
+import * as fs from 'fs'
+import * as path from 'path'
+import * as readline from 'readline'
+import { KNOWN_PLATFORMS, ChatType, MessageType } from '../../../../src/types/base'
+import type {
+  FormatFeature,
+  FormatModule,
+  Parser,
+  ParseOptions,
+  ParseEvent,
+  ParsedMeta,
+  ParsedMember,
+  ParsedMessage,
+} from '../types'
+import { createProgress, parseTimestamp, isValidYear } from '../utils'
+
+// ==================== 特征定义 ====================
+// Detection descriptor for the chunked-jsonl export; the parser treats the matched .json file as manifest.json.
+export const feature: FormatFeature = {
+  id: 'shuakami-qq-exporter-chunked',
+  name: 'shuakami/qq-chat-exporter (chunked)',
+  platform: KNOWN_PLATFORMS.QQ,
+  priority: 5, // higher precedence than the single-file variant
+  extensions: ['.json'],
+  signatures: {
+    head: [/"format"\s*:\s*"chunked-jsonl"/, /"chunked"\s*:/],
+    requiredFields: ['metadata', 'chatInfo', 'chunked'],
+  },
+}
+
+// ==================== 类型定义 ====================
+// One entry of manifest.chunked.chunks; field names differ between exporter versions.
+interface ChunkInfo {
+  // V5.0 layout
+  file?: string // chunk path relative to the manifest directory
+  messages?: number // declared message count
+  bytes?: number
+  // V5.5+ layout
+  index?: number
+  fileName?: string // bare file name; resolved as chunks/<fileName> when no path field is set
+  relativePath?: string // chunk path relative to the manifest directory (tried first)
+  count?: number // declared message count (tried before `messages`)
+  start?: string
+  end?: string
+}
+// Shape of manifest.json as this parser reads it; the parsed JSON is cast to it without runtime validation.
+interface Manifest {
+  metadata: {
+    name?: string
+    copyright?: string
+    exportTime: string
+    version: string // only used in the start-of-parse log line
+    format: string // must equal 'chunked-jsonl', otherwise parsing is rejected
+  }
+  chatInfo: {
+    name: string // chat title; a placeholder is used when empty
+    type: string // 'group' selects a group chat, anything else a private chat
+    selfUid?: string // owner id used when selfUin is absent
+    selfUin?: string // preferred owner id
+    selfName?: string
+  }
+  statistics: {
+    totalMessages: number // only used in the start-of-parse log line
+    chunkCount?: number
+    timeRange?: {
+      start: string
+      end: string
+      durationDays: number
+    }
+    messageTypes?: Record<string, number>
+    senders?: Array<{
+      uid: string
+      name: string
+      messageCount: number
+      percentage: number
+    }>
+  }
+  chunked: {
+    format: string
+    chunksDir: string
+    chunkFileExt: string
+    maxMessagesPerChunk: number
+    maxBytesPerChunk: number
+    chunks: ChunkInfo[] // chunk files, processed in array order
+  }
+  avatars?: {
+    file: string // avatars file path relative to the manifest directory
+    count: number
+  }
+}
+// One line of a chunk file as consumed by the parser (parsed JSON is cast to this shape, not validated).
+interface ChunkedMessage {
+  id?: string // forwarded as the platform message id
+  seq?: string
+  timestamp: number // milliseconds when numeric; the parser stores seconds
+  time?: string
+  sender: {
+    uid?: string // sender id used when uin is absent
+    uin?: string // preferred sender id
+    name: string // display name
+    nickname?: string // QQ nickname
+    groupCard?: string // group alias
+    remark?: string // friend remark
+  }
+  type: string // 'system' marks a system message
+  content: {
+    text: string
+    html?: string
+    elements?: Array<{ type: string; data?: Record<string, unknown> }>
+    resources?: Array<{ type: string }>
+    mentions?: Array<{ uid: string; name: string }>
+  }
+  recalled?: boolean // recalled messages are typed RECALL and get a text prefix
+  system?: boolean // forces the SYSTEM message type
+}
+// Per-sender record accumulated while messages are read.
+interface MemberInfo {
+  platformId: string // sender uin, or uid when uin is absent
+  accountName: string // nickname, else display name, else the platform id
+  groupNickname: string | undefined // group alias (groupCard) when present
+  avatar: string | undefined // entry from the avatars file for this id, if any
+}
+
+// ==================== 消息类型转换 ====================
+
+/**
+ * Map one exported message onto a ChatLab MessageType.
+ * Precedence: recalled flag, 'system' type, first resource, sticker elements,
+ * then keyword/placeholder matching on the text; anything else is TEXT.
+ * A missing `content` (malformed line) is tolerated and treated as empty.
+ */
+function convertMessageType(
+  msgType: string,
+  content: ChunkedMessage['content'] | undefined,
+  isRecalled?: boolean
+): MessageType {
+  if (isRecalled) return MessageType.RECALL
+  if (msgType === 'system') return MessageType.SYSTEM
+
+  // Only the first resource decides; unknown resource kinds fall through.
+  switch (content?.resources?.[0]?.type) {
+    case 'image':
+      return MessageType.IMAGE
+    case 'video':
+      return MessageType.VIDEO
+    case 'voice':
+    case 'audio':
+      return MessageType.VOICE
+    case 'file':
+      return MessageType.FILE
+    case 'location':
+      return MessageType.LOCATION
+  }
+
+  // Built-in faces and market stickers count as emoji.
+  if (content?.elements?.some((e) => e.type === 'face' || e.type === 'market_face' || e.type === 'marketFace')) {
+    return MessageType.EMOJI
+  }
+
+  // Otherwise classify by keywords / placeholders in the plain text.
+  const text = content?.text?.trim() || ''
+  if (text.includes('QQ红包') || text.includes('发出了红包') || text === '[红包]') return MessageType.RED_PACKET
+  if (text.includes('转账') || text === '[转账]') return MessageType.TRANSFER
+  if (text.includes('拍了拍') || text.includes('戳了戳') || text === '[拍一拍]') return MessageType.POKE
+  if (text.includes('语音通话') || text.includes('视频通话') || text.includes('通话时长')) return MessageType.CALL
+  if (text === '[分享]' || text === '[音乐]' || text === '[小程序]') return MessageType.SHARE
+  if (text === '[链接]' || text === '[卡片消息]') return MessageType.LINK
+  if (text === '[位置]' || text === '[地理位置]') return MessageType.LOCATION
+  if (text === '[转发]' || text === '[聊天记录]') return MessageType.FORWARD
+  return MessageType.TEXT
+}
+
+// ==================== 辅助函数 ====================
+
+/**
+ * Read and parse manifest.json. A leading UTF-8 BOM is stripped first because
+ * JSON.parse rejects it. Throws on I/O errors and on invalid JSON.
+ */
+function readManifest(manifestPath: string): Manifest {
+  return JSON.parse(fs.readFileSync(manifestPath, 'utf-8').replace(/^\uFEFF/, '')) as Manifest
+}
+
+/**
+ * Resolve a chunk's path relative to the manifest directory.
+ * Tries `relativePath` (V5.5+), then `file` (V5.0), then `chunks/<fileName>`;
+ * an empty string counts as absent.
+ * @throws Error when none of the three fields yields a path
+ */
+function getChunkRelativePath(chunk: ChunkInfo): string {
+  const fromFileName = chunk.fileName ? `chunks/${chunk.fileName}` : ''
+  const resolved = chunk.relativePath || chunk.file || fromFileName
+  if (!resolved) throw new Error('无法获取 chunk 文件路径')
+  return resolved
+}
+
+/**
+ * Number of messages the manifest declares for a chunk.
+ * Reads `count` (V5.5+) first, then `messages` (V5.0); 0 when neither is set.
+ * Only `undefined` counts as "not set".
+ */
+function getChunkMessageCount(chunk: ChunkInfo): number {
+  const { count, messages } = chunk
+  if (count !== undefined) return count
+  return messages !== undefined ? messages : 0
+}
+
+/**
+ * Sum the on-disk size of every chunk file listed in the manifest.
+ * Entries whose file is missing contribute nothing.
+ *
+ * @param manifest - parsed manifest; `chunked.chunks` is walked in order
+ * @param baseDir - directory the chunk paths are resolved against
+ * @returns total size in bytes
+ */
+function calculateTotalBytes(manifest: Manifest, baseDir: string): number {
+  return manifest.chunked.chunks.reduce((sum, entry) => {
+    const target = path.join(baseDir, getChunkRelativePath(entry))
+    return fs.existsSync(target) ? sum + fs.statSync(target).size : sum
+  }, 0)
+}
+
+/**
+ * Load the optional avatars file referenced by the manifest.
+ * Best-effort: a missing descriptor, a missing file, or a read/parse failure
+ * yields an empty map. Values that are not strings starting with
+ * `data:image/` are ignored.
+ *
+ * @param baseDir - directory the avatars path is resolved against
+ * @param avatarsInfo - the manifest's `avatars` entry, if any
+ * @returns map keyed by the file's own keys
+ */
+function readAvatars(baseDir: string, avatarsInfo?: { file: string; count: number }): Map<string, string> {
+  const result = new Map<string, string>()
+  const avatarsPath = avatarsInfo?.file ? path.join(baseDir, avatarsInfo.file) : ''
+  if (!avatarsPath || !fs.existsSync(avatarsPath)) return result
+  try {
+    const parsed = JSON.parse(fs.readFileSync(avatarsPath, 'utf-8')) as Record<string, string>
+    Object.entries(parsed)
+      .filter(([, value]) => value && typeof value === 'string' && value.startsWith('data:image/'))
+      .forEach(([key, value]) => result.set(key, value))
+  } catch {
+    // Avatars are optional; carry on without them.
+  }
+  return result
+}
+
+/**
+ * Stream a JSONL file, yielding every non-blank line that parses as JSON (other lines are skipped).
+ */
+async function* readJsonlFile(filePath: string): AsyncGenerator<ChunkedMessage> {
+  const fileStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
+  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity })
+  try {
+    for await (const line of rl) {
+      const trimmed = line.trim()
+      if (!trimmed) continue
+      try {
+        yield JSON.parse(trimmed) as ChunkedMessage
+      } catch {
+        // Not valid JSON: skip this line.
+      }
+    }
+  } finally {
+    fileStream.destroy() // closing readline does not close its input; release the descriptor on early exit too
+  }
+}
+
+// ==================== 解析器实现 ====================
+
+/**
+ * Parse a chunked-jsonl export, using its manifest.json as the entry point.
+ *
+ * Events are emitted in this order: initial progress, meta, periodic progress
+ * while chunk files are streamed, members, messages (in batches of `batchSize`),
+ * final progress, done. A manifest that cannot be read or used is reported as a
+ * single `error` event; chunk files missing on disk are logged and skipped.
+ * Messages without a usable sender id or valid timestamp are counted and dropped.
+ */
+async function* parseChunkedJsonl(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
+  const { filePath, batchSize = 5000, onProgress, onLog } = options
+
+  // The file handed to the parser is manifest.json; chunk paths resolve against its directory.
+  const manifestPath = filePath
+  const baseDir = path.dirname(manifestPath)
+
+  let manifest: Manifest
+  try {
+    manifest = readManifest(manifestPath)
+  } catch (error) {
+    yield { type: 'error', data: new Error(`无法读取 manifest.json: ${error}`) }
+    return
+  }
+  if (manifest?.metadata?.format !== 'chunked-jsonl') {
+    yield { type: 'error', data: new Error(`不支持的格式: ${manifest?.metadata?.format}`) }
+    return
+  }
+
+  // Sizing walks every chunk entry, so a missing chunk list or an entry without a
+  // path surfaces here as an error event rather than as an uncaught exception.
+  let totalBytes: number
+  try {
+    totalBytes = calculateTotalBytes(manifest, baseDir)
+  } catch (error) {
+    yield { type: 'error', data: new Error(`manifest.json 分块信息无效: ${error}`) }
+    return
+  }
+  let bytesRead = 0
+  let messagesProcessed = 0
+  let skippedMessages = 0
+
+  const initialProgress = createProgress('parsing', 0, totalBytes, 0, '')
+  yield { type: 'progress', data: initialProgress }
+  onProgress?.(initialProgress)
+
+  onLog?.(
+    'info',
+    `开始解析 chunked-jsonl 格式 (V${manifest.metadata.version})，共 ${manifest.chunked.chunks.length} 个分块，预计 ${manifest.statistics?.totalMessages ?? '?'} 条消息`
+  )
+
+  // Avatars are optional; an empty map simply leaves members without one.
+  const avatarsMap = readAvatars(baseDir, manifest.avatars)
+  if (avatarsMap.size > 0) {
+    onLog?.('info', `已加载 ${avatarsMap.size} 个用户头像`)
+  }
+
+  const chatType = manifest.chatInfo?.type === 'group' ? ChatType.GROUP : ChatType.PRIVATE
+  const meta: ParsedMeta = {
+    name: manifest.chatInfo?.name || '未知群聊',
+    platform: KNOWN_PLATFORMS.QQ,
+    type: chatType,
+    ownerId: manifest.chatInfo?.selfUin || manifest.chatInfo?.selfUid,
+  }
+  yield { type: 'meta', data: meta }
+
+  // Members and messages are accumulated over all chunks and emitted afterwards.
+  const memberMap = new Map<string, MemberInfo>()
+  const messageBatch: ParsedMessage[] = []
+
+  for (const chunkInfo of manifest.chunked.chunks) {
+    const relativePath = getChunkRelativePath(chunkInfo)
+    const chunkPath = path.join(baseDir, relativePath)
+    const chunkMessageCount = getChunkMessageCount(chunkInfo)
+
+    if (!fs.existsSync(chunkPath)) {
+      onLog?.('error', `分块文件不存在: ${chunkPath}`)
+      continue
+    }
+
+    const chunkSize = fs.statSync(chunkPath).size
+    let chunkMessagesRead = 0
+    onLog?.('info', `正在解析分块: ${relativePath} (${chunkMessageCount} 条消息)`)
+
+    for await (const msg of readJsonlFile(chunkPath)) {
+      chunkMessagesRead++
+
+      // A line can be valid JSON without being a message object, so probe `sender` defensively.
+      const platformId = msg?.sender?.uin || msg?.sender?.uid
+      if (!platformId || platformId === '0' || platformId === '未知') {
+        skippedMessages++
+        continue
+      }
+
+      // nickname is the QQ nickname, groupCard the group alias, name the display name.
+      const accountName = msg.sender.nickname || msg.sender.name || platformId
+      const groupNickname = msg.sender.groupCard || undefined
+
+      // Later messages refresh the account name; alias and avatar are only filled in, never cleared.
+      const existingMember = memberMap.get(platformId)
+      if (!existingMember) {
+        memberMap.set(platformId, {
+          platformId,
+          accountName,
+          groupNickname,
+          avatar: avatarsMap.get(platformId),
+        })
+      } else {
+        existingMember.accountName = accountName
+        if (groupNickname) existingMember.groupNickname = groupNickname
+        if (!existingMember.avatar) existingMember.avatar = avatarsMap.get(platformId)
+      }
+
+      // Numeric timestamps are milliseconds in this format; store seconds.
+      const timestamp =
+        typeof msg.timestamp === 'number' ? Math.floor(msg.timestamp / 1000) : parseTimestamp(msg.timestamp)
+      if (timestamp === null || !isValidYear(timestamp)) {
+        skippedMessages++
+        continue
+      }
+
+      // Content may be absent on malformed lines; classify against an empty one instead of crashing.
+      const content = msg.content ?? { text: '' }
+      const type = msg.system ? MessageType.SYSTEM : convertMessageType(msg.type, content, msg.recalled)
+      let textContent = content.text || ''
+      if (msg.recalled) textContent = '[已撤回] ' + textContent
+
+      messageBatch.push({
+        platformMessageId: msg.id,
+        senderPlatformId: platformId,
+        senderAccountName: accountName,
+        senderGroupNickname: groupNickname,
+        timestamp,
+        type,
+        content: textContent || null,
+      })
+      messagesProcessed++
+
+      // Report progress every `batchSize` accepted messages.
+      if (messagesProcessed % batchSize === 0) {
+        // Estimate bytes consumed from the share of declared messages read; clamp in case the count is low.
+        const chunkProgress = chunkMessageCount > 0 ? Math.min(1, chunkMessagesRead / chunkMessageCount) : 0
+        const chunkBytesRead = Math.floor(chunkProgress * chunkSize)
+        const currentBytesRead = bytesRead + chunkBytesRead
+        const progress = createProgress(
+          'parsing',
+          currentBytesRead,
+          totalBytes,
+          messagesProcessed,
+          `已处理 ${messagesProcessed} 条消息...`
+        )
+        yield { type: 'progress', data: progress }
+        onProgress?.(progress)
+      }
+    }
+
+    bytesRead += chunkSize
+  }
+
+  // Members (with avatars) go out first, then the messages in slices of `batchSize`.
+  const members: ParsedMember[] = Array.from(memberMap.values()).map((m) => ({
+    platformId: m.platformId,
+    accountName: m.accountName,
+    groupNickname: m.groupNickname,
+    avatar: m.avatar,
+  }))
+  yield { type: 'members', data: members }
+
+  for (let i = 0; i < messageBatch.length; i += batchSize) {
+    const batch = messageBatch.slice(i, i + batchSize)
+    yield { type: 'messages', data: batch }
+  }
+
+  const doneProgress = createProgress('done', totalBytes, totalBytes, messagesProcessed, '')
+  yield { type: 'progress', data: doneProgress }
+  onProgress?.(doneProgress)
+
+  onLog?.('info', `解析完成: ${messagesProcessed} 条消息, ${memberMap.size} 个成员`)
+  if (skippedMessages > 0) {
+    onLog?.('info', `跳过 ${skippedMessages} 条无效消息（缺少发送者ID或时间戳无效）`)
+  }
+
+  yield {
+    type: 'done',
+    data: { messageCount: messagesProcessed, memberCount: memberMap.size },
+  }
+}
+
+// ==================== 导出 ====================
+// Parser object: the format descriptor paired with the async-generator parse function.
+export const parser_: Parser = {
+  feature,
+  parse: parseChunkedJsonl,
+}
+// Format module (this file's default export), listed in the `formats` array of formats/index.ts.
+const module_: FormatModule = {
+  feature,
+  parser: parser_,
+}
+
+export default module_