feat: 兼容shuakami-jsonl

This commit is contained in:
digua
2026-01-12 01:01:35 +08:00
committed by digua
parent 5372c05c37
commit c3e28d39dd
2 changed files with 497 additions and 0 deletions
+3
View File
@@ -9,6 +9,7 @@ import type { FormatModule } from '../types'
import chatlab from './chatlab'
import chatlabJsonl from './chatlab-jsonl'
import shuakamiQqExporter from './shuakami-qq-exporter'
import shuakamiQqExporterChunked from './shuakami-qq-exporter-chunked'
import yccccccyEchotrace from './ycccccccy-echotrace'
import tyrrrzDiscordExporter from './tyrrrz-discord-exporter'
import whatsappNativeTxt from './whatsapp-native-txt'
@@ -20,6 +21,7 @@ import qqNativeTxt from './qq-native-txt'
export const formats: FormatModule[] = [
chatlab, // 优先级 1 - ChatLab JSON
chatlabJsonl, // 优先级 2 - ChatLab JSONL(流式格式,支持超大文件)
shuakamiQqExporterChunked, // 优先级 5 - shuakami/qq-chat-exporter chunked-jsonl
shuakamiQqExporter, // 优先级 10 - shuakami/qq-chat-exporter
yccccccyEchotrace, // 优先级 15 - ycccccccy/echotrace
tyrrrzDiscordExporter, // 优先级 20 - Tyrrrz/DiscordChatExporter
@@ -32,6 +34,7 @@ export {
chatlab,
chatlabJsonl,
shuakamiQqExporter,
shuakamiQqExporterChunked,
yccccccyEchotrace,
tyrrrzDiscordExporter,
qqNativeTxt,
@@ -0,0 +1,494 @@
/**
* shuakami/qq-chat-exporter chunked-jsonl 格式解析器
* 适配项目: https://github.com/shuakami/qq-chat-exporter
* 版本: 5.x(chunked-jsonl 分块格式)
*
* 文件结构:
* - manifest.json: 元数据和分块信息
* - metadata: 导出元数据
* - chatInfo: 聊天信息
* - chunked.chunks[]: 分块文件列表
* - avatars?: 头像文件信息(V5.5+)
* - chunks/: 分块目录
* - chunk_0001.jsonl: 每行一条消息
* - chunk_0002.jsonl: ...
* - avatars.json: 头像数据(V5.5+,可选)
*
* 消息格式(sender 字段):
* - uid: 用户 UID
* - uin: QQ 号
* - name: 展示名(优先群昵称,否则QQ昵称)
* - nickname: QQ 昵称
* - groupCard: 群昵称(群聊时存在)
* - remark: 好友备注
*/
import * as fs from 'fs'
import * as path from 'path'
import * as readline from 'readline'
import { KNOWN_PLATFORMS, ChatType, MessageType } from '../../../../src/types/base'
import type {
FormatFeature,
FormatModule,
Parser,
ParseOptions,
ParseEvent,
ParsedMeta,
ParsedMember,
ParsedMessage,
} from '../types'
import { createProgress, parseTimestamp, isValidYear } from '../utils'
// ==================== Feature definition ====================
/**
 * Detection descriptor for the chunked-jsonl export of shuakami/qq-chat-exporter.
 * The entry file handed to the parser is the manifest (a `.json` file);
 * `signatures` lists the text patterns and top-level fields associated with it.
 */
export const feature: FormatFeature = {
id: 'shuakami-qq-exporter-chunked',
name: 'shuakami/qq-chat-exporter (chunked)',
platform: KNOWN_PLATFORMS.QQ,
priority: 5, // takes precedence over the single-file variant of this exporter
extensions: ['.json'],
signatures: {
// NOTE(review): presumably matched against the beginning of the file — confirm against the detector
head: [/"format"\s*:\s*"chunked-jsonl"/, /"chunked"\s*:/],
requiredFields: ['metadata', 'chatInfo', 'chunked'],
},
}
// ==================== Type definitions ====================
/**
 * One entry of `chunked.chunks[]` in the manifest.
 * Two field layouts are accepted, so every field is optional; the helper
 * functions in this file pick whichever field is present.
 */
interface ChunkInfo {
// V5.0 layout
// Chunk path, joined onto the manifest directory when read
file?: string
// Number of messages in the chunk
messages?: number
// Not read by this parser
bytes?: number
// V5.5+ layout
// Not read by this parser
index?: number
// Bare file name; used as `chunks/<fileName>` when no path field is present
fileName?: string
// Chunk path, joined onto the manifest directory when read (preferred over `file`)
relativePath?: string
// Number of messages in the chunk (preferred over `messages`)
count?: number
// Not read by this parser
start?: string
// Not read by this parser
end?: string
}
/**
 * Shape of manifest.json as consumed by this parser.
 * The value is produced by `JSON.parse` and cast, so nothing here is
 * validated at runtime.
 */
interface Manifest {
// Export metadata
metadata: {
name?: string
copyright?: string
exportTime: string
// Exporter version; only used in the start-of-parse log line
version: string
// Must equal 'chunked-jsonl' for the parser to proceed
format: string
}
// Information about the exported conversation
chatInfo: {
// Conversation name; becomes the parsed meta name
name: string
// 'group' selects a group chat; any other value is treated as a private chat
type: string
// Exporting user's identifiers; selfUin is preferred over selfUid as the owner id
selfUid?: string
selfUin?: string
selfName?: string
}
// Export statistics; only totalMessages is read (for logging)
statistics: {
totalMessages: number
chunkCount?: number
timeRange?: {
start: string
end: string
durationDays: number
}
messageTypes?: Record<string, number>
senders?: Array<{
uid: string
name: string
messageCount: number
percentage: number
}>
}
// Chunking information; only `chunks` is read by this parser
chunked: {
format: string
chunksDir: string
chunkFileExt: string
maxMessagesPerChunk: number
maxBytesPerChunk: number
chunks: ChunkInfo[]
}
// Avatar file information (V5.5+ per the file header); `file` is joined onto the manifest directory
avatars?: {
file: string
count: number
}
}
/**
 * One message row of a chunk file (one JSON object per line).
 * Rows are cast from `JSON.parse` output without validation.
 */
interface ChunkedMessage {
// Becomes the parsed message's platformMessageId
id?: string
seq?: string
// Numeric values are treated as milliseconds and converted to seconds by the parser
timestamp: number
time?: string
sender: {
// User UID; used as the sender id when uin is absent
uid?: string
// QQ number; preferred sender id
uin?: string
// Display name
name: string
nickname?: string // QQ nickname
groupCard?: string // group nickname
remark?: string // friend remark
}
// 'system' marks a system message
type: string
content: {
text: string
html?: string
// Elements of type face / market_face / marketFace classify the message as an emoji
elements?: Array<{ type: string; data?: Record<string, unknown> }>
// Only the first resource's type is used for classification
resources?: Array<{ type: string }>
mentions?: Array<{ uid: string; name: string }>
}
// Recalled message flag; such messages get a '[已撤回] ' text prefix
recalled?: boolean
// System message flag; takes precedence over type-based classification
system?: boolean
}
/**
 * Per-sender record accumulated while scanning messages, keyed by platformId
 * in the parser's member map and later emitted in the members event.
 */
interface MemberInfo {
// Sender id (uin, or uid when uin is absent)
platformId: string
// Taken from nickname, then name, then platformId; overwritten by each later message
accountName: string
// Group nickname (groupCard); only overwritten by a non-empty value
groupNickname: string | undefined
// Avatar looked up by platformId in the avatars file, if any
avatar: string | undefined
}
// ==================== Message type conversion ====================
/**
 * Maps an exported message onto the application's MessageType.
 *
 * Precedence, first match wins:
 *   1. recalled flag            -> RECALL
 *   2. `msgType === 'system'`   -> SYSTEM
 *   3. type of the first resource (image / video / voice|audio / file / location)
 *   4. any face / market_face / marketFace element -> EMOJI
 *   5. text heuristics (substring or exact placeholder matches)
 *   6. otherwise                -> TEXT
 *
 * Rows are cast from unvalidated JSON, so `content` and its fields may be
 * missing or malformed at runtime despite the declared type; such rows are
 * classified from whatever is available instead of throwing.
 *
 * @param msgType    The row's `type` field.
 * @param content    The row's `content` object (tolerated when absent).
 * @param isRecalled Whether the row is flagged as recalled.
 * @returns The resolved message type.
 */
function convertMessageType(
  msgType: string,
  content: ChunkedMessage['content'],
  isRecalled?: boolean
): MessageType {
  if (isRecalled) return MessageType.RECALL
  // System messages
  if (msgType === 'system') return MessageType.SYSTEM

  // Resource type: only the first resource decides
  const resourceType = content?.resources?.[0]?.type
  switch (resourceType) {
    case 'image':
      return MessageType.IMAGE
    case 'video':
      return MessageType.VIDEO
    case 'voice':
    case 'audio':
      return MessageType.VOICE
    case 'file':
      return MessageType.FILE
    case 'location':
      return MessageType.LOCATION
  }

  // Emoji / sticker elements
  const elements = content?.elements
  if (
    Array.isArray(elements) &&
    elements.some((e) => e?.type === 'face' || e?.type === 'market_face' || e?.type === 'marketFace')
  ) {
    return MessageType.EMOJI
  }

  // Text-based heuristics
  const rawText = content?.text
  const text = typeof rawText === 'string' ? rawText.trim() : ''
  if (text.includes('QQ红包') || text.includes('发出了红包') || text === '[红包]') return MessageType.RED_PACKET
  if (text.includes('转账') || text === '[转账]') return MessageType.TRANSFER
  if (text.includes('拍了拍') || text.includes('戳了戳') || text === '[拍一拍]') return MessageType.POKE
  if (text.includes('语音通话') || text.includes('视频通话') || text.includes('通话时长')) return MessageType.CALL
  if (text === '[分享]' || text === '[音乐]' || text === '[小程序]') return MessageType.SHARE
  if (text === '[链接]' || text === '[卡片消息]') return MessageType.LINK
  if (text === '[位置]' || text === '[地理位置]') return MessageType.LOCATION
  if (text === '[转发]' || text === '[聊天记录]') return MessageType.FORWARD
  return MessageType.TEXT
}
// ==================== Helpers ====================
/**
 * Reads and parses manifest.json.
 *
 * @param manifestPath Path of the manifest file.
 * @returns The parsed JSON, cast to Manifest without runtime validation.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function readManifest(manifestPath: string): Manifest {
  const raw = fs.readFileSync(manifestPath, 'utf-8')
  // readFileSync keeps a leading UTF-8 byte-order mark and JSON.parse rejects
  // it, so strip it before parsing.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw
  return JSON.parse(text) as Manifest
}
/**
 * Resolves the path of a chunk file relative to the manifest directory,
 * accepting both manifest layouts.
 *
 * Lookup order: `relativePath` (V5.5+), then `file` (V5.0), then
 * `chunks/<fileName>` as a last resort. Empty strings count as absent.
 *
 * @param chunk Chunk entry from the manifest.
 * @returns The relative path of the chunk file.
 * @throws Error when none of the three fields yields a path.
 */
function getChunkRelativePath(chunk: ChunkInfo): string {
  const { relativePath, file, fileName } = chunk
  const fallback = fileName ? `chunks/${fileName}` : ''
  const resolved = relativePath || file || fallback
  if (!resolved) {
    throw new Error('无法获取 chunk 文件路径')
  }
  return resolved
}
/**
 * Returns the number of messages a chunk declares, accepting both manifest
 * layouts: `count` (V5.5+) wins over `messages` (V5.0); 0 when neither is set.
 *
 * @param chunk Chunk entry from the manifest.
 * @returns The declared message count, or 0 if the entry carries none.
 */
function getChunkMessageCount(chunk: ChunkInfo): number {
  const { count, messages } = chunk
  if (count !== undefined) {
    return count
  }
  return messages !== undefined ? messages : 0
}
/**
 * Sums the on-disk size of every chunk file listed in the manifest.
 * Chunks whose file does not exist contribute nothing.
 *
 * @param manifest Parsed manifest whose `chunked.chunks` are inspected.
 * @param baseDir  Directory the chunk paths are relative to.
 * @returns Total size in bytes of the chunk files that exist.
 */
function calculateTotalBytes(manifest: Manifest, baseDir: string): number {
  return manifest.chunked.chunks.reduce((sum, entry) => {
    const absolutePath = path.join(baseDir, getChunkRelativePath(entry))
    return fs.existsSync(absolutePath) ? sum + fs.statSync(absolutePath).size : sum
  }, 0)
}
/**
 * Loads the optional avatars file into a map from user key to data URL.
 *
 * The load is best effort: a missing descriptor, a missing file, unreadable
 * content or invalid JSON all produce an empty map. Only string values that
 * start with `data:image/` are kept.
 *
 * @param baseDir     Directory the avatars file path is relative to.
 * @param avatarsInfo The manifest's `avatars` descriptor, if any.
 * @returns Map of the avatars file's keys to their image data URLs.
 */
function readAvatars(baseDir: string, avatarsInfo?: { file: string; count: number }): Map<string, string> {
  const result = new Map<string, string>()

  const fileName = avatarsInfo?.file
  if (!fileName) {
    return result
  }

  const fullPath = path.join(baseDir, fileName)
  if (!fs.existsSync(fullPath)) {
    return result
  }

  try {
    const parsed = JSON.parse(fs.readFileSync(fullPath, 'utf-8')) as Record<string, string>
    Object.entries(parsed).forEach(([key, value]) => {
      const isImageDataUrl = typeof value === 'string' && value.startsWith('data:image/')
      if (isImageDataUrl) {
        result.set(key, value)
      }
    })
  } catch {
    // Avatar loading failed; carry on without avatars
  }

  return result
}
/**
 * Streams a JSONL file, yielding one message object per non-empty line.
 *
 * Lines that are blank, are not valid JSON, or parse to something other than
 * a plain object (null, numbers, strings, arrays) are skipped, so consumers
 * can safely read properties off every yielded value.
 *
 * The underlying stream and readline interface are released when iteration
 * finishes, when the consumer stops early, or when an error propagates.
 *
 * @param filePath Path of the `.jsonl` chunk file.
 * @returns Async generator of the rows, cast to ChunkedMessage without
 *          field-level validation.
 */
async function* readJsonlFile(filePath: string): AsyncGenerator<ChunkedMessage> {
  const fileStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity,
  })
  try {
    for await (const line of rl) {
      const trimmed = line.trim()
      if (!trimmed) continue

      let parsed: unknown
      try {
        parsed = JSON.parse(trimmed)
      } catch {
        // Skip lines that are not valid JSON
        continue
      }
      // `null`, primitives and arrays are valid JSON but not message rows
      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) continue

      // Yield outside the parse try/catch so errors thrown into the generator
      // by the consumer are not mistaken for bad lines and swallowed.
      yield parsed as ChunkedMessage
    }
  } finally {
    // Closing the readline interface does not close its input stream
    rl.close()
    fileStream.destroy()
  }
}
// ==================== Parser implementation ====================
/**
 * Parses a chunked-jsonl export, given the path of its manifest.json.
 *
 * Events are yielded in this order:
 *   1. an initial `progress` event,
 *   2. `meta` (conversation name, platform, chat type, owner id),
 *   3. periodic `progress` events while chunk files are scanned,
 *   4. `members` (every sender seen, with avatar when available),
 *   5. `messages` in batches of `batchSize`,
 *   6. a final `progress` event and `done`.
 * An unreadable or structurally invalid manifest yields a single `error`
 * event and ends the generator.
 *
 * All messages are buffered in memory until every chunk has been read,
 * because the `members` event is emitted before the first `messages` event.
 *
 * Rows with no usable sender id (missing, '0' or '未知') or with an invalid
 * timestamp are skipped and counted; chunk files missing on disk are logged
 * and skipped.
 *
 * @param options Parse options; `filePath` is the manifest path, `batchSize`
 *                defaults to 5000 (also used when a non-positive or
 *                non-integer value is supplied).
 */
async function* parseChunkedJsonl(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
  const { filePath, batchSize = 5000, onProgress, onLog } = options
  // A zero, negative or non-integer batch size would make the final emit loop
  // spin forever or drop messages, so fall back to the default.
  const effectiveBatchSize = Number.isInteger(batchSize) && batchSize > 0 ? batchSize : 5000

  // The input file is the manifest; chunk paths are relative to its directory
  const manifestPath = filePath
  const baseDir = path.dirname(manifestPath)

  // Read the manifest
  let manifest: Manifest
  try {
    manifest = readManifest(manifestPath)
  } catch (error) {
    yield { type: 'error', data: new Error(`无法读取 manifest.json: ${error}`) }
    return
  }

  // Validate the format (the manifest is unvalidated JSON, so guard every level)
  if (manifest?.metadata?.format !== 'chunked-jsonl') {
    yield { type: 'error', data: new Error(`不支持的格式: ${manifest?.metadata?.format}`) }
    return
  }
  if (!manifest.chatInfo || !Array.isArray(manifest.chunked?.chunks)) {
    yield { type: 'error', data: new Error('manifest.json 缺少 chatInfo 或 chunked.chunks') }
    return
  }

  // Resolving chunk paths throws for an entry without any path field; report
  // that as an error event instead of letting it escape the generator.
  let totalBytes: number
  try {
    totalBytes = calculateTotalBytes(manifest, baseDir)
  } catch (error) {
    yield { type: 'error', data: new Error(`无法解析分块信息: ${error}`) }
    return
  }

  let bytesRead = 0
  let messagesProcessed = 0
  let skippedMessages = 0

  // Initial progress
  const initialProgress = createProgress('parsing', 0, totalBytes, 0, '')
  yield { type: 'progress', data: initialProgress }
  onProgress?.(initialProgress)
  onLog?.(
    'info',
    `开始解析 chunked-jsonl 格式 (V${manifest.metadata.version}),共 ${manifest.chunked.chunks.length} 个分块,预计 ${manifest.statistics?.totalMessages ?? '未知'} 条消息`
  )

  // Load the avatars file, if there is one
  const avatarsMap = readAvatars(baseDir, manifest.avatars)
  if (avatarsMap.size > 0) {
    onLog?.('info', `已加载 ${avatarsMap.size} 个用户头像`)
  }

  // Emit meta
  const chatType = manifest.chatInfo.type === 'group' ? ChatType.GROUP : ChatType.PRIVATE
  const meta: ParsedMeta = {
    name: manifest.chatInfo.name || '未知群聊',
    platform: KNOWN_PLATFORMS.QQ,
    type: chatType,
    ownerId: manifest.chatInfo.selfUin || manifest.chatInfo.selfUid,
  }
  yield { type: 'meta', data: meta }

  // Collected members and messages
  const memberMap = new Map<string, MemberInfo>()
  const messageBatch: ParsedMessage[] = []

  // Walk every chunk file in manifest order
  for (const chunkInfo of manifest.chunked.chunks) {
    const relativePath = getChunkRelativePath(chunkInfo)
    const chunkPath = path.join(baseDir, relativePath)
    const chunkMessageCount = getChunkMessageCount(chunkInfo)
    if (!fs.existsSync(chunkPath)) {
      onLog?.('error', `分块文件不存在: ${chunkPath}`)
      continue
    }
    const chunkSize = fs.statSync(chunkPath).size
    let chunkMessagesRead = 0
    onLog?.('info', `正在解析分块: ${relativePath} (${chunkMessageCount} 条消息)`)

    // Stream the JSONL file row by row
    for await (const msg of readJsonlFile(chunkPath)) {
      chunkMessagesRead++

      // Sender id: QQ number first, UID otherwise. Rows are unvalidated, so a
      // row may carry no sender at all; it is skipped like any other row
      // without a usable id.
      const sender = msg?.sender
      const platformId = sender?.uin || sender?.uid
      if (!sender || !platformId || platformId === '0' || platformId === '未知') {
        skippedMessages++
        continue
      }

      // Names:
      //   nickname  - QQ nickname (the account's own name)
      //   groupCard - group nickname
      //   name      - display name (usually groupCard || nickname)
      const accountName = sender.nickname || sender.name || platformId
      const groupNickname = sender.groupCard || undefined

      // Create or refresh the member record; later rows overwrite the account
      // name, and the group nickname only when the row carries one.
      const existingMember = memberMap.get(platformId)
      if (!existingMember) {
        memberMap.set(platformId, {
          platformId,
          accountName,
          groupNickname,
          avatar: avatarsMap.get(platformId),
        })
      } else {
        existingMember.accountName = accountName
        if (groupNickname) existingMember.groupNickname = groupNickname
      }

      // Timestamps in this format are milliseconds; convert to seconds
      const timestamp =
        typeof msg.timestamp === 'number' ? Math.floor(msg.timestamp / 1000) : parseTimestamp(msg.timestamp)
      if (timestamp === null || !isValidYear(timestamp)) {
        skippedMessages++
        continue
      }

      // Message type; substitute an empty content object for rows that lack one
      const content: ChunkedMessage['content'] = msg.content ?? { text: '' }
      const type = msg.system ? MessageType.SYSTEM : convertMessageType(msg.type, content, msg.recalled)

      // Text content
      let textContent = content.text || ''
      if (msg.recalled) textContent = '[已撤回] ' + textContent

      messageBatch.push({
        platformMessageId: msg.id,
        senderPlatformId: platformId,
        senderAccountName: accountName,
        senderGroupNickname: groupNickname,
        timestamp,
        type,
        content: textContent || null,
      })
      messagesProcessed++

      // Periodic progress
      if (messagesProcessed % effectiveBatchSize === 0) {
        // Estimate bytes consumed from the fraction of the chunk's declared rows read so far
        const chunkProgress = chunkMessageCount > 0 ? chunkMessagesRead / chunkMessageCount : 0
        const chunkBytesRead = Math.floor(chunkProgress * chunkSize)
        const currentBytesRead = bytesRead + chunkBytesRead
        const progress = createProgress(
          'parsing',
          currentBytesRead,
          totalBytes,
          messagesProcessed,
          `已处理 ${messagesProcessed} 条消息...`
        )
        yield { type: 'progress', data: progress }
        onProgress?.(progress)
      }
    }

    // The whole chunk has been consumed
    bytesRead += chunkSize
  }

  // Emit members (with avatars)
  const members: ParsedMember[] = Array.from(memberMap.values()).map((m) => ({
    platformId: m.platformId,
    accountName: m.accountName,
    groupNickname: m.groupNickname,
    avatar: m.avatar,
  }))
  yield { type: 'members', data: members }

  // Emit messages in batches
  for (let i = 0; i < messageBatch.length; i += effectiveBatchSize) {
    const batch = messageBatch.slice(i, i + effectiveBatchSize)
    yield { type: 'messages', data: batch }
  }

  // Done
  const doneProgress = createProgress('done', totalBytes, totalBytes, messagesProcessed, '')
  yield { type: 'progress', data: doneProgress }
  onProgress?.(doneProgress)
  onLog?.('info', `解析完成: ${messagesProcessed} 条消息, ${memberMap.size} 个成员`)
  if (skippedMessages > 0) {
    onLog?.('info', `跳过 ${skippedMessages} 条无效消息(缺少发送者ID或时间戳无效)`)
  }
  yield {
    type: 'done',
    data: { messageCount: messagesProcessed, memberCount: memberMap.size },
  }
}
// ==================== Exports ====================
/** Parser object pairing the detection feature with the chunked-jsonl parse generator. */
export const parser_: Parser = {
feature,
parse: parseChunkedJsonl,
}
/** Format module (feature + parser); the default export that the formats index imports. */
const module_: FormatModule = {
feature,
parser: parser_,
}
export default module_