Files
ChatLab/electron/main/parser/formats/shuakami-qq-preprocessor.ts
2025-12-05 23:04:46 +08:00

295 lines
8.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* shuakami/qq-chat-exporter 格式预处理器
* 适配项目: https://github.com/shuakami/qq-chat-exporter
*
* 功能:移除 content.html、content.raw 等冗余字段,减小文件体积
* 阈值:>50MB 自动触发预处理
*/
import * as fs from 'fs'
import * as path from 'path'
import * as os from 'os'
import { parser } from 'stream-json'
import { pick } from 'stream-json/filters/Pick'
import { streamValues } from 'stream-json/streamers/StreamValues'
import { chain } from 'stream-chain'
import type { ParseProgress, Preprocessor } from '../types'
import { getFileSize, createProgress } from '../utils'
/** Size limit in bytes (50 MiB); files strictly larger than this are preprocessed. */
const PREPROCESS_THRESHOLD = 50 * 1024 ** 2
/**
 * Path of the `chatlab` scratch directory inside the operating system's temp folder.
 *
 * @returns `<os.tmpdir()>/chatlab`. The directory is not created by this call.
 */
function getTempDir(): string {
  const systemTmp = os.tmpdir()
  return path.join(systemTmp, 'chatlab')
}
/**
 * Create `dir` (including missing parents) unless something already exists at that path.
 *
 * @param dir - directory path to create
 */
function ensureDir(dir: string): void {
  // Anything already present at the path is left untouched.
  if (fs.existsSync(dir)) return
  fs.mkdirSync(dir, { recursive: true })
}
/**
* Slimmed-down shape of one QQ JSON message as written to the preprocessed file.
*
* Holds fields from both the legacy export format and the V4 format; every
* format-specific field is optional and only present when the source message
* carried it.
*/
interface SlimQQMessage {
// Legacy format: message id.
id?: string
// V4 format: message id.
messageId?: string
// Copied from the source message without conversion.
timestamp: number | string
sender: {
uid?: string
uin?: string
// Empty string when the source message has no sender name.
name: string
}
// Legacy format: message type.
type?: string
// V4 format: message type.
messageType?: number
content: {
// Empty string when the source message has no text.
text: string
// For the three lists below only each entry's `type` is kept.
elements?: Array<{ type: string }>
resources?: Array<{ type: string }>
emojis?: Array<{ type: string }>
}
// Legacy format: recalled flag.
recalled?: boolean
// V4 format: recalled flag.
isRecalled?: boolean
// Legacy format: system-message flag.
system?: boolean
// V4 format: system-message flag.
isSystemMessage?: boolean
// Added for V4: keep the name/identity fields from rawMessage.
rawMessage?: {
sendNickName?: string
sendMemberName?: string
senderUin?: string
senderUid?: string
}
}
/**
 * Reduce one exported QQ message to the fields kept in the slim output.
 *
 * Always kept: `timestamp`, the sender name (empty string when missing) and
 * `content.text` (empty string when missing). For `content.elements`,
 * `content.resources` and `content.emojis` only each entry's `type` survives.
 * Legacy fields (`id`, `type`, `recalled`, `system`) and V4 fields
 * (`messageId`, `isRecalled`, `isSystemMessage`) are copied only when truthy;
 * `messageType` is copied whenever it is not `undefined`. Everything else on
 * the input is dropped.
 *
 * @param msg - one message object as parsed from the export JSON
 * @returns a newly built slim message; `msg` itself is not modified
 */
function slimMessage(msg: Record<string, unknown>): SlimQQMessage {
  const sender = msg.sender as { uin?: string; uid?: string; name?: string } | undefined
  const content = msg.content as Record<string, unknown> | undefined
  const raw = msg.rawMessage as Record<string, unknown> | undefined

  // content: text plus type-only copies of the three entry lists.
  const slimContent: SlimQQMessage['content'] = { text: (content?.text as string) || '' }
  for (const listKey of ['elements', 'resources', 'emojis'] as const) {
    const list = content?.[listKey]
    if (Array.isArray(list)) {
      slimContent[listKey] = (list as Array<{ type: string }>).map((entry) => ({ type: entry.type }))
    }
  }

  // sender: name is always present, ids only when truthy.
  const slimSender: SlimQQMessage['sender'] = { name: sender?.name || '' }
  if (sender?.uin) slimSender.uin = sender.uin
  if (sender?.uid) slimSender.uid = sender.uid

  const slim: SlimQQMessage = {
    timestamp: msg.timestamp as number | string,
    sender: slimSender,
    content: slimContent,
  }

  // Copies a top-level field onto the result only when the source value is truthy.
  const copyIfTruthy = <K extends keyof SlimQQMessage>(key: K): void => {
    const value = msg[key]
    if (value) slim[key] = value as SlimQQMessage[K]
  }

  // Legacy-format fields.
  for (const key of ['id', 'type', 'recalled', 'system'] as const) copyIfTruthy(key)

  // V4-format fields. messageType is kept whenever it is defined, so 0 survives.
  copyIfTruthy('messageId')
  if (msg.messageType !== undefined) slim.messageType = msg.messageType as number
  for (const key of ['isRecalled', 'isSystemMessage'] as const) copyIfTruthy(key)

  // V4: keep the name/identity fields of rawMessage (an empty object if none are set).
  if (raw) {
    const slimRaw: NonNullable<SlimQQMessage['rawMessage']> = {}
    for (const key of ['sendNickName', 'sendMemberName', 'senderUin', 'senderUid'] as const) {
      const value = raw[key]
      if (value) slimRaw[key] = value as string
    }
    slim.rawMessage = slimRaw
  }

  return slim
}
/**
 * Read the leading part of a UTF-8 text file.
 *
 * Reading stops as soon as at least `maxChars` characters have been collected,
 * so the result can overshoot the limit by up to one stream chunk.
 *
 * @param filePath - file to read
 * @param maxChars - soft limit on the number of characters to collect
 * @returns the collected text; rejects when the file cannot be read
 */
function readFileHead(filePath: string, maxChars: number): Promise<string> {
  return new Promise((resolve, reject) => {
    const chunks: string[] = []
    let collected = 0
    const stream = fs.createReadStream(filePath, { encoding: 'utf-8' })

    stream.on('data', (chunk: string | Buffer) => {
      const text = typeof chunk === 'string' ? chunk : chunk.toString('utf-8')
      chunks.push(text)
      collected += text.length
      if (collected >= maxChars) stream.destroy()
    })
    stream.on('error', reject)
    // 'close' is also emitted after 'error'; resolving an already rejected promise is a no-op.
    stream.on('close', () => resolve(chunks.join('')))
  })
}

/**
 * Locate one JSON object in the file head with `pattern` (first capture group)
 * and parse it.
 *
 * @returns the parsed object, or undefined when the pattern does not match or
 *          the captured text is not valid JSON
 */
function extractHeadObject(head: string, pattern: RegExp): Record<string, unknown> | undefined {
  const match = head.match(pattern)
  if (!match) return undefined
  try {
    return JSON.parse(match[1]) as Record<string, unknown>
  } catch {
    return undefined
  }
}

/**
 * Preprocess a QQ JSON export into a slimmed copy in the temp directory.
 *
 * Pass 1 reads the head of the file and extracts `chatInfo`, `metadata` and
 * `statistics` with regexes. Pass 2 streams every entry of the top-level
 * `messages` array through `slimMessage` and writes the result as
 * `{...header, "messages":[ ... ]}`.
 *
 * @param inputPath - path of the original export
 * @param onProgress - optional progress callback
 * @returns path of the slimmed file
 * @throws rejects when the input cannot be read or parsed, when a message
 *         cannot be slimmed, or when the output cannot be written; the partial
 *         output file is removed (best effort) before rejecting
 */
async function preprocessQQJson(inputPath: string, onProgress?: (progress: ParseProgress) => void): Promise<string> {
  const totalBytes = getFileSize(inputPath)
  const tempDir = getTempDir()
  ensureDir(tempDir)
  const outputPath = path.join(tempDir, `slim_${Date.now()}_${path.basename(inputPath)}`)

  onProgress?.(createProgress('parsing', 0, totalBytes, 0, '预处理:读取文件头...'))

  // Pass 1: header fields. A read failure rejects here, before any output file is created.
  const headContent = await readFileHead(inputPath, 100000)
  const chatInfo = extractHeadObject(headContent, /"chatInfo"\s*:\s*(\{[^}]+\})/) ?? {
    name: '未知群聊',
    type: 'group',
  }
  const metadata = extractHeadObject(headContent, /"metadata"\s*:\s*(\{[^}]+\})/)
  // statistics is a nested object that is immediately followed by the "messages" field;
  // it is kept whole (used for chat-type detection).
  const statistics = extractHeadObject(headContent, /"statistics"\s*:\s*(\{[\s\S]*?\})\s*,\s*"messages"/)

  onProgress?.(createProgress('parsing', 0, totalBytes, 0, '预处理:开始精简消息...'))

  // Pass 2: stream the messages array into the slim file.
  return new Promise<string>((resolve, reject) => {
    let bytesRead = 0
    let messagesProcessed = 0
    let settled = false
    let awaitingDrain = false

    const readStream = fs.createReadStream(inputPath, { encoding: 'utf-8' })
    const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf-8' })

    readStream.on('data', (chunk: string | Buffer) => {
      bytesRead += typeof chunk === 'string' ? Buffer.byteLength(chunk) : chunk.length
    })

    const pipeline = chain([readStream, parser(), pick({ filter: /^messages\.\d+$/ }), streamValues()])

    // Single failure path: stop both file streams, drop the partial output, reject once.
    const fail = (err: Error): void => {
      if (settled) return
      settled = true
      readStream.destroy()
      writeStream.destroy()
      cleanupTempFile(outputPath)
      onProgress?.(createProgress('error', bytesRead, totalBytes, messagesProcessed, `预处理错误: ${err.message}`))
      reject(err)
    }

    // Write with backpressure: when the file stream's buffer is full, stop pulling
    // messages until it has drained, so pending output cannot grow without bound.
    const writeOut = (text: string): void => {
      if (writeStream.write(text) || awaitingDrain) return
      awaitingDrain = true
      pipeline.pause()
      writeStream.once('drain', () => {
        awaitingDrain = false
        pipeline.resume()
      })
    }

    // Serialize the header with an empty messages array, then cut the trailing "]}"
    // so the output ends with `"messages":[` and messages can be appended.
    const headerJson = JSON.stringify({ metadata, chatInfo, statistics, messages: [] })
    writeOut(headerJson.slice(0, -2) + '\n')

    pipeline.on('data', ({ value }: { value: Record<string, unknown> }) => {
      if (settled) return

      let msgJson: string
      try {
        msgJson = JSON.stringify(slimMessage(value))
      } catch (error) {
        // A malformed message becomes a rejection instead of an uncaught exception.
        fail(error instanceof Error ? error : new Error(String(error)))
        return
      }

      // Every message after the first is preceded by a separator.
      writeOut(messagesProcessed === 0 ? msgJson : ',\n' + msgJson)
      messagesProcessed++

      if (messagesProcessed % 10000 === 0) {
        onProgress?.(
          createProgress(
            'parsing',
            bytesRead,
            totalBytes,
            messagesProcessed,
            `预处理:已精简 ${messagesProcessed} 条消息...`
          )
        )
      }
    })

    pipeline.on('end', () => {
      if (settled) return
      writeStream.once('finish', () => {
        if (settled) return
        settled = true
        onProgress?.(createProgress('done', totalBytes, totalBytes, messagesProcessed, '预处理完成'))
        resolve(outputPath)
      })
      // Close the messages array and the root object.
      writeStream.end('\n]}')
    })

    pipeline.on('error', (err: Error) => fail(err))
    // Without this listener a write failure would be an unhandled 'error' event.
    writeStream.on('error', (err: Error) => fail(err))
  })
}
/**
 * Best-effort removal of a temp file produced by the preprocessor.
 *
 * The file is deleted only when its resolved path lies strictly inside the
 * directory returned by `getTempDir()`. A plain substring test is not enough:
 * it would also accept sibling paths such as `<tmp>/chatlab-backup/x` or any
 * path that merely embeds the temp directory string.
 *
 * @param filePath - path of the file to remove; paths outside the temp
 *                   directory and missing files are silently ignored
 */
function cleanupTempFile(filePath: string): void {
  try {
    const relative = path.relative(getTempDir(), path.resolve(filePath))
    const escapesTempDir =
      relative === '..' || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)
    // '' means the path is the temp directory itself, which is never removed here.
    if (relative === '' || escapesTempDir) return

    if (fs.existsSync(filePath)) {
      fs.unlinkSync(filePath)
    }
  } catch {
    // Cleanup is best-effort; failures are deliberately ignored.
  }
}
/**
 * Preprocessor for QQ Chat Exporter JSON files.
 *
 * - `needsPreprocess`: true when the file is larger than `PREPROCESS_THRESHOLD`.
 * - `preprocess`: writes the slimmed copy and returns its path.
 * - `cleanup`: removes a slimmed copy.
 */
export const qqPreprocessor: Preprocessor = {
  // Only the size decides; the path is accepted but not inspected.
  needsPreprocess: (filePath: string, fileSize: number): boolean => PREPROCESS_THRESHOLD < fileSize,
  preprocess: preprocessQQJson,
  cleanup: cleanupTempFile,
}