mirror of
https://github.com/hellodigua/ChatLab.git
synced 2026-05-06 13:06:09 +08:00
295 lines
8.6 KiB
TypeScript
295 lines
8.6 KiB
TypeScript
/**
|
|
* shuakami/qq-chat-exporter 格式预处理器
|
|
* 适配项目: https://github.com/shuakami/qq-chat-exporter
|
|
*
|
|
* 功能:移除 content.html、content.raw 等冗余字段,减小文件体积
|
|
* 阈值:>50MB 自动触发预处理
|
|
*/
|
|
|
|
import * as fs from 'fs'
|
|
import * as path from 'path'
|
|
import * as os from 'os'
|
|
import { parser } from 'stream-json'
|
|
import { pick } from 'stream-json/filters/Pick'
|
|
import { streamValues } from 'stream-json/streamers/StreamValues'
|
|
import { chain } from 'stream-chain'
|
|
import type { ParseProgress, Preprocessor } from '../types'
|
|
import { getFileSize, createProgress } from '../utils'
|
|
|
|
/** 预处理阈值:50MB */
|
|
const PREPROCESS_THRESHOLD = 50 * 1024 * 1024
|
|
|
|
/**
|
|
* 获取临时目录
|
|
*/
|
|
function getTempDir(): string {
|
|
return path.join(os.tmpdir(), 'chatlab')
|
|
}
|
|
|
|
/**
|
|
* 确保目录存在
|
|
*/
|
|
function ensureDir(dir: string): void {
|
|
if (!fs.existsSync(dir)) {
|
|
fs.mkdirSync(dir, { recursive: true })
|
|
}
|
|
}
|
|
|
|
/**
|
|
* QQ JSON 消息的精简结构
|
|
*/
|
|
interface SlimQQMessage {
|
|
id?: string
|
|
messageId?: string
|
|
timestamp: number | string
|
|
sender: {
|
|
uid?: string
|
|
uin?: string
|
|
name: string
|
|
}
|
|
type?: string
|
|
messageType?: number
|
|
content: {
|
|
text: string
|
|
elements?: Array<{ type: string }>
|
|
resources?: Array<{ type: string }>
|
|
emojis?: Array<{ type: string }>
|
|
}
|
|
recalled?: boolean
|
|
isRecalled?: boolean
|
|
system?: boolean
|
|
isSystemMessage?: boolean
|
|
// V4 新增:保留 rawMessage 中的名字字段
|
|
rawMessage?: {
|
|
sendNickName?: string
|
|
sendMemberName?: string
|
|
senderUin?: string
|
|
senderUid?: string
|
|
}
|
|
}
|
|
|
|
/**
|
|
* 精简 QQ JSON 消息对象
|
|
*/
|
|
function slimMessage(msg: Record<string, unknown>): SlimQQMessage {
|
|
const sender = msg.sender as { uin?: string; uid?: string; name?: string } | undefined
|
|
const content = msg.content as Record<string, unknown> | undefined
|
|
const rawMessage = msg.rawMessage as Record<string, unknown> | undefined
|
|
|
|
const slimContent: SlimQQMessage['content'] = {
|
|
text: (content?.text as string) || '',
|
|
}
|
|
|
|
if (content?.elements && Array.isArray(content.elements)) {
|
|
slimContent.elements = (content.elements as Array<{ type: string }>).map((e) => ({
|
|
type: e.type,
|
|
}))
|
|
}
|
|
|
|
if (content?.resources && Array.isArray(content.resources)) {
|
|
slimContent.resources = (content.resources as Array<{ type: string }>).map((r) => ({
|
|
type: r.type,
|
|
}))
|
|
}
|
|
|
|
if (content?.emojis && Array.isArray(content.emojis)) {
|
|
slimContent.emojis = (content.emojis as Array<{ type: string }>).map((e) => ({
|
|
type: e.type,
|
|
}))
|
|
}
|
|
|
|
const slimMsg: SlimQQMessage = {
|
|
timestamp: msg.timestamp as number | string,
|
|
sender: { name: sender?.name || '' },
|
|
content: slimContent,
|
|
}
|
|
|
|
// 旧格式字段
|
|
if (msg.id) slimMsg.id = msg.id as string
|
|
if (msg.type) slimMsg.type = msg.type as string
|
|
if (msg.recalled) slimMsg.recalled = msg.recalled as boolean
|
|
if (msg.system) slimMsg.system = msg.system as boolean
|
|
|
|
// V4 新格式字段
|
|
if (msg.messageId) slimMsg.messageId = msg.messageId as string
|
|
if (msg.messageType !== undefined) slimMsg.messageType = msg.messageType as number
|
|
if (msg.isRecalled) slimMsg.isRecalled = msg.isRecalled as boolean
|
|
if (msg.isSystemMessage) slimMsg.isSystemMessage = msg.isSystemMessage as boolean
|
|
|
|
// sender 字段
|
|
if (sender?.uin) slimMsg.sender.uin = sender.uin
|
|
if (sender?.uid) slimMsg.sender.uid = sender.uid
|
|
|
|
// V4 新增:保留 rawMessage 中的关键名字字段
|
|
if (rawMessage) {
|
|
slimMsg.rawMessage = {}
|
|
if (rawMessage.sendNickName) slimMsg.rawMessage.sendNickName = rawMessage.sendNickName as string
|
|
if (rawMessage.sendMemberName) slimMsg.rawMessage.sendMemberName = rawMessage.sendMemberName as string
|
|
if (rawMessage.senderUin) slimMsg.rawMessage.senderUin = rawMessage.senderUin as string
|
|
if (rawMessage.senderUid) slimMsg.rawMessage.senderUid = rawMessage.senderUid as string
|
|
}
|
|
|
|
return slimMsg
|
|
}
|
|
|
|
/**
|
|
* 预处理 QQ JSON 文件
|
|
*/
|
|
async function preprocessQQJson(inputPath: string, onProgress?: (progress: ParseProgress) => void): Promise<string> {
|
|
const totalBytes = getFileSize(inputPath)
|
|
let bytesRead = 0
|
|
let messagesProcessed = 0
|
|
|
|
const tempDir = getTempDir()
|
|
ensureDir(tempDir)
|
|
const outputFilename = `slim_${Date.now()}_${path.basename(inputPath)}`
|
|
const outputPath = path.join(tempDir, outputFilename)
|
|
|
|
onProgress?.(createProgress('parsing', 0, totalBytes, 0, '预处理:读取文件头...'))
|
|
|
|
return new Promise((resolve, reject) => {
|
|
const headChunks: string[] = []
|
|
let headSize = 0
|
|
const maxHeadSize = 100000
|
|
|
|
const headStream = fs.createReadStream(inputPath, { encoding: 'utf-8' })
|
|
let chatInfo: Record<string, unknown> = { name: '未知群聊', type: 'group' }
|
|
let metadata: Record<string, unknown> | undefined
|
|
let statistics: Record<string, unknown> | undefined
|
|
|
|
headStream.on('data', (chunk: string | Buffer) => {
|
|
const str = typeof chunk === 'string' ? chunk : chunk.toString('utf-8')
|
|
if (headSize < maxHeadSize) {
|
|
headChunks.push(str)
|
|
headSize += str.length
|
|
} else {
|
|
headStream.destroy()
|
|
}
|
|
})
|
|
|
|
headStream.on('close', () => {
|
|
const headContent = headChunks.join('')
|
|
|
|
try {
|
|
const chatInfoMatch = headContent.match(/"chatInfo"\s*:\s*(\{[^}]+\})/)
|
|
if (chatInfoMatch) {
|
|
chatInfo = JSON.parse(chatInfoMatch[1])
|
|
}
|
|
} catch {
|
|
// 使用默认值
|
|
}
|
|
|
|
try {
|
|
const metadataMatch = headContent.match(/"metadata"\s*:\s*(\{[^}]+\})/)
|
|
if (metadataMatch) {
|
|
metadata = JSON.parse(metadataMatch[1])
|
|
}
|
|
} catch {
|
|
// 忽略
|
|
}
|
|
|
|
// 解析 statistics 字段(完整保留,用于聊天类型判断)
|
|
// statistics 是嵌套对象,后面紧跟 "messages" 字段
|
|
try {
|
|
const statisticsMatch = headContent.match(/"statistics"\s*:\s*(\{[\s\S]*?\})\s*,\s*"messages"/)
|
|
if (statisticsMatch) {
|
|
statistics = JSON.parse(statisticsMatch[1])
|
|
}
|
|
} catch {
|
|
// 解析失败时忽略
|
|
}
|
|
|
|
onProgress?.(createProgress('parsing', 0, totalBytes, 0, '预处理:开始精简消息...'))
|
|
|
|
const readStream = fs.createReadStream(inputPath, { encoding: 'utf-8' })
|
|
const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf-8' })
|
|
|
|
readStream.on('data', (chunk: string | Buffer) => {
|
|
bytesRead += typeof chunk === 'string' ? Buffer.byteLength(chunk) : chunk.length
|
|
})
|
|
|
|
const header = { metadata, chatInfo, statistics, messages: [] }
|
|
const headerJson = JSON.stringify(header)
|
|
// 移除最后的 ]} 保留 [,结果如 {"metadata":...,"chatInfo":...,"statistics":...,"messages":[
|
|
writeStream.write(headerJson.slice(0, -2) + '\n')
|
|
|
|
let isFirstMessage = true
|
|
|
|
const pipeline = chain([readStream, parser(), pick({ filter: /^messages\.\d+$/ }), streamValues()])
|
|
|
|
pipeline.on('data', ({ value }: { value: Record<string, unknown> }) => {
|
|
const slimMsg = slimMessage(value)
|
|
const msgJson = JSON.stringify(slimMsg)
|
|
|
|
if (isFirstMessage) {
|
|
writeStream.write(msgJson)
|
|
isFirstMessage = false
|
|
} else {
|
|
writeStream.write(',\n' + msgJson)
|
|
}
|
|
|
|
messagesProcessed++
|
|
|
|
if (messagesProcessed % 10000 === 0) {
|
|
onProgress?.(
|
|
createProgress(
|
|
'parsing',
|
|
bytesRead,
|
|
totalBytes,
|
|
messagesProcessed,
|
|
`预处理:已精简 ${messagesProcessed} 条消息...`
|
|
)
|
|
)
|
|
}
|
|
})
|
|
|
|
pipeline.on('end', () => {
|
|
writeStream.write('\n]}')
|
|
writeStream.end()
|
|
|
|
writeStream.on('finish', () => {
|
|
onProgress?.(createProgress('done', totalBytes, totalBytes, messagesProcessed, '预处理完成'))
|
|
resolve(outputPath)
|
|
})
|
|
})
|
|
|
|
pipeline.on('error', (err) => {
|
|
writeStream.destroy()
|
|
if (fs.existsSync(outputPath)) {
|
|
fs.unlinkSync(outputPath)
|
|
}
|
|
onProgress?.(createProgress('error', bytesRead, totalBytes, messagesProcessed, `预处理错误: ${err.message}`))
|
|
reject(err)
|
|
})
|
|
})
|
|
|
|
headStream.on('error', reject)
|
|
})
|
|
}
|
|
|
|
/**
|
|
* 清理临时文件
|
|
*/
|
|
function cleanupTempFile(filePath: string): void {
|
|
try {
|
|
if (fs.existsSync(filePath) && filePath.includes(getTempDir())) {
|
|
fs.unlinkSync(filePath)
|
|
}
|
|
} catch {
|
|
// 忽略清理错误
|
|
}
|
|
}
|
|
|
|
/**
|
|
* QQ Chat Exporter 预处理器
|
|
*/
|
|
export const qqPreprocessor: Preprocessor = {
|
|
needsPreprocess(filePath: string, fileSize: number): boolean {
|
|
return fileSize > PREPROCESS_THRESHOLD
|
|
},
|
|
|
|
preprocess: preprocessQQJson,
|
|
|
|
cleanup: cleanupTempFile,
|
|
}
|