mirror of
https://github.com/hellodigua/ChatLab.git
synced 2026-04-30 16:52:45 +08:00
373 lines
10 KiB
TypeScript
373 lines
10 KiB
TypeScript
/**
|
||
* shuakami/qq-chat-exporter 格式预处理器
|
||
* 适配项目: https://github.com/shuakami/qq-chat-exporter
|
||
*
|
||
* 功能:移除 content.html、content.raw 等冗余字段,减小文件体积
|
||
* 阈值:>50MB 自动触发预处理
|
||
*/
|
||
|
||
import * as fs from 'fs'
|
||
import * as path from 'path'
|
||
import * as os from 'os'
|
||
import { parser } from 'stream-json'
|
||
import { pick } from 'stream-json/filters/Pick'
|
||
import { streamValues } from 'stream-json/streamers/StreamValues'
|
||
import { chain } from 'stream-chain'
|
||
import type { ParseProgress, Preprocessor } from '../types'
|
||
import { getFileSize, createProgress } from '../utils'
|
||
|
||
/** 预处理阈值:50MB */
|
||
const PREPROCESS_THRESHOLD = 50 * 1024 * 1024
|
||
|
||
/**
|
||
* 获取临时目录
|
||
*/
|
||
function getTempDir(): string {
|
||
return path.join(os.tmpdir(), 'chatlab')
|
||
}
|
||
|
||
/**
|
||
* 确保目录存在
|
||
*/
|
||
function ensureDir(dir: string): void {
|
||
if (!fs.existsSync(dir)) {
|
||
fs.mkdirSync(dir, { recursive: true })
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 从字符串中提取 JSON 对象(处理嵌套和转义)
|
||
*/
|
||
function extractJsonObject(content: string, key: string): string | null {
|
||
const searchStr = `"${key}":`
|
||
const startIdx = content.indexOf(searchStr)
|
||
if (startIdx === -1) return null
|
||
|
||
let i = startIdx + searchStr.length
|
||
while (i < content.length && /\s/.test(content[i])) i++
|
||
|
||
if (content[i] !== '{') return null
|
||
|
||
let braceDepth = 0
|
||
let inString = false
|
||
let escape = false
|
||
const objStart = i
|
||
|
||
for (; i < content.length; i++) {
|
||
const char = content[i]
|
||
|
||
if (escape) {
|
||
escape = false
|
||
continue
|
||
}
|
||
|
||
if (char === '\\' && inString) {
|
||
escape = true
|
||
continue
|
||
}
|
||
|
||
if (char === '"') {
|
||
inString = !inString
|
||
continue
|
||
}
|
||
|
||
if (!inString) {
|
||
if (char === '{') braceDepth++
|
||
if (char === '}') {
|
||
braceDepth--
|
||
if (braceDepth === 0) {
|
||
return content.slice(objStart, i + 1)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
return null
|
||
}
|
||
|
||
/**
|
||
* 从文件末尾读取 avatars 对象
|
||
*/
|
||
function readAvatarsFromFile(filePath: string): string | null {
|
||
try {
|
||
const stats = fs.statSync(filePath)
|
||
const tailSize = Math.min(stats.size, 5000000) // 最多读取 5MB
|
||
const fd = fs.openSync(filePath, 'r')
|
||
const buffer = Buffer.alloc(tailSize)
|
||
fs.readSync(fd, buffer, 0, tailSize, stats.size - tailSize)
|
||
fs.closeSync(fd)
|
||
|
||
const tailContent = buffer.toString('utf-8')
|
||
return extractJsonObject(tailContent, 'avatars')
|
||
} catch {
|
||
return null
|
||
}
|
||
}
|
||
|
||
/**
|
||
* QQ JSON 消息的精简结构
|
||
*/
|
||
interface SlimQQMessage {
|
||
id?: string
|
||
messageId?: string
|
||
timestamp: number | string
|
||
sender: {
|
||
uid?: string
|
||
uin?: string
|
||
name: string
|
||
}
|
||
type?: string
|
||
messageType?: number
|
||
content: {
|
||
text: string
|
||
elements?: Array<{ type: string }>
|
||
resources?: Array<{ type: string }>
|
||
emojis?: Array<{ type: string }>
|
||
}
|
||
recalled?: boolean
|
||
isRecalled?: boolean
|
||
system?: boolean
|
||
isSystemMessage?: boolean
|
||
rawMessage?: {
|
||
sendNickName?: string
|
||
sendMemberName?: string
|
||
senderUin?: string
|
||
senderUid?: string
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 精简 QQ JSON 消息对象
|
||
*/
|
||
function slimMessage(msg: Record<string, unknown>): SlimQQMessage {
|
||
const sender = msg.sender as { uin?: string; uid?: string; name?: string } | undefined
|
||
const content = msg.content as Record<string, unknown> | undefined
|
||
const rawMessage = msg.rawMessage as Record<string, unknown> | undefined
|
||
|
||
const slimContent: SlimQQMessage['content'] = {
|
||
text: (content?.text as string) || '',
|
||
}
|
||
|
||
if (content?.elements && Array.isArray(content.elements)) {
|
||
slimContent.elements = (content.elements as Array<{ type: string }>).map((e) => ({
|
||
type: e.type,
|
||
}))
|
||
}
|
||
|
||
if (content?.resources && Array.isArray(content.resources)) {
|
||
slimContent.resources = (content.resources as Array<{ type: string }>).map((r) => ({
|
||
type: r.type,
|
||
}))
|
||
}
|
||
|
||
if (content?.emojis && Array.isArray(content.emojis)) {
|
||
slimContent.emojis = (content.emojis as Array<{ type: string }>).map((e) => ({
|
||
type: e.type,
|
||
}))
|
||
}
|
||
|
||
const slimMsg: SlimQQMessage = {
|
||
timestamp: msg.timestamp as number | string,
|
||
sender: { name: sender?.name || '' },
|
||
content: slimContent,
|
||
}
|
||
|
||
// 旧格式字段
|
||
if (msg.id) slimMsg.id = msg.id as string
|
||
if (msg.type) slimMsg.type = msg.type as string
|
||
if (msg.recalled) slimMsg.recalled = msg.recalled as boolean
|
||
if (msg.system) slimMsg.system = msg.system as boolean
|
||
|
||
// V4 新格式字段
|
||
if (msg.messageId) slimMsg.messageId = msg.messageId as string
|
||
if (msg.messageType !== undefined) slimMsg.messageType = msg.messageType as number
|
||
if (msg.isRecalled) slimMsg.isRecalled = msg.isRecalled as boolean
|
||
if (msg.isSystemMessage) slimMsg.isSystemMessage = msg.isSystemMessage as boolean
|
||
|
||
// sender 字段
|
||
if (sender?.uin) slimMsg.sender.uin = sender.uin
|
||
if (sender?.uid) slimMsg.sender.uid = sender.uid
|
||
|
||
// V4 新增:保留 rawMessage 中的关键名字字段
|
||
if (rawMessage) {
|
||
slimMsg.rawMessage = {}
|
||
if (rawMessage.sendNickName) slimMsg.rawMessage.sendNickName = rawMessage.sendNickName as string
|
||
if (rawMessage.sendMemberName) slimMsg.rawMessage.sendMemberName = rawMessage.sendMemberName as string
|
||
if (rawMessage.senderUin) slimMsg.rawMessage.senderUin = rawMessage.senderUin as string
|
||
if (rawMessage.senderUid) slimMsg.rawMessage.senderUid = rawMessage.senderUid as string
|
||
}
|
||
|
||
return slimMsg
|
||
}
|
||
|
||
/**
|
||
* 预处理 QQ JSON 文件
|
||
*/
|
||
async function preprocessQQJson(inputPath: string, onProgress?: (progress: ParseProgress) => void): Promise<string> {
|
||
const totalBytes = getFileSize(inputPath)
|
||
let bytesRead = 0
|
||
let messagesProcessed = 0
|
||
|
||
const tempDir = getTempDir()
|
||
ensureDir(tempDir)
|
||
const outputFilename = `slim_${Date.now()}_${path.basename(inputPath)}`
|
||
const outputPath = path.join(tempDir, outputFilename)
|
||
|
||
onProgress?.(createProgress('parsing', 0, totalBytes, 0, ''))
|
||
|
||
// 先从原文件读取 avatars(因为它在文件末尾,消息处理时可能无法访问)
|
||
const avatarsStr = readAvatarsFromFile(inputPath)
|
||
|
||
return new Promise((resolve, reject) => {
|
||
const headChunks: string[] = []
|
||
let headSize = 0
|
||
const maxHeadSize = 100000
|
||
|
||
const headStream = fs.createReadStream(inputPath, { encoding: 'utf-8' })
|
||
let chatInfo: Record<string, unknown> = { name: '未知群聊', type: 'group' }
|
||
let metadata: Record<string, unknown> | undefined
|
||
let statistics: Record<string, unknown> | undefined
|
||
|
||
headStream.on('data', (chunk: string | Buffer) => {
|
||
const str = typeof chunk === 'string' ? chunk : chunk.toString('utf-8')
|
||
if (headSize < maxHeadSize) {
|
||
headChunks.push(str)
|
||
headSize += str.length
|
||
} else {
|
||
headStream.destroy()
|
||
}
|
||
})
|
||
|
||
headStream.on('close', () => {
|
||
const headContent = headChunks.join('')
|
||
|
||
try {
|
||
const chatInfoMatch = headContent.match(/"chatInfo"\s*:\s*(\{[^}]+\})/)
|
||
if (chatInfoMatch) {
|
||
chatInfo = JSON.parse(chatInfoMatch[1])
|
||
}
|
||
} catch {
|
||
// 使用默认值
|
||
}
|
||
|
||
try {
|
||
const metadataMatch = headContent.match(/"metadata"\s*:\s*(\{[^}]+\})/)
|
||
if (metadataMatch) {
|
||
metadata = JSON.parse(metadataMatch[1])
|
||
}
|
||
} catch {
|
||
// 忽略
|
||
}
|
||
|
||
try {
|
||
const statisticsMatch = headContent.match(/"statistics"\s*:\s*(\{[\s\S]*?\})\s*,\s*"messages"/)
|
||
if (statisticsMatch) {
|
||
statistics = JSON.parse(statisticsMatch[1])
|
||
}
|
||
} catch {
|
||
// 解析失败时忽略
|
||
}
|
||
|
||
onProgress?.(createProgress('parsing', 0, totalBytes, 0, ''))
|
||
|
||
const readStream = fs.createReadStream(inputPath, { encoding: 'utf-8' })
|
||
const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf-8' })
|
||
|
||
readStream.on('data', (chunk: string | Buffer) => {
|
||
bytesRead += typeof chunk === 'string' ? Buffer.byteLength(chunk) : chunk.length
|
||
})
|
||
|
||
const header = { metadata, chatInfo, statistics, messages: [] }
|
||
const headerJson = JSON.stringify(header)
|
||
// 移除最后的 ]} 保留 [
|
||
writeStream.write(headerJson.slice(0, -2) + '\n')
|
||
|
||
let isFirstMessage = true
|
||
|
||
const pipeline = chain([readStream, parser(), pick({ filter: /^messages\.\d+$/ }), streamValues()])
|
||
|
||
pipeline.on('data', ({ value }: { value: Record<string, unknown> }) => {
|
||
const slimMsg = slimMessage(value)
|
||
const msgJson = JSON.stringify(slimMsg)
|
||
|
||
if (isFirstMessage) {
|
||
writeStream.write(msgJson)
|
||
isFirstMessage = false
|
||
} else {
|
||
writeStream.write(',\n' + msgJson)
|
||
}
|
||
|
||
messagesProcessed++
|
||
|
||
if (messagesProcessed % 10000 === 0) {
|
||
onProgress?.(
|
||
createProgress(
|
||
'parsing',
|
||
bytesRead,
|
||
totalBytes,
|
||
messagesProcessed,
|
||
`预处理:已精简 ${messagesProcessed} 条消息...`
|
||
)
|
||
)
|
||
}
|
||
})
|
||
|
||
pipeline.on('end', () => {
|
||
// 关闭 messages 数组
|
||
writeStream.write('\n]')
|
||
|
||
// 添加 avatars 对象(如果存在)
|
||
if (avatarsStr) {
|
||
writeStream.write(',"avatars":' + avatarsStr)
|
||
}
|
||
|
||
// 关闭 JSON 对象
|
||
writeStream.write('}')
|
||
writeStream.end()
|
||
|
||
writeStream.on('finish', () => {
|
||
onProgress?.(createProgress('done', totalBytes, totalBytes, messagesProcessed, ''))
|
||
resolve(outputPath)
|
||
})
|
||
})
|
||
|
||
pipeline.on('error', (err) => {
|
||
writeStream.destroy()
|
||
if (fs.existsSync(outputPath)) {
|
||
fs.unlinkSync(outputPath)
|
||
}
|
||
onProgress?.(createProgress('error', bytesRead, totalBytes, messagesProcessed, err.message))
|
||
reject(err)
|
||
})
|
||
})
|
||
|
||
headStream.on('error', reject)
|
||
})
|
||
}
|
||
|
||
/**
|
||
* 清理临时文件
|
||
*/
|
||
function cleanupTempFile(filePath: string): void {
|
||
try {
|
||
if (fs.existsSync(filePath) && filePath.includes(getTempDir())) {
|
||
fs.unlinkSync(filePath)
|
||
}
|
||
} catch {
|
||
// 忽略清理错误
|
||
}
|
||
}
|
||
|
||
/**
|
||
* QQ Chat Exporter 预处理器
|
||
*/
|
||
export const qqPreprocessor: Preprocessor = {
|
||
needsPreprocess(filePath: string, fileSize: number): boolean {
|
||
return fileSize > PREPROCESS_THRESHOLD
|
||
},
|
||
|
||
preprocess: preprocessQQJson,
|
||
|
||
cleanup: cleanupTempFile,
|
||
}
|