mirror of
https://github.com/hellodigua/ChatLab.git
synced 2026-05-19 12:59:09 +08:00
feat: 支持WhatsApp原生格式消息
This commit is contained in:
@@ -12,6 +12,7 @@ import shuakamiQqExporter from './shuakami-qq-exporter'
|
||||
import yccccccyEchotrace from './ycccccccy-echotrace'
|
||||
import wechatDefault from './wechat-default'
|
||||
import qqNativeTxt from './qq-native-txt'
|
||||
import whatsappNativeTxt from './whatsapp-native-txt'
|
||||
|
||||
/**
|
||||
* 所有支持的格式模块(按优先级排序)
|
||||
@@ -22,8 +23,17 @@ export const formats: FormatModule[] = [
|
||||
shuakamiQqExporter, // 优先级 10 - shuakami/qq-chat-exporter
|
||||
yccccccyEchotrace, // 优先级 15 - ycccccccy/echotrace 微信导出
|
||||
wechatDefault, // 优先级 20 - 微信数据库导出 JSON
|
||||
whatsappNativeTxt, // 优先级 25 - WhatsApp 官方导出 TXT
|
||||
qqNativeTxt, // 优先级 30 - QQ 官方导出 TXT
|
||||
]
|
||||
|
||||
// 按名称导出,方便单独使用
|
||||
export { chatlab, chatlabJsonl, shuakamiQqExporter, yccccccyEchotrace, wechatDefault, qqNativeTxt }
|
||||
export {
|
||||
chatlab,
|
||||
chatlabJsonl,
|
||||
shuakamiQqExporter,
|
||||
yccccccyEchotrace,
|
||||
wechatDefault,
|
||||
qqNativeTxt,
|
||||
whatsappNativeTxt,
|
||||
}
|
||||
|
||||
@@ -0,0 +1,339 @@
|
||||
/**
|
||||
* WhatsApp 官方导出 TXT 格式解析器
|
||||
* 适配 WhatsApp 聊天导出功能
|
||||
*
|
||||
* 格式特征:
|
||||
* - 文件头:消息和通话已进行端到端加密
|
||||
* - 消息格式:YYYY/MM/DD HH:MM - 昵称: 内容
|
||||
* - 系统消息:YYYY/MM/DD HH:MM - 系统内容(无冒号分隔)
|
||||
* - 媒体占位:<省略影音内容>
|
||||
*/
|
||||
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
import * as readline from 'readline'
|
||||
import { KNOWN_PLATFORMS, ChatType, MessageType } from '../../../../src/types/base'
|
||||
import type {
|
||||
FormatFeature,
|
||||
FormatModule,
|
||||
Parser,
|
||||
ParseOptions,
|
||||
ParseEvent,
|
||||
ParsedMeta,
|
||||
ParsedMember,
|
||||
ParsedMessage,
|
||||
} from '../types'
|
||||
import { getFileSize, createProgress } from '../utils'
|
||||
|
||||
// ==================== 辅助函数 ====================
|
||||
|
||||
/**
|
||||
* 从文件名提取聊天名称
|
||||
* 例如:与开心每一天的 WhatsApp 聊天.txt → 开心每一天
|
||||
* 例如:与gaoberry37的 WhatsApp 聊天.txt → gaoberry37
|
||||
*/
|
||||
function extractNameFromFilePath(filePath: string): string {
|
||||
const basename = path.basename(filePath)
|
||||
// 匹配:与xxx的 WhatsApp 聊天.txt
|
||||
const match = basename.match(/^与(.+?)的\s*WhatsApp\s*聊天\.txt$/i)
|
||||
if (match) {
|
||||
return match[1].trim()
|
||||
}
|
||||
// 兜底:移除扩展名
|
||||
return basename.replace(/\.txt$/i, '') || '未知聊天'
|
||||
}
|
||||
|
||||
// ==================== 特征定义 ====================
|
||||
|
||||
export const feature: FormatFeature = {
|
||||
id: 'whatsapp-native-txt',
|
||||
name: 'WhatsApp 官方导出 (TXT)',
|
||||
platform: KNOWN_PLATFORMS.WHATSAPP,
|
||||
priority: 25,
|
||||
extensions: ['.txt'],
|
||||
signatures: {
|
||||
// WhatsApp 导出文件的特征
|
||||
head: [/消息和通话已进行端到端加密/, /WhatsApp/i],
|
||||
},
|
||||
}
|
||||
|
||||
// ==================== 辅助函数:清理不可见字符 ====================
|
||||
|
||||
/**
|
||||
* 清理行首/行尾的不可见 Unicode 字符
|
||||
* WhatsApp 导出文件中可能包含 BOM、Left-to-Right Mark (U+200E) 等
|
||||
*/
|
||||
function cleanLine(line: string): string {
|
||||
// 移除常见的不可见字符:BOM、LTR Mark、RTL Mark、零宽字符等
|
||||
return line.replace(/^[\uFEFF\u200E\u200F\u200B\u200C\u200D\u2060]+/, '').trim()
|
||||
}
|
||||
|
||||
// ==================== 消息头正则 ====================
|
||||
|
||||
// 匹配消息行:2025/12/22 12:35 - 地瓜: 内容
|
||||
// 或系统消息:2025/12/22 12:35 - 系统消息内容
|
||||
const MESSAGE_LINE_REGEX = /^(\d{4}\/\d{2}\/\d{2} \d{2}:\d{2}) - (.+)$/
|
||||
|
||||
// 从消息内容中分离昵称和实际内容
|
||||
// 格式:昵称: 内容
|
||||
const SENDER_CONTENT_REGEX = /^(.+?): (.*)$/
|
||||
|
||||
// ==================== 系统消息识别 ====================
|
||||
|
||||
const SYSTEM_MESSAGE_PATTERNS = [
|
||||
/消息和通话已进行端到端加密/,
|
||||
/创建了此群组/,
|
||||
/加入群组/,
|
||||
/添加了/,
|
||||
/退出了群组/,
|
||||
/移除了/,
|
||||
/更改了本群组/,
|
||||
/已将此群组的设置更改为/,
|
||||
/这条消息已删除/,
|
||||
]
|
||||
|
||||
function isSystemMessage(content: string): boolean {
|
||||
return SYSTEM_MESSAGE_PATTERNS.some((pattern) => pattern.test(content))
|
||||
}
|
||||
|
||||
// ==================== 消息类型判断 ====================
|
||||
|
||||
function detectMessageType(content: string): MessageType {
|
||||
const trimmed = content.trim()
|
||||
|
||||
// 媒体消息
|
||||
if (trimmed === '<省略影音内容>') return MessageType.IMAGE // 统一归类为图片
|
||||
if (trimmed.includes('<已附加:') || trimmed.includes('<附件:')) return MessageType.FILE
|
||||
|
||||
// 删除消息
|
||||
if (trimmed === '这条消息已删除') return MessageType.RECALL
|
||||
|
||||
// 系统消息
|
||||
if (isSystemMessage(trimmed)) return MessageType.SYSTEM
|
||||
|
||||
return MessageType.TEXT
|
||||
}
|
||||
|
||||
// ==================== 时间解析 ====================
|
||||
|
||||
/**
|
||||
* 解析 WhatsApp 时间格式为秒级时间戳
|
||||
* @param timeStr 格式:2025/12/22 12:35
|
||||
*/
|
||||
function parseWhatsAppTime(timeStr: string): number {
|
||||
// 将 YYYY/MM/DD HH:MM 转换为 YYYY-MM-DDTHH:MM
|
||||
const normalized = timeStr.replace(/\//g, '-').replace(' ', 'T') + ':00'
|
||||
const date = new Date(normalized)
|
||||
return Math.floor(date.getTime() / 1000)
|
||||
}
|
||||
|
||||
// ==================== 成员信息 ====================
|
||||
|
||||
interface MemberInfo {
|
||||
platformId: string
|
||||
nickname: string
|
||||
}
|
||||
|
||||
// ==================== 解析器实现 ====================
|
||||
|
||||
async function* parseWhatsApp(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
|
||||
const { filePath, batchSize = 5000, onProgress, onLog } = options
|
||||
|
||||
const totalBytes = getFileSize(filePath)
|
||||
let bytesRead = 0
|
||||
let messagesProcessed = 0
|
||||
let skippedLines = 0
|
||||
|
||||
// 发送初始进度
|
||||
const initialProgress = createProgress('parsing', 0, totalBytes, 0, '开始解析...')
|
||||
yield { type: 'progress', data: initialProgress }
|
||||
onProgress?.(initialProgress)
|
||||
|
||||
// 记录解析开始
|
||||
onLog?.('info', `开始解析 WhatsApp TXT 文件,大小: ${(totalBytes / 1024 / 1024).toFixed(2)} MB`)
|
||||
|
||||
// 收集数据
|
||||
const chatName = extractNameFromFilePath(filePath)
|
||||
const memberMap = new Map<string, MemberInfo>()
|
||||
const messages: ParsedMessage[] = []
|
||||
|
||||
// 当前正在解析的消息(可能跨多行)
|
||||
let currentMessage: {
|
||||
timestamp: number
|
||||
sender: string | null // null 表示系统消息
|
||||
contentLines: string[]
|
||||
} | null = null
|
||||
|
||||
// 保存当前消息
|
||||
const saveCurrentMessage = () => {
|
||||
if (currentMessage) {
|
||||
const content = currentMessage.contentLines.join('\n').trim()
|
||||
const type = detectMessageType(content)
|
||||
|
||||
// 系统消息使用特殊 ID 和统一名称
|
||||
const senderPlatformId = currentMessage.sender || 'system'
|
||||
const senderName = currentMessage.sender || '系统消息'
|
||||
|
||||
messages.push({
|
||||
senderPlatformId,
|
||||
senderAccountName: senderName,
|
||||
timestamp: currentMessage.timestamp,
|
||||
type,
|
||||
content: content || null,
|
||||
})
|
||||
|
||||
// 更新成员信息(跳过系统消息)
|
||||
if (currentMessage.sender) {
|
||||
memberMap.set(senderPlatformId, {
|
||||
platformId: senderPlatformId,
|
||||
nickname: senderName,
|
||||
})
|
||||
}
|
||||
|
||||
messagesProcessed++
|
||||
}
|
||||
}
|
||||
|
||||
// 逐行读取文件
|
||||
const fileStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
|
||||
const rl = readline.createInterface({
|
||||
input: fileStream,
|
||||
crlfDelay: Infinity,
|
||||
})
|
||||
|
||||
fileStream.on('data', (chunk: string | Buffer) => {
|
||||
bytesRead += typeof chunk === 'string' ? Buffer.byteLength(chunk) : chunk.length
|
||||
})
|
||||
|
||||
for await (const line of rl) {
|
||||
// 清理行首不可见字符
|
||||
const cleanedLine = cleanLine(line)
|
||||
|
||||
// 尝试匹配消息行
|
||||
const lineMatch = cleanedLine.match(MESSAGE_LINE_REGEX)
|
||||
if (lineMatch) {
|
||||
// 保存前一条消息
|
||||
saveCurrentMessage()
|
||||
|
||||
const timeStr = lineMatch[1]
|
||||
const restContent = lineMatch[2]
|
||||
|
||||
// 尝试分离发送者和内容
|
||||
const senderMatch = restContent.match(SENDER_CONTENT_REGEX)
|
||||
if (senderMatch && !isSystemMessage(restContent)) {
|
||||
// 普通消息
|
||||
currentMessage = {
|
||||
timestamp: parseWhatsAppTime(timeStr),
|
||||
sender: senderMatch[1].trim(),
|
||||
contentLines: [senderMatch[2]],
|
||||
}
|
||||
} else {
|
||||
// 系统消息
|
||||
currentMessage = {
|
||||
timestamp: parseWhatsAppTime(timeStr),
|
||||
sender: null,
|
||||
contentLines: [restContent],
|
||||
}
|
||||
}
|
||||
|
||||
// 更新进度
|
||||
if (messagesProcessed % 500 === 0) {
|
||||
const progress = createProgress(
|
||||
'parsing',
|
||||
bytesRead,
|
||||
totalBytes,
|
||||
messagesProcessed,
|
||||
`已处理 ${messagesProcessed} 条消息...`
|
||||
)
|
||||
onProgress?.(progress)
|
||||
}
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
// 非消息行:可能是多行消息的延续
|
||||
if (currentMessage && cleanedLine) {
|
||||
currentMessage.contentLines.push(cleanedLine)
|
||||
} else if (cleanedLine) {
|
||||
// 无法解析的非空行
|
||||
skippedLines++
|
||||
}
|
||||
}
|
||||
|
||||
// 保存最后一条消息
|
||||
saveCurrentMessage()
|
||||
|
||||
// 确定聊天类型:根据参与者数量判断
|
||||
// - 排除系统成员后,2 人或更少:私聊
|
||||
// - 超过 2 人:群聊
|
||||
const hasSystemMember = memberMap.has('system')
|
||||
const realMemberCount = hasSystemMember ? memberMap.size - 1 : memberMap.size
|
||||
|
||||
const chatType = realMemberCount > 2 ? ChatType.GROUP : ChatType.PRIVATE
|
||||
|
||||
// 发送 meta
|
||||
const meta: ParsedMeta = {
|
||||
name: chatName,
|
||||
platform: KNOWN_PLATFORMS.WHATSAPP,
|
||||
type: chatType,
|
||||
}
|
||||
yield { type: 'meta', data: meta }
|
||||
|
||||
// 发送成员
|
||||
const members: ParsedMember[] = Array.from(memberMap.values()).map((m) => ({
|
||||
platformId: m.platformId,
|
||||
accountName: m.nickname,
|
||||
}))
|
||||
yield { type: 'members', data: members }
|
||||
|
||||
// 分批发送消息
|
||||
for (let i = 0; i < messages.length; i += batchSize) {
|
||||
const batch = messages.slice(i, i + batchSize)
|
||||
yield { type: 'messages', data: batch }
|
||||
}
|
||||
|
||||
// 完成
|
||||
const doneProgress = createProgress('done', totalBytes, totalBytes, messagesProcessed, '解析完成')
|
||||
yield { type: 'progress', data: doneProgress }
|
||||
onProgress?.(doneProgress)
|
||||
|
||||
// 统计消息类型
|
||||
const typeCounts = new Map<MessageType, number>()
|
||||
for (const msg of messages) {
|
||||
typeCounts.set(msg.type, (typeCounts.get(msg.type) || 0) + 1)
|
||||
}
|
||||
|
||||
// 记录解析摘要
|
||||
onLog?.('info', `解析完成: ${messagesProcessed} 条消息, ${memberMap.size} 个成员, 类型: ${chatType}`)
|
||||
onLog?.(
|
||||
'info',
|
||||
`消息类型统计: ${Array.from(typeCounts.entries())
|
||||
.map(([type, count]) => `${type}=${count}`)
|
||||
.join(', ')}`
|
||||
)
|
||||
if (skippedLines > 0) {
|
||||
onLog?.('info', `跳过 ${skippedLines} 行无法解析的内容`)
|
||||
}
|
||||
|
||||
yield {
|
||||
type: 'done',
|
||||
data: { messageCount: messagesProcessed, memberCount: memberMap.size },
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== 导出解析器 ====================
|
||||
|
||||
export const parser_: Parser = {
|
||||
feature,
|
||||
parse: parseWhatsApp,
|
||||
}
|
||||
|
||||
// ==================== 导出格式模块 ====================
|
||||
|
||||
const module_: FormatModule = {
|
||||
feature,
|
||||
parser: parser_,
|
||||
// TXT 格式不需要预处理器
|
||||
}
|
||||
|
||||
export default module_
|
||||
@@ -379,6 +379,7 @@ export async function streamImport(filePath: string, requestId: string): Promise
|
||||
|
||||
onMeta: (meta: ParsedMeta) => {
|
||||
if (!metaInserted) {
|
||||
logInfo(`写入 meta: name=${meta.name}, type=${meta.type}`)
|
||||
insertMeta.run(
|
||||
meta.name,
|
||||
meta.platform,
|
||||
|
||||
Reference in New Issue
Block a user