feat: 支持WhatsApp原生格式消息

This commit is contained in:
digua
2025-12-24 00:06:20 +08:00
parent 616e7e38ea
commit 878d120fb0
6 changed files with 369 additions and 22 deletions
+11 -1
View File
@@ -12,6 +12,7 @@ import shuakamiQqExporter from './shuakami-qq-exporter'
import yccccccyEchotrace from './ycccccccy-echotrace'
import wechatDefault from './wechat-default'
import qqNativeTxt from './qq-native-txt'
import whatsappNativeTxt from './whatsapp-native-txt'
/**
* 所有支持的格式模块(按优先级排序)
@@ -22,8 +23,17 @@ export const formats: FormatModule[] = [
shuakamiQqExporter, // 优先级 10 - shuakami/qq-chat-exporter
yccccccyEchotrace, // 优先级 15 - ycccccccy/echotrace 微信导出
wechatDefault, // 优先级 20 - 微信数据库导出 JSON
whatsappNativeTxt, // 优先级 25 - WhatsApp 官方导出 TXT
qqNativeTxt, // 优先级 30 - QQ 官方导出 TXT
]
// 按名称导出,方便单独使用
export { chatlab, chatlabJsonl, shuakamiQqExporter, yccccccyEchotrace, wechatDefault, qqNativeTxt }
export {
chatlab,
chatlabJsonl,
shuakamiQqExporter,
yccccccyEchotrace,
wechatDefault,
qqNativeTxt,
whatsappNativeTxt,
}
@@ -0,0 +1,339 @@
/**
* WhatsApp 官方导出 TXT 格式解析器
* 适配 WhatsApp 聊天导出功能
*
* 格式特征:
* - 文件头:消息和通话已进行端到端加密
* - 消息格式:YYYY/MM/DD HH:MM - 昵称: 内容
* - 系统消息:YYYY/MM/DD HH:MM - 系统内容(无冒号分隔)
* - 媒体占位:<省略影音内容>
*/
import * as fs from 'fs'
import * as path from 'path'
import * as readline from 'readline'
import { KNOWN_PLATFORMS, ChatType, MessageType } from '../../../../src/types/base'
import type {
FormatFeature,
FormatModule,
Parser,
ParseOptions,
ParseEvent,
ParsedMeta,
ParsedMember,
ParsedMessage,
} from '../types'
import { getFileSize, createProgress } from '../utils'
// ==================== 辅助函数 ====================
/**
* 从文件名提取聊天名称
* 例如:与开心每一天的 WhatsApp 聊天.txt → 开心每一天
* 例如:与gaoberry37的 WhatsApp 聊天.txt → gaoberry37
*/
function extractNameFromFilePath(filePath: string): string {
const basename = path.basename(filePath)
// 匹配:与xxx的 WhatsApp 聊天.txt
const match = basename.match(/^与(.+?)的\s*WhatsApp\s*聊天\.txt$/i)
if (match) {
return match[1].trim()
}
// 兜底:移除扩展名
return basename.replace(/\.txt$/i, '') || '未知聊天'
}
// ==================== 特征定义 ====================
export const feature: FormatFeature = {
id: 'whatsapp-native-txt',
name: 'WhatsApp 官方导出 (TXT)',
platform: KNOWN_PLATFORMS.WHATSAPP,
priority: 25,
extensions: ['.txt'],
signatures: {
// WhatsApp 导出文件的特征
head: [/消息和通话已进行端到端加密/, /WhatsApp/i],
},
}
// ==================== 辅助函数:清理不可见字符 ====================
/**
* 清理行首/行尾的不可见 Unicode 字符
* WhatsApp 导出文件中可能包含 BOM、Left-to-Right Mark (U+200E) 等
*/
function cleanLine(line: string): string {
// 移除常见的不可见字符:BOM、LTR Mark、RTL Mark、零宽字符等
return line.replace(/^[\uFEFF\u200E\u200F\u200B\u200C\u200D\u2060]+/, '').trim()
}
// ==================== 消息头正则 ====================
// 匹配消息行:2025/12/22 12:35 - 地瓜: 内容
// 或系统消息:2025/12/22 12:35 - 系统消息内容
const MESSAGE_LINE_REGEX = /^(\d{4}\/\d{2}\/\d{2} \d{2}:\d{2}) - (.+)$/
// 从消息内容中分离昵称和实际内容
// 格式:昵称: 内容
const SENDER_CONTENT_REGEX = /^(.+?): (.*)$/
// ==================== 系统消息识别 ====================
const SYSTEM_MESSAGE_PATTERNS = [
/消息和通话已进行端到端加密/,
/创建了此群组/,
/加入群组/,
/添加了/,
/退出了群组/,
/移除了/,
/更改了本群组/,
/已将此群组的设置更改为/,
/这条消息已删除/,
]
function isSystemMessage(content: string): boolean {
return SYSTEM_MESSAGE_PATTERNS.some((pattern) => pattern.test(content))
}
// ==================== 消息类型判断 ====================
function detectMessageType(content: string): MessageType {
const trimmed = content.trim()
// 媒体消息
if (trimmed === '<省略影音内容>') return MessageType.IMAGE // 统一归类为图片
if (trimmed.includes('<已附加:') || trimmed.includes('<附件:')) return MessageType.FILE
// 删除消息
if (trimmed === '这条消息已删除') return MessageType.RECALL
// 系统消息
if (isSystemMessage(trimmed)) return MessageType.SYSTEM
return MessageType.TEXT
}
// ==================== 时间解析 ====================
/**
* 解析 WhatsApp 时间格式为秒级时间戳
* @param timeStr 格式:2025/12/22 12:35
*/
function parseWhatsAppTime(timeStr: string): number {
// 将 YYYY/MM/DD HH:MM 转换为 YYYY-MM-DDTHH:MM
const normalized = timeStr.replace(/\//g, '-').replace(' ', 'T') + ':00'
const date = new Date(normalized)
return Math.floor(date.getTime() / 1000)
}
// ==================== 成员信息 ====================
interface MemberInfo {
platformId: string
nickname: string
}
// ==================== 解析器实现 ====================
async function* parseWhatsApp(options: ParseOptions): AsyncGenerator<ParseEvent, void, unknown> {
const { filePath, batchSize = 5000, onProgress, onLog } = options
const totalBytes = getFileSize(filePath)
let bytesRead = 0
let messagesProcessed = 0
let skippedLines = 0
// 发送初始进度
const initialProgress = createProgress('parsing', 0, totalBytes, 0, '开始解析...')
yield { type: 'progress', data: initialProgress }
onProgress?.(initialProgress)
// 记录解析开始
onLog?.('info', `开始解析 WhatsApp TXT 文件,大小: ${(totalBytes / 1024 / 1024).toFixed(2)} MB`)
// 收集数据
const chatName = extractNameFromFilePath(filePath)
const memberMap = new Map<string, MemberInfo>()
const messages: ParsedMessage[] = []
// 当前正在解析的消息(可能跨多行)
let currentMessage: {
timestamp: number
sender: string | null // null 表示系统消息
contentLines: string[]
} | null = null
// 保存当前消息
const saveCurrentMessage = () => {
if (currentMessage) {
const content = currentMessage.contentLines.join('\n').trim()
const type = detectMessageType(content)
// 系统消息使用特殊 ID 和统一名称
const senderPlatformId = currentMessage.sender || 'system'
const senderName = currentMessage.sender || '系统消息'
messages.push({
senderPlatformId,
senderAccountName: senderName,
timestamp: currentMessage.timestamp,
type,
content: content || null,
})
// 更新成员信息(跳过系统消息)
if (currentMessage.sender) {
memberMap.set(senderPlatformId, {
platformId: senderPlatformId,
nickname: senderName,
})
}
messagesProcessed++
}
}
// 逐行读取文件
const fileStream = fs.createReadStream(filePath, { encoding: 'utf-8' })
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity,
})
fileStream.on('data', (chunk: string | Buffer) => {
bytesRead += typeof chunk === 'string' ? Buffer.byteLength(chunk) : chunk.length
})
for await (const line of rl) {
// 清理行首不可见字符
const cleanedLine = cleanLine(line)
// 尝试匹配消息行
const lineMatch = cleanedLine.match(MESSAGE_LINE_REGEX)
if (lineMatch) {
// 保存前一条消息
saveCurrentMessage()
const timeStr = lineMatch[1]
const restContent = lineMatch[2]
// 尝试分离发送者和内容
const senderMatch = restContent.match(SENDER_CONTENT_REGEX)
if (senderMatch && !isSystemMessage(restContent)) {
// 普通消息
currentMessage = {
timestamp: parseWhatsAppTime(timeStr),
sender: senderMatch[1].trim(),
contentLines: [senderMatch[2]],
}
} else {
// 系统消息
currentMessage = {
timestamp: parseWhatsAppTime(timeStr),
sender: null,
contentLines: [restContent],
}
}
// 更新进度
if (messagesProcessed % 500 === 0) {
const progress = createProgress(
'parsing',
bytesRead,
totalBytes,
messagesProcessed,
`已处理 ${messagesProcessed} 条消息...`
)
onProgress?.(progress)
}
continue
}
// 非消息行:可能是多行消息的延续
if (currentMessage && cleanedLine) {
currentMessage.contentLines.push(cleanedLine)
} else if (cleanedLine) {
// 无法解析的非空行
skippedLines++
}
}
// 保存最后一条消息
saveCurrentMessage()
// 确定聊天类型:根据参与者数量判断
// - 排除系统成员后,2 人或更少:私聊
// - 超过 2 人:群聊
const hasSystemMember = memberMap.has('system')
const realMemberCount = hasSystemMember ? memberMap.size - 1 : memberMap.size
const chatType = realMemberCount > 2 ? ChatType.GROUP : ChatType.PRIVATE
// 发送 meta
const meta: ParsedMeta = {
name: chatName,
platform: KNOWN_PLATFORMS.WHATSAPP,
type: chatType,
}
yield { type: 'meta', data: meta }
// 发送成员
const members: ParsedMember[] = Array.from(memberMap.values()).map((m) => ({
platformId: m.platformId,
accountName: m.nickname,
}))
yield { type: 'members', data: members }
// 分批发送消息
for (let i = 0; i < messages.length; i += batchSize) {
const batch = messages.slice(i, i + batchSize)
yield { type: 'messages', data: batch }
}
// 完成
const doneProgress = createProgress('done', totalBytes, totalBytes, messagesProcessed, '解析完成')
yield { type: 'progress', data: doneProgress }
onProgress?.(doneProgress)
// 统计消息类型
const typeCounts = new Map<MessageType, number>()
for (const msg of messages) {
typeCounts.set(msg.type, (typeCounts.get(msg.type) || 0) + 1)
}
// 记录解析摘要
onLog?.('info', `解析完成: ${messagesProcessed} 条消息, ${memberMap.size} 个成员, 类型: ${chatType}`)
onLog?.(
'info',
`消息类型统计: ${Array.from(typeCounts.entries())
.map(([type, count]) => `${type}=${count}`)
.join(', ')}`
)
if (skippedLines > 0) {
onLog?.('info', `跳过 ${skippedLines} 行无法解析的内容`)
}
yield {
type: 'done',
data: { messageCount: messagesProcessed, memberCount: memberMap.size },
}
}
// ==================== 导出解析器 ====================
export const parser_: Parser = {
feature,
parse: parseWhatsApp,
}
// ==================== 导出格式模块 ====================
const module_: FormatModule = {
feature,
parser: parser_,
// TXT 格式不需要预处理器
}
export default module_
@@ -379,6 +379,7 @@ export async function streamImport(filePath: string, requestId: string): Promise
onMeta: (meta: ParsedMeta) => {
if (!metaInserted) {
logInfo(`写入 meta: name=${meta.name}, type=${meta.type}`)
insertMeta.run(
meta.name,
meta.platform,