# Patch summary (explanatory preamble; not part of any hunk).
#
# Purpose: replace the length-based message de-duplication key
#   `${timestamp}_${senderPlatformId}_${contentLength}`
# with a fixed-length SHA-256 digest (base64url-encoded) in two places:
#   - electron/main/merger/index.ts         -> getMessageKey(msg)
#   - electron/main/worker/import/tempDb.ts -> generateMessageKey(timestamp, senderPlatformId, content)
#
# Key layout fed to the hash, in order, each field followed by a '\0' separator:
#   String(timestamp), senderPlatformId, the tag 'null' or 'text',
#   and then the content itself only when it is non-null.
# Content is normalized with `content || null` first, so '' and null hash to the
# same key. The patch's own comment says this mirrors the storage layer, which
# folds an empty string to null.
#
# Why (per the comments the patch adds): content.length alone treats different
# messages of equal length as duplicates, while using the raw text as the key
# would make the de-duplication Set keep large strings alive.
#
# Tests: adds electron/main/worker/import/tempDb.test.ts (node:test with
# node:assert/strict) with two cases: '' and null produce the same key, and two
# different three-character contents with the same timestamp and sender produce
# different keys.
#
# NOTE(review): getMessageKey re-implements the hashing steps inline instead of
#   calling generateMessageKey, so the two copies must be kept in sync by hand.
#   Consider sharing one helper if the module boundaries allow it - TODO confirm.
# NOTE(review): hash.update() is passed senderPlatformId directly. The previous
#   template literal accepted any value; confirm the id is always a string at
#   runtime, or wrap it in String(...) the way timestamp is.
# NOTE(review): this copy of the patch appears whitespace-collapsed (line breaks,
#   indentation and blank context lines are missing) and `Promise {` in a hunk
#   header looks like it lost its type argument. Re-export the patch from the
#   repository before trying to apply it.
#
diff --git a/electron/main/merger/index.ts b/electron/main/merger/index.ts index 0524bed..8ac11d9 100644 --- a/electron/main/merger/index.ts +++ b/electron/main/merger/index.ts @@ -5,6 +5,7 @@ import * as fs from 'fs' import * as path from 'path' +import { createHash } from 'crypto' import { parseFileSync, detectFormat } from '../parser' import { importData } from '../database/core' import { TempDbReader } from './tempCache' @@ -73,7 +74,19 @@ export async function parseFileInfo(filePath: string): Promise { * 生成消息的唯一标识(用于去重和冲突检测) */ function getMessageKey(msg: ParsedMessage): string { - return `${msg.timestamp}_${msg.senderPlatformId}_${(msg.content || '').length}` + // 合并链路的去重语义需要和增量导入保持一致,否则两条链路会对重复消息得出不同结论。 + const normalizedContent = msg.content || null + const hash = createHash('sha256') + hash.update(String(msg.timestamp)) + hash.update('\0') + hash.update(msg.senderPlatformId) + hash.update('\0') + hash.update(normalizedContent === null ? 'null' : 'text') + hash.update('\0') + if (normalizedContent !== null) { + hash.update(normalizedContent) + } + return hash.digest('base64url') } function getParsedMessageDisplayName(msg: ParsedMessage): string { diff --git a/electron/main/worker/import/tempDb.test.ts b/electron/main/worker/import/tempDb.test.ts new file mode 100644 index 0000000..f18d36e --- /dev/null +++ b/electron/main/worker/import/tempDb.test.ts @@ -0,0 +1,25 @@ +import assert from 'node:assert/strict' +import test from 'node:test' + +import { generateMessageKey } from './tempDb' + +test('空字符串内容在写库归一化前后应生成同一个去重 key', () => { + const timestamp = 1710000000 + const senderPlatformId = 'user-1' + const parsedContent = '' + + const keyBeforePersist = generateMessageKey(timestamp, senderPlatformId, parsedContent) + const keyAfterPersist = generateMessageKey(timestamp, senderPlatformId, parsedContent || null) + + assert.equal(keyAfterPersist, keyBeforePersist) +}) + +test('同秒同发送者但等长不同内容应生成不同去重 key', () => { + const timestamp = 1710000000 + const 
senderPlatformId = 'user-1' + + const firstKey = generateMessageKey(timestamp, senderPlatformId, '你好啊') + const secondKey = generateMessageKey(timestamp, senderPlatformId, '再见呀') + + assert.notEqual(firstKey, secondKey) +}) diff --git a/electron/main/worker/import/tempDb.ts b/electron/main/worker/import/tempDb.ts index ea9c3b8..03d01e1 100644 --- a/electron/main/worker/import/tempDb.ts +++ b/electron/main/worker/import/tempDb.ts @@ -4,6 +4,7 @@ */ import Database from 'better-sqlite3' +import { createHash } from 'crypto' import * as fs from 'fs' import * as path from 'path' @@ -84,9 +85,21 @@ export function cleanupTempDatabase(dbPath: string): void { /** * 生成消息去重键 - * 使用 timestamp + senderPlatformId + contentLength 作为去重标识 + * 使用固定长度哈希作为去重标识。 + * 直接用 content.length 会误判等长不同内容,直接用原文又会让 Set 长期持有大文本。 */ export function generateMessageKey(timestamp: number, senderPlatformId: string, content: string | null): string { - const contentLength = content ? content.length : 0 - return `${timestamp}_${senderPlatformId}_${contentLength}` + // 去重 key 需要和当前写库语义一致,空字符串在存储层会被折叠成 null。 + const normalizedContent = content || null + const hash = createHash('sha256') + hash.update(String(timestamp)) + hash.update('\0') + hash.update(senderPlatformId) + hash.update('\0') + hash.update(normalizedContent === null ? 'null' : 'text') + hash.update('\0') + if (normalizedContent !== null) { + hash.update(normalizedContent) + } + return hash.digest('base64url') }