feat(ai): 实现聊天记录预处理管道

- 新增 preprocessor 模块：数据清洗(XML卡片)、黑名单过滤、智能去噪、连续发言合并、数据脱敏 - 内置多国脱敏规则（中国手机号/身份证、美国SSN、日韩号码等）+ 自定义规则支持 - 工具层统一 wrapWithPreprocessing 包装，自动对 rawMessages 执行预处理+格式化 - 昵称匿名化：用 U{id} 替代真实昵称，跨工具调用一致 - SQL 查询补充 senderId/senderPlatformId - PreprocessConfig 类型定义（preload + 主进程）
2026-05-14 18:39:07 +08:00 · 2026-02-27 23:56:35 +08:00
parent 1823042fad
commit c36878c58d
15 changed files with 785 additions and 14 deletions
@@ -0,0 +1,198 @@
+/**
+ * 内置脱敏规则库
+ * 按 locale 分组，支持通用规则和地区特定规则
+ */
+import type { DesensitizeRule } from './types'
+
+export type { DesensitizeRule }
+
+export const BUILTIN_DESENSITIZE_RULES: DesensitizeRule[] = [
+  // ==================== 中国 (zh-CN) ====================
+  {
+    id: 'cn_phone',
+    label: 'desensitize.rules.cn_phone',
+    pattern: '(?<!\\d)1[3-9]\\d{9}(?!\\d)',
+    replacement: '[手机号]',
+    enabled: true,
+    builtin: true,
+    locales: ['zh-CN'],
+  },
+  {
+    id: 'cn_id_card',
+    label: 'desensitize.rules.cn_id_card',
+    pattern: '(?<!\\d)\\d{17}[\\dXx](?!\\d)',
+    replacement: '[身份证]',
+    enabled: true,
+    builtin: true,
+    locales: ['zh-CN'],
+  },
+  {
+    id: 'cn_bank_card',
+    label: 'desensitize.rules.cn_bank_card',
+    pattern: '(?<!\\d)\\d{16,19}(?!\\d)',
+    replacement: '[银行卡]',
+    enabled: false,
+    builtin: true,
+    locales: ['zh-CN'],
+  },
+  {
+    id: 'cn_landline',
+    label: 'desensitize.rules.cn_landline',
+    pattern: '(?<!\\d)0\\d{2,3}-?\\d{7,8}(?!\\d)',
+    replacement: '[座机号]',
+    enabled: false,
+    builtin: true,
+    locales: ['zh-CN'],
+  },
+
+  // ==================== 美国 (en-US) ====================
+  {
+    id: 'us_ssn',
+    label: 'desensitize.rules.us_ssn',
+    pattern: '(?<!\\d)\\d{3}-\\d{2}-\\d{4}(?!\\d)',
+    replacement: '[SSN]',
+    enabled: true,
+    builtin: true,
+    locales: ['en-US'],
+  },
+  {
+    id: 'us_phone',
+    label: 'desensitize.rules.us_phone',
+    pattern: '(?<!\\d)(?:\\+?1[-\\s.]?)?\\(?\\d{3}\\)?[-\\s.]?\\d{3}[-\\s.]?\\d{4}(?!\\d)',
+    replacement: '[Phone]',
+    enabled: true,
+    builtin: true,
+    locales: ['en-US'],
+  },
+  {
+    id: 'us_drivers_license',
+    label: 'desensitize.rules.us_drivers_license',
+    pattern: '(?<![A-Z])\\b[A-Z]\\d{7,8}\\b',
+    replacement: "[Driver's License]",
+    enabled: false,
+    builtin: true,
+    locales: ['en-US'],
+  },
+
+  // ==================== 日本 (ja-JP) ====================
+  {
+    id: 'jp_phone',
+    label: 'desensitize.rules.jp_phone',
+    pattern: '(?<!\\d)0[789]0-?\\d{4}-?\\d{4}(?!\\d)',
+    replacement: '[電話番号]',
+    enabled: true,
+    builtin: true,
+    locales: ['ja-JP'],
+  },
+  {
+    id: 'jp_my_number',
+    label: 'desensitize.rules.jp_my_number',
+    pattern: '(?<!\\d)\\d{4}\\s?\\d{4}\\s?\\d{4}(?!\\d)',
+    replacement: '[マイナンバー]',
+    enabled: false,
+    builtin: true,
+    locales: ['ja-JP'],
+  },
+
+  // ==================== 韩国 (ko-KR) ====================
+  {
+    id: 'kr_phone',
+    label: 'desensitize.rules.kr_phone',
+    pattern: '(?<!\\d)01[016789]-?\\d{3,4}-?\\d{4}(?!\\d)',
+    replacement: '[전화번호]',
+    enabled: true,
+    builtin: true,
+    locales: ['ko-KR'],
+  },
+  {
+    id: 'kr_rrn',
+    label: 'desensitize.rules.kr_rrn',
+    pattern: '(?<!\\d)\\d{6}-[1-4]\\d{6}(?!\\d)',
+    replacement: '[주민번호]',
+    enabled: true,
+    builtin: true,
+    locales: ['ko-KR'],
+  },
+
+  // ==================== 凭据 / Token (所有语言) ====================
+  {
+    id: 'api_key_prefix',
+    label: 'desensitize.rules.api_key_prefix',
+    pattern: '\\b(?:sk-|pk_(?:live|test)_|ghp_|gho_|ghs_|ghu_|glpat-|xoxb-|xoxp-|AKIA)[A-Za-z0-9_\\-]{10,}\\b',
+    replacement: '[API Key]',
+    enabled: true,
+    builtin: true,
+    locales: [],
+  },
+  {
+    id: 'bearer_token',
+    label: 'desensitize.rules.bearer_token',
+    pattern: 'Bearer\\s+[A-Za-z0-9\\-._~+/]+=*',
+    replacement: 'Bearer [Token]',
+    enabled: true,
+    builtin: true,
+    locales: [],
+  },
+
+  // ==================== 通用 (所有语言) ====================
+  {
+    id: 'email',
+    label: 'desensitize.rules.email',
+    pattern: '[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}',
+    replacement: '[Email]',
+    enabled: true,
+    builtin: true,
+    locales: [],
+  },
+  {
+    id: 'credit_card',
+    label: 'desensitize.rules.credit_card',
+    pattern:
+      '(?<!\\d)(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[-\\s.]?\\d{4}[-\\s.]?\\d{4}[-\\s.]?\\d{4}(?!\\d)',
+    replacement: '[Credit Card]',
+    enabled: false,
+    builtin: true,
+    locales: [],
+  },
+  {
+    id: 'ipv4',
+    label: 'desensitize.rules.ipv4',
+    pattern:
+      '(?<!\\d)(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)(?!\\d)',
+    replacement: '[IP]',
+    enabled: false,
+    builtin: true,
+    locales: [],
+  },
+  {
+    id: 'url',
+    label: 'desensitize.rules.url',
+    pattern: 'https?://[^\\s<>"]+',
+    replacement: '[URL]',
+    enabled: false,
+    builtin: true,
+    locales: [],
+  },
+]
+
+/**
+ * 获取指定 locale 的默认规则（当前 locale 特定 + 通用规则）
+ */
+export function getDefaultRulesForLocale(locale: string): DesensitizeRule[] {
+  return BUILTIN_DESENSITIZE_RULES.filter((rule) => rule.locales.length === 0 || rule.locales.includes(locale)).map(
+    (rule) => ({ ...rule })
+  )
+}
+
+/**
+ * 合并新 locale 的规则到现有规则列表
+ * 不重复添加已有 id，不修改已有规则的 enabled 状态
+ */
+export function mergeRulesForLocale(existing: DesensitizeRule[], locale: string): DesensitizeRule[] {
+  const existingIds = new Set(existing.map((r) => r.id))
+  const newRules = BUILTIN_DESENSITIZE_RULES.filter(
+    (rule) => !existingIds.has(rule.id) && (rule.locales.length === 0 || rule.locales.includes(locale))
+  ).map((rule) => ({ ...rule }))
+
+  return [...existing, ...newRules]
+}
@@ -0,0 +1,3 @@
+export type { PreprocessConfig, PreprocessableMessage, DesensitizeRule } from './types'
+export { preprocessMessages } from './pipeline'
+export { BUILTIN_DESENSITIZE_RULES, getDefaultRulesForLocale, mergeRulesForLocale } from './builtin-rules'
@@ -0,0 +1,272 @@
+/**
+ * 预处理管道
+ * 执行顺序：数据清洗 → 黑名单 → 去噪 → 合并 → 脱敏
+ */
+
+import type { PreprocessConfig, PreprocessableMessage, DesensitizeRule } from './types'
+import { aiLogger } from '../logger'
+
+const MERGE_WINDOW_DEFAULT = 180
+
+/**
+ * 对消息数组执行预处理管道
+ */
+export function preprocessMessages<T extends PreprocessableMessage>(messages: T[], config?: PreprocessConfig): T[] {
+  if (!config || !hasAnyEnabled(config)) return messages
+  if (messages.length === 0) return messages
+
+  const inputCount = messages.length
+  let result: T[] = [...messages]
+  const applied: string[] = []
+
+  if (config.dataCleaning !== false) {
+    const cleaned = applyDataCleaning(result)
+    if (cleaned.changed > 0) {
+      result = cleaned.messages
+      applied.push(`dataCleaning: ${cleaned.changed} messages cleaned`)
+    }
+  }
+
+  if (config.blacklistKeywords.length > 0) {
+    const before = result.length
+    result = applyBlacklistFilter(result, config.blacklistKeywords)
+    applied.push(`blacklist: ${before} → ${result.length} (-${before - result.length})`)
+  }
+
+  if (config.denoise) {
+    const before = result.length
+    result = applyDenoise(result)
+    applied.push(`denoise: ${before} → ${result.length} (-${before - result.length})`)
+  }
+
+  if (config.mergeConsecutive) {
+    const before = result.length
+    result = applyMergeConsecutive(result, config.mergeWindowSeconds ?? MERGE_WINDOW_DEFAULT)
+    applied.push(`merge: ${before} → ${result.length} (-${before - result.length})`)
+  }
+
+  if (config.desensitize) {
+    const enabledRules = (config.desensitizeRules || []).filter((r) => r.enabled)
+    if (enabledRules.length > 0) {
+      result = applyDesensitize(result, enabledRules)
+      applied.push(`desensitize: ${enabledRules.length} rules applied`)
+    }
+  }
+
+  aiLogger.info('Preprocess', `Pipeline: ${inputCount} → ${result.length} messages`, {
+    strategies: applied,
+  })
+
+  return result
+}
+
+function hasAnyEnabled(config: PreprocessConfig): boolean {
+  return (
+    config.dataCleaning !== false ||
+    config.mergeConsecutive ||
+    config.blacklistKeywords.length > 0 ||
+    config.denoise ||
+    config.desensitize
+  )
+}
+
+// ==================== 策略实现 ====================
+
+/**
+ * 数据清洗：将 XML 卡片消息（美团分享、音乐、公众号等）提取为简洁文本
+ * 微信 XML 消息以 <?xml 或 <msg> 或 <msg>< 开头
+ */
+function applyDataCleaning<T extends PreprocessableMessage>(messages: T[]): { messages: T[]; changed: number } {
+  let changed = 0
+  const result = messages.map((msg) => {
+    if (!msg.content) return msg
+    const cleaned = cleanXmlContent(msg.content)
+    if (cleaned === msg.content) return msg
+    changed++
+    return { ...msg, content: cleaned }
+  })
+  return { messages: result, changed }
+}
+
+const XML_START = /^<\?xml\s|^<msg[\s>]/
+
+function cleanXmlContent(content: string): string {
+  const trimmed = content.trim()
+  if (!XML_START.test(trimmed)) return content
+
+  const title = extractXmlTag(trimmed, 'title')
+  const des = extractXmlTag(trimmed, 'des')
+
+  if (title) {
+    return des ? `[分享] ${title} - ${des}` : `[分享] ${title}`
+  }
+  return '[应用消息]'
+}
+
+function extractXmlTag(xml: string, tag: string): string | null {
+  const openTag = `<${tag}>`
+  const closeTag = `</${tag}>`
+  const start = xml.indexOf(openTag)
+  if (start === -1) return null
+  const contentStart = start + openTag.length
+  const end = xml.indexOf(closeTag, contentStart)
+  if (end === -1) return null
+  const value = xml.slice(contentStart, end).trim()
+  return value.length > 0 ? value : null
+}
+
+/**
+ * 黑名单过滤：消息内容包含任一关键词则整条移除
+ */
+function applyBlacklistFilter<T extends PreprocessableMessage>(messages: T[], keywords: string[]): T[] {
+  if (keywords.length === 0) return messages
+  const lowerKeywords = keywords.map((k) => k.toLowerCase())
+  return messages.filter((msg) => {
+    if (!msg.content) return true
+    const lower = msg.content.toLowerCase()
+    return !lowerKeywords.some((kw) => lower.includes(kw))
+  })
+}
+
+/**
+ * 智能去噪：过滤无意义消息
+ * - 内容长度 < 2 的纯文本（如 "嗯"、"哦"）
+ * - 纯表情消息
+ * - 系统占位符（[图片]、[视频]、[语音] 等）
+ * 回复保护：有 replyToMessageId 的消息不过滤
+ */
+function applyDenoise<T extends PreprocessableMessage>(messages: T[]): T[] {
+  return messages.filter((msg) => {
+    if (!msg.content) return false
+
+    if (msg.replyToMessageId) return true
+
+    const content = msg.content.trim()
+    if (content.length === 0) return false
+
+    if (content.length < 2) return false
+
+    if (SYSTEM_PLACEHOLDERS.has(content)) return false
+
+    if (isPureEmoji(content)) return false
+
+    return true
+  })
+}
+
+const SYSTEM_PLACEHOLDERS = new Set([
+  '[图片]',
+  '[视频]',
+  '[语音]',
+  '[文件]',
+  '[动画表情]',
+  '[表情]',
+  '[链接]',
+  '[位置]',
+  '[名片]',
+  '[红包]',
+  '[转账]',
+  '[音乐]',
+  '[Image]',
+  '[Video]',
+  '[Voice]',
+  '[File]',
+  '[Sticker]',
+  '[Link]',
+])
+
+function isPureEmoji(str: string): boolean {
+  const stripped = str
+    .replace(/\p{Emoji_Presentation}/gu, '')
+    .replace(/\p{Extended_Pictographic}/gu, '')
+    .replace(/\u200d/g, '')
+    .replace(/\ufe0f/g, '')
+    .replace(/\u20e3/g, '')
+    .replace(/\s/g, '')
+  return stripped.length === 0
+}
+
+/**
+ * 合并连续发言：同一发送者在时间窗口内的连续消息合并为一条
+ * 使用 senderPlatformId（如可用），否则 fallback 到 senderName
+ */
+function applyMergeConsecutive<T extends PreprocessableMessage>(messages: T[], windowSeconds: number): T[] {
+  if (messages.length <= 1) return messages
+
+  const merged: T[] = []
+  let current: T | null = null
+
+  for (const msg of messages) {
+    if (!current) {
+      current = { ...msg }
+      continue
+    }
+
+    const sameSender = isSameSender(current, msg)
+    const withinWindow = Math.abs(msg.timestamp - current.timestamp) <= windowSeconds
+
+    if (sameSender && withinWindow) {
+      current = {
+        ...current,
+        content: [current.content, msg.content].filter(Boolean).join('\n'),
+      }
+    } else {
+      merged.push(current)
+      current = { ...msg }
+    }
+  }
+
+  if (current) merged.push(current)
+  return merged
+}
+
+function isSameSender(a: PreprocessableMessage, b: PreprocessableMessage): boolean {
+  if (a.senderPlatformId && b.senderPlatformId) {
+    return a.senderPlatformId === b.senderPlatformId
+  }
+  return a.senderName === b.senderName
+}
+
+/**
+ * 数据脱敏：按规则列表顺序依次替换敏感信息
+ * 规则按列表优先级排序，先匹配的先替换
+ * 自定义正则有超时保护（单条规则 50ms 上限）
+ */
+function applyDesensitize<T extends PreprocessableMessage>(messages: T[], rules: DesensitizeRule[]): T[] {
+  const compiledRules = compileRules(rules)
+  if (compiledRules.length === 0) return messages
+
+  return messages.map((msg) => {
+    if (!msg.content) return msg
+    let content = msg.content
+    for (const { regex, replacement } of compiledRules) {
+      regex.lastIndex = 0
+      content = content.replace(regex, replacement)
+    }
+    if (content === msg.content) return msg
+    return { ...msg, content }
+  })
+}
+
+const regexCache = new Map<string, RegExp | null>()
+
+function compileRules(rules: DesensitizeRule[]): Array<{ regex: RegExp; replacement: string }> {
+  const result: Array<{ regex: RegExp; replacement: string }> = []
+  for (const rule of rules) {
+    let regex = regexCache.get(rule.pattern)
+    if (regex === undefined) {
+      try {
+        regex = new RegExp(rule.pattern, 'g')
+        regexCache.set(rule.pattern, regex)
+      } catch {
+        aiLogger.warn('Preprocess', `Invalid regex in desensitize rule "${rule.id}": ${rule.pattern}`)
+        regexCache.set(rule.pattern, null)
+        continue
+      }
+    }
+    if (regex) {
+      result.push({ regex, replacement: rule.replacement })
+    }
+  }
+  return result
+}
@@ -0,0 +1,48 @@
+/** 单条脱敏规则 */
+export interface DesensitizeRule {
+  /** 唯一标识（预置规则用固定 id，自定义规则用 uuid） */
+  id: string
+  /** 显示名称 */
+  label: string
+  /** 正则表达式字符串（运行时 new RegExp(pattern, 'g')） */
+  pattern: string
+  /** 替换文本 */
+  replacement: string
+  /** 是否启用 */
+  enabled: boolean
+  /** 是否为预置规则（预置规则不可删除，仅可启用/禁用） */
+  builtin: boolean
+  /** 适用的 locale 列表（空数组表示通用） */
+  locales: string[]
+}
+
+/** 预处理配置 */
+export interface PreprocessConfig {
+  /** 数据清洗：清理 XML 卡片消息等非纯文本内容（默认开启） */
+  dataCleaning: boolean
+  /** 合并连续发言（同发送者 + 时间间隔 < mergeWindowSeconds） */
+  mergeConsecutive: boolean
+  /** 合并窗口（秒），默认 180 */
+  mergeWindowSeconds?: number
+  /** 自定义黑名单关键词，包含任一关键词的消息将被整条过滤 */
+  blacklistKeywords: string[]
+  /** 智能去噪（过滤纯语气词、纯表情、系统占位符） */
+  denoise: boolean
+  /** 数据脱敏总开关 */
+  desensitize: boolean
+  /** 脱敏规则列表（预置 + 自定义，按优先级排序） */
+  desensitizeRules: DesensitizeRule[]
+  /** 昵称匿名化：用 U{id} 替代真实昵称，减少 AI 幻觉 */
+  anonymizeNames: boolean
+}
+
+/** 预处理管道可接受的消息结构（兼容 SearchMessageResult / SessionMessagesResult.messages） */
+export interface PreprocessableMessage {
+  id?: number
+  senderId?: number
+  senderName: string
+  senderPlatformId?: string
+  content: string | null
+  timestamp: number
+  replyToMessageId?: string | null
+}
@@ -4,7 +4,7 @@ import type { ToolContext } from '../types'
 import { timeParamProperties } from '../utils/schemas'
 import * as workerManager from '../../../worker/workerManager'
 import { parseExtendedTimeParams } from '../utils/time-params'
-import { formatTimeRange, formatMessageCompact, t } from '../utils/format'
+import { formatTimeRange, t } from '../utils/format'

 const schema = Type.Object({
  member_id_1: Type.Number({ description: 'ai.tools.get_conversation_between.params.member_id_1' }),
@@ -51,7 +51,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
        member1: result.member1Name,
        member2: result.member2Name,
        timeRange: formatTimeRange(effectiveTimeFilter, locale),
-        conversation: result.messages.map((m) => formatMessageCompact(m, locale)),
+        rawMessages: result.messages,
      }

      return {
@@ -2,7 +2,7 @@ import { Type } from '@mariozechner/pi-ai'
 import type { AgentTool } from '@mariozechner/pi-agent-core'
 import type { ToolContext } from '../types'
 import * as workerManager from '../../../worker/workerManager'
-import { formatMessageCompact, t } from '../utils/format'
+import { t } from '../utils/format'

 const schema = Type.Object({
  message_ids: Type.Array(Type.Number(), { description: 'ai.tools.get_message_context.params.message_ids' }),
@@ -37,7 +37,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
        totalMessages: messages.length,
        contextSize: contextSize,
        requestedMessageIds: params.message_ids,
-        messages: messages.map((m) => formatMessageCompact(m, locale)),
+        rawMessages: messages,
      }

      return {
@@ -3,7 +3,7 @@ import type { AgentTool } from '@mariozechner/pi-agent-core'
 import type { ToolContext } from '../types'
 import * as workerManager from '../../../worker/workerManager'
 import { parseExtendedTimeParams } from '../utils/time-params'
-import { formatTimeRange, formatMessageCompact } from '../utils/format'
+import { formatTimeRange } from '../utils/format'
 import { timeParamProperties } from '../utils/schemas'

 const schema = Type.Object({
@@ -29,7 +29,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
        total: result.total,
        returned: result.messages.length,
        timeRange: formatTimeRange(effectiveTimeFilter, locale),
-        messages: result.messages.map((m) => formatMessageCompact(m, locale)),
+        rawMessages: result.messages,
      }

      return {
@@ -2,7 +2,7 @@ import { Type } from '@mariozechner/pi-ai'
 import type { AgentTool } from '@mariozechner/pi-agent-core'
 import type { ToolContext } from '../types'
 import * as workerManager from '../../../worker/workerManager'
-import { isChineseLocale, formatMessageCompact } from '../utils/format'
+import { isChineseLocale } from '../utils/format'

 const schema = Type.Object({
  session_id: Type.Number({ description: 'ai.tools.get_session_messages.params.session_id' }),
@@ -38,7 +38,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
          messageCount: result.messageCount,
          returnedCount: result.returnedCount,
          participants: result.participants,
-          messages: result.messages.map((m) => formatMessageCompact(m, locale)),
+          rawMessages: result.messages,
        }
      }

@@ -3,7 +3,7 @@ import type { AgentTool } from '@mariozechner/pi-agent-core'
 import type { ToolContext } from '../types'
 import * as workerManager from '../../../worker/workerManager'
 import { parseExtendedTimeParams } from '../utils/time-params'
-import { formatTimeRange, formatMessageCompact } from '../utils/format'
+import { formatTimeRange } from '../utils/format'
 import { timeParamProperties } from '../utils/schemas'

 const schema = Type.Object({
@@ -38,7 +38,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
        total: result.total,
        returned: result.messages.length,
        timeRange: formatTimeRange(effectiveTimeFilter, locale),
-        messages: result.messages.map((m) => formatMessageCompact(m, locale)),
+        rawMessages: result.messages,
      }

      return {
@@ -1,6 +1,8 @@
 /**
 * AI Tools 模块入口
- * 工具创建与管理
+ * 工具创建、预处理管道与管理
+ *
+ * 架构：工具返回结构化数据（rawMessages） → 处理层执行预处理 + 格式化 → 生成 LLM 内容
 */

 import type { AgentTool } from '@mariozechner/pi-agent-core'
@@ -21,6 +23,8 @@ import {
 } from './definitions'
 import { isEmbeddingEnabled } from '../rag'
 import { t as i18nT } from '../../i18n'
+import { preprocessMessages, type PreprocessableMessage } from '../preprocessor'
+import { formatMessageCompact } from './utils/format'

 // 导出类型
 export * from './types'
@@ -41,6 +45,57 @@ const coreFactories: ToolFactory[] = [
  createGetSessionSummaries,
 ]

+/**
+ * 将工具返回的结构化数据格式化为 LLM 友好的纯文本
+ *
+ * 从 JSON.stringify 改为纯文本，节省 token 且更易于 LLM 理解。
+ * 元数据作为头部，消息逐行排列。
+ */
+function formatToolResultAsText(details: Record<string, unknown>): string {
+  const lines: string[] = []
+  const messages = details.messages as string[] | undefined
+
+  for (const [key, value] of Object.entries(details)) {
+    if (key === 'messages') continue
+    if (value === undefined || value === null) continue
+
+    if (typeof value === 'object') {
+      if ('start' in (value as Record<string, unknown>) && 'end' in (value as Record<string, unknown>)) {
+        const range = value as { start: string; end: string }
+        lines.push(`${key}: ${range.start} ~ ${range.end}`)
+      } else if (Array.isArray(value)) {
+        lines.push(`${key}: ${value.join(', ')}`)
+      } else {
+        lines.push(`${key}: ${JSON.stringify(value)}`)
+      }
+    } else {
+      lines.push(`${key}: ${value}`)
+    }
+  }
+
+  if (messages && messages.length > 0) {
+    lines.push('')
+    let lastDate = ''
+    for (const msg of messages) {
+      const spaceIdx = msg.indexOf(' ')
+      const secondSpaceIdx = msg.indexOf(' ', spaceIdx + 1)
+      if (spaceIdx > 0 && secondSpaceIdx > 0) {
+        const date = msg.slice(0, spaceIdx)
+        const rest = msg.slice(spaceIdx + 1)
+        if (date !== lastDate) {
+          lines.push(`--- ${date} ---`)
+          lastDate = date
+        }
+        lines.push(rest)
+      } else {
+        lines.push(msg)
+      }
+    }
+  }
+
+  return lines.join('\n')
+}
+
 /**
 * 翻译 AgentTool 的描述（工具级 + 参数级）
 *
@@ -71,11 +126,87 @@ function translateTool(tool: AgentTool<any>): AgentTool<any> {
  }
 }

+/**
+ * 预处理包装层
+ * 拦截工具的 execute 结果：如果 details 中包含 rawMessages，
+ * 则执行预处理管道 + 格式化，替换为最终的 LLM 内容
+ *
+ * 工具约定：返回消息的工具在 details 中放置 rawMessages 字段（结构化消息数组），
+ * 处理层负责 preprocess + formatMessageCompact，工具无需感知预处理逻辑。
+ */
+function wrapWithPreprocessing(tool: AgentTool<any>, context: ToolContext): AgentTool<any> {
+  const originalExecute = tool.execute
+  return {
+    ...tool,
+    execute: async (toolCallId: string, params: any) => {
+      const result = await originalExecute(toolCallId, params)
+
+      const details = result.details as Record<string, unknown> | undefined
+      if (!details?.rawMessages || !Array.isArray(details.rawMessages)) {
+        return result
+      }
+
+      const raw = details.rawMessages as PreprocessableMessage[]
+      const processed = preprocessMessages(raw, context.preprocessConfig)
+
+      let nameMapLine = ''
+      if (context.preprocessConfig?.anonymizeNames) {
+        nameMapLine = anonymizeMessageNames(processed, context.ownerInfo?.platformId)
+      }
+
+      const formatted = processed.map((m) => formatMessageCompact(m, context.locale))
+
+      const finalDetails = { ...details, messages: formatted, returned: processed.length }
+      delete finalDetails.rawMessages
+
+      let textContent = formatToolResultAsText(finalDetails)
+      if (nameMapLine) {
+        textContent = nameMapLine + '\n' + textContent
+      }
+
+      return {
+        content: [{ type: 'text' as const, text: textContent }],
+        details: finalDetails,
+      }
+    },
+  }
+}
+
+/**
+ * 昵称匿名化：用 U{senderId} 替代真实昵称
+ * 就地修改 messages 的 senderName，返回映射表文本行
+ */
+function anonymizeMessageNames(messages: PreprocessableMessage[], ownerPlatformId?: string): string {
+  const nameMap = new Map<number, { name: string; platformId?: string }>()
+  for (const msg of messages) {
+    if (msg.senderId != null && !nameMap.has(msg.senderId)) {
+      nameMap.set(msg.senderId, { name: msg.senderName, platformId: msg.senderPlatformId })
+    }
+  }
+
+  if (nameMap.size === 0) return ''
+
+  for (const msg of messages) {
+    if (msg.senderId != null) {
+      msg.senderName = `U${msg.senderId}`
+    }
+  }
+
+  const entries: string[] = []
+  for (const [id, { name, platformId }] of nameMap) {
+    const isOwner = ownerPlatformId && platformId === ownerPlatformId
+    entries.push(`U${id}=${name}${isOwner ? '(owner)' : ''}`)
+  }
+
+  return `[Name Map] ${entries.join(' | ')}`
+}
+
 /**
 * 获取所有可用的 AgentTool
 *
 * 根据配置动态过滤工具（如：语义搜索工具仅在启用 Embedding 时可用）
 * 根据当前 locale 动态翻译工具描述
+ * 统一包装预处理层
 */
 export function getAllTools(context: ToolContext): AgentTool<any>[] {
  const tools: AgentTool<any>[] = coreFactories.map((f) => f(context))
@@ -84,5 +215,5 @@ export function getAllTools(context: ToolContext): AgentTool<any>[] {
    tools.push(createSemanticSearchMessages(context))
  }

-  return tools.map(translateTool)
+  return tools.map(translateTool).map((t) => wrapWithPreprocessing(t, context))
 }
@@ -2,6 +2,8 @@
 * AI Tools 类型定义
 */

+import type { PreprocessConfig } from '../preprocessor'
+
 /** Owner 信息（当前用户在对话中的身份） */
 export interface OwnerInfo {
  /** Owner 的 platformId */
@@ -30,4 +32,6 @@ export interface ToolContext {
  ownerInfo?: OwnerInfo
  /** 语言环境（用于工具返回结果的国际化） */
  locale?: string
+  /** 聊天记录预处理配置（全局） */
+  preprocessConfig?: PreprocessConfig
 }
@@ -14,6 +14,7 @@ import { ensureAvatarColumn } from './basic'
 */
 export interface MessageResult {
  id: number
+  senderId: number
  senderName: string
  senderPlatformId: string
  senderAliases: string[]
@@ -49,6 +50,7 @@ export interface MessagesWithTotal {
 */
 interface DbMessageRow {
  id: number
+  senderId: number
  senderName: string
  senderPlatformId: string
  aliases: string | null
@@ -78,6 +80,7 @@ function sanitizeMessageRow(row: DbMessageRow): MessageResult {

  return {
    id: Number(row.id),
+    senderId: Number(row.senderId),
    senderName: String(row.senderName || ''),
    senderPlatformId: String(row.senderPlatformId || ''),
    senderAliases: aliases,
@@ -155,6 +158,7 @@ export function getRecentMessages(sessionId: string, filter?: TimeFilter, limit:
  const sql = `
    SELECT
      msg.id,
+      m.id as senderId,
      COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
      m.platform_id as senderPlatformId,
      m.aliases,
@@ -218,6 +222,7 @@ export function getAllRecentMessages(sessionId: string, filter?: TimeFilter, lim
  const sql = `
    SELECT
      msg.id,
+      m.id as senderId,
      COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
      m.platform_id as senderPlatformId,
      m.aliases,
@@ -301,6 +306,7 @@ export function searchMessages(
  const sql = `
    SELECT
      msg.id,
+      m.id as senderId,
      COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
      m.platform_id as senderPlatformId,
      m.aliases,
@@ -391,6 +397,7 @@ export function getMessageContext(
  const sql = `
    SELECT
      msg.id,
+      m.id as senderId,
      COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
      m.platform_id as senderPlatformId,
      m.aliases,
@@ -450,6 +457,7 @@ export function getMessagesBefore(
  const sql = `
    SELECT
      msg.id,
+      m.id as senderId,
      COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
      m.platform_id as senderPlatformId,
      m.aliases,
@@ -522,6 +530,7 @@ export function getMessagesAfter(
  const sql = `
    SELECT
      msg.id,
+      m.id as senderId,
      COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
      m.platform_id as senderPlatformId,
      m.aliases,
@@ -622,6 +631,7 @@ export function getConversationBetween(
  const sql = `
    SELECT
      msg.id,
+      m.id as senderId,
      COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
      m.platform_id as senderPlatformId,
      m.aliases,
@@ -78,7 +78,9 @@ export function searchSessions(
    const previewSql = `
      SELECT
        m.id,
+        mb.id as senderId,
        COALESCE(mb.group_nickname, mb.account_name, mb.platform_id) as senderName,
+        mb.platform_id as senderPlatformId,
        m.content,
        m.ts as timestamp
      FROM message_context mc
@@ -93,7 +95,9 @@ export function searchSessions(
    for (const session of sessions) {
      const previewMessages = db.prepare(previewSql).all(session.id, previewCount) as Array<{
        id: number
+        senderId: number
        senderName: string
+        senderPlatformId: string
        content: string | null
        timestamp: number
      }>
@@ -164,7 +168,9 @@ export function getSessionMessages(
    const messagesSql = `
      SELECT
        m.id,
+        mb.id as senderId,
        COALESCE(mb.group_nickname, mb.account_name, mb.platform_id) as senderName,
+        mb.platform_id as senderPlatformId,
        m.content,
        m.ts as timestamp
      FROM message_context mc
@@ -176,7 +182,9 @@ export function getSessionMessages(
    `
    const messages = db.prepare(messagesSql).all(chatSessionId, limit) as Array<{
      id: number
+      senderId: number
      senderName: string
+      senderPlatformId: string
      content: string | null
      timestamp: number
    }>
@@ -100,10 +100,37 @@ export interface AgentResult {
  toolRounds: number
 }

+/** 单条脱敏规则 */
+export interface DesensitizeRule {
+  id: string
+  label: string
+  pattern: string
+  replacement: string
+  enabled: boolean
+  builtin: boolean
+  locales: string[]
+}
+
+/** 聊天记录预处理配置 */
+export interface PreprocessConfig {
+  dataCleaning: boolean
+  mergeConsecutive: boolean
+  mergeWindowSeconds?: number
+  blacklistKeywords: string[]
+  denoise: boolean
+  desensitize: boolean
+  desensitizeRules: DesensitizeRule[]
+  anonymizeNames: boolean
+}
+
 export interface ToolContext {
  sessionId: string
  conversationId?: string
  timeFilter?: { startTs: number; endTs: number }
+  maxMessagesLimit?: number
+  ownerInfo?: { platformId: string; displayName: string }
+  locale?: string
+  preprocessConfig?: PreprocessConfig
 }

 // AI 服务配置类型（前端用）
@@ -463,6 +490,14 @@ export const aiApi = {
  showAiLogFile: (): Promise<{ success: boolean; path?: string; error?: string }> => {
    return ipcRenderer.invoke('ai:showLogFile')
  },
+
+  getDefaultDesensitizeRules: (locale: string): Promise<DesensitizeRule[]> => {
+    return ipcRenderer.invoke('ai:getDefaultDesensitizeRules', locale)
+  },
+
+  mergeDesensitizeRules: (existingRules: DesensitizeRule[], locale: string): Promise<DesensitizeRule[]> => {
+    return ipcRenderer.invoke('ai:mergeDesensitizeRules', existingRules, locale)
+  },
 }

 // ==================== LLM API ====================
@@ -647,12 +682,34 @@ export const agentApi = {
    locale?: string,
    maxHistoryRounds?: number
  ): { requestId: string; promise: Promise<{ success: boolean; result?: AgentResult; error?: string }> } => {
+    // 防御性处理：确保传给 IPC 的 context 是“可结构化克隆”的纯对象
+    // 避免调用方误传入响应式 Proxy（例如 Pinia/Vue state）导致 invoke 失败
+    const sanitizedContext: ToolContext = {
+      sessionId: context.sessionId,
+      conversationId: context.conversationId,
+      timeFilter: context.timeFilter
+        ? {
+            startTs: context.timeFilter.startTs,
+            endTs: context.timeFilter.endTs,
+          }
+        : undefined,
+      maxMessagesLimit: context.maxMessagesLimit,
+      ownerInfo: context.ownerInfo
+        ? {
+            platformId: context.ownerInfo.platformId,
+            displayName: context.ownerInfo.displayName,
+          }
+        : undefined,
+      locale: context.locale,
+      preprocessConfig: context.preprocessConfig,
+    }
+
    const requestId = `agent_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`
    console.log(
      '[preload] Agent runStream 开始，requestId:',
      requestId,
      'conversationId:',
-      context.conversationId ?? 'none',
+      sanitizedContext.conversationId ?? 'none',
      'chatType:',
      chatType ?? 'group',
      'hasPromptConfig:',
@@ -694,7 +751,16 @@ export const agentApi = {
      ipcRenderer.on('agent:complete', completeHandler)

      ipcRenderer
-        .invoke('agent:runStream', requestId, userMessage, context, chatType, promptConfig, locale, maxHistoryRounds)
+        .invoke(
+          'agent:runStream',
+          requestId,
+          userMessage,
+          sanitizedContext,
+          chatType,
+          promptConfig,
+          locale,
+          maxHistoryRounds
+        )
        .then((result) => {
          console.log('[preload] Agent invoke 返回:', result)
          if (!result.success) {
@@ -373,6 +373,8 @@ interface AiApi {
  getMessages: (conversationId: string) => Promise<AIMessage[]>
  deleteMessage: (messageId: string) => Promise<boolean>
  showAiLogFile: () => Promise<{ success: boolean; path?: string; error?: string }>
+  getDefaultDesensitizeRules: (locale: string) => Promise<DesensitizeRule[]>
+  mergeDesensitizeRules: (existingRules: DesensitizeRule[], locale: string) => Promise<DesensitizeRule[]>
  // 自定义筛选（支持分页）
  filterMessagesWithContext: (
    sessionId: string,
@@ -617,6 +619,29 @@ interface OwnerInfo {
  displayName: string
 }

+/** 单条脱敏规则 */
+interface DesensitizeRule {
+  id: string
+  label: string
+  pattern: string
+  replacement: string
+  enabled: boolean
+  builtin: boolean
+  locales: string[]
+}
+
+/** 聊天记录预处理配置 */
+interface PreprocessConfig {
+  dataCleaning: boolean
+  mergeConsecutive: boolean
+  mergeWindowSeconds?: number
+  blacklistKeywords: string[]
+  denoise: boolean
+  desensitize: boolean
+  desensitizeRules: DesensitizeRule[]
+  anonymizeNames: boolean
+}
+
 interface ToolContext {
  sessionId: string
  conversationId?: string
@@ -625,6 +650,10 @@ interface ToolContext {
  maxMessagesLimit?: number
  /** Owner 信息（当前用户在对话中的身份） */
  ownerInfo?: OwnerInfo
+  /** 语言环境 */
+  locale?: string
+  /** 聊天记录预处理配置 */
+  preprocessConfig?: PreprocessConfig
 }

 // 用户自定义提示词配置
@@ -871,6 +900,8 @@ export {
  AgentRuntimeStatus,
  AgentResult,
  ToolContext,
+  DesensitizeRule,
+  PreprocessConfig,
  PromptConfig,
  TokenUsage,
  CacheDirectoryInfo,