mirror of
https://github.com/hellodigua/ChatLab.git
synced 2026-05-14 18:39:07 +08:00
feat(ai): 实现聊天记录预处理管道
- 新增 preprocessor 模块:数据清洗(XML卡片)、黑名单过滤、智能去噪、连续发言合并、数据脱敏
- 内置多国脱敏规则(中国手机号/身份证、美国SSN、日韩号码等)+ 自定义规则支持
- 工具层统一 wrapWithPreprocessing 包装,自动对 rawMessages 执行预处理+格式化
- 昵称匿名化:用 U{id} 替代真实昵称,跨工具调用一致
- SQL 查询补充 senderId/senderPlatformId
- PreprocessConfig 类型定义(preload + 主进程)
This commit is contained in:
@@ -0,0 +1,198 @@
|
||||
/**
|
||||
* 内置脱敏规则库
|
||||
* 按 locale 分组,支持通用规则和地区特定规则
|
||||
*/
|
||||
import type { DesensitizeRule } from './types'
|
||||
|
||||
export type { DesensitizeRule }
|
||||
|
||||
export const BUILTIN_DESENSITIZE_RULES: DesensitizeRule[] = [
|
||||
// ==================== 中国 (zh-CN) ====================
|
||||
{
|
||||
id: 'cn_phone',
|
||||
label: 'desensitize.rules.cn_phone',
|
||||
pattern: '(?<!\\d)1[3-9]\\d{9}(?!\\d)',
|
||||
replacement: '[手机号]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: ['zh-CN'],
|
||||
},
|
||||
{
|
||||
id: 'cn_id_card',
|
||||
label: 'desensitize.rules.cn_id_card',
|
||||
pattern: '(?<!\\d)\\d{17}[\\dXx](?!\\d)',
|
||||
replacement: '[身份证]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: ['zh-CN'],
|
||||
},
|
||||
{
|
||||
id: 'cn_bank_card',
|
||||
label: 'desensitize.rules.cn_bank_card',
|
||||
pattern: '(?<!\\d)\\d{16,19}(?!\\d)',
|
||||
replacement: '[银行卡]',
|
||||
enabled: false,
|
||||
builtin: true,
|
||||
locales: ['zh-CN'],
|
||||
},
|
||||
{
|
||||
id: 'cn_landline',
|
||||
label: 'desensitize.rules.cn_landline',
|
||||
pattern: '(?<!\\d)0\\d{2,3}-?\\d{7,8}(?!\\d)',
|
||||
replacement: '[座机号]',
|
||||
enabled: false,
|
||||
builtin: true,
|
||||
locales: ['zh-CN'],
|
||||
},
|
||||
|
||||
// ==================== 美国 (en-US) ====================
|
||||
{
|
||||
id: 'us_ssn',
|
||||
label: 'desensitize.rules.us_ssn',
|
||||
pattern: '(?<!\\d)\\d{3}-\\d{2}-\\d{4}(?!\\d)',
|
||||
replacement: '[SSN]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: ['en-US'],
|
||||
},
|
||||
{
|
||||
id: 'us_phone',
|
||||
label: 'desensitize.rules.us_phone',
|
||||
pattern: '(?<!\\d)(?:\\+?1[-\\s.]?)?\\(?\\d{3}\\)?[-\\s.]?\\d{3}[-\\s.]?\\d{4}(?!\\d)',
|
||||
replacement: '[Phone]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: ['en-US'],
|
||||
},
|
||||
{
|
||||
id: 'us_drivers_license',
|
||||
label: 'desensitize.rules.us_drivers_license',
|
||||
pattern: '(?<![A-Z])\\b[A-Z]\\d{7,8}\\b',
|
||||
replacement: "[Driver's License]",
|
||||
enabled: false,
|
||||
builtin: true,
|
||||
locales: ['en-US'],
|
||||
},
|
||||
|
||||
// ==================== 日本 (ja-JP) ====================
|
||||
{
|
||||
id: 'jp_phone',
|
||||
label: 'desensitize.rules.jp_phone',
|
||||
pattern: '(?<!\\d)0[789]0-?\\d{4}-?\\d{4}(?!\\d)',
|
||||
replacement: '[電話番号]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: ['ja-JP'],
|
||||
},
|
||||
{
|
||||
id: 'jp_my_number',
|
||||
label: 'desensitize.rules.jp_my_number',
|
||||
pattern: '(?<!\\d)\\d{4}\\s?\\d{4}\\s?\\d{4}(?!\\d)',
|
||||
replacement: '[マイナンバー]',
|
||||
enabled: false,
|
||||
builtin: true,
|
||||
locales: ['ja-JP'],
|
||||
},
|
||||
|
||||
// ==================== 韩国 (ko-KR) ====================
|
||||
{
|
||||
id: 'kr_phone',
|
||||
label: 'desensitize.rules.kr_phone',
|
||||
pattern: '(?<!\\d)01[016789]-?\\d{3,4}-?\\d{4}(?!\\d)',
|
||||
replacement: '[전화번호]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: ['ko-KR'],
|
||||
},
|
||||
{
|
||||
id: 'kr_rrn',
|
||||
label: 'desensitize.rules.kr_rrn',
|
||||
pattern: '(?<!\\d)\\d{6}-[1-4]\\d{6}(?!\\d)',
|
||||
replacement: '[주민번호]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: ['ko-KR'],
|
||||
},
|
||||
|
||||
// ==================== 凭据 / Token (所有语言) ====================
|
||||
{
|
||||
id: 'api_key_prefix',
|
||||
label: 'desensitize.rules.api_key_prefix',
|
||||
pattern: '\\b(?:sk-|pk_(?:live|test)_|ghp_|gho_|ghs_|ghu_|glpat-|xoxb-|xoxp-|AKIA)[A-Za-z0-9_\\-]{10,}\\b',
|
||||
replacement: '[API Key]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: [],
|
||||
},
|
||||
{
|
||||
id: 'bearer_token',
|
||||
label: 'desensitize.rules.bearer_token',
|
||||
pattern: 'Bearer\\s+[A-Za-z0-9\\-._~+/]+=*',
|
||||
replacement: 'Bearer [Token]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: [],
|
||||
},
|
||||
|
||||
// ==================== 通用 (所有语言) ====================
|
||||
{
|
||||
id: 'email',
|
||||
label: 'desensitize.rules.email',
|
||||
pattern: '[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}',
|
||||
replacement: '[Email]',
|
||||
enabled: true,
|
||||
builtin: true,
|
||||
locales: [],
|
||||
},
|
||||
{
|
||||
id: 'credit_card',
|
||||
label: 'desensitize.rules.credit_card',
|
||||
pattern:
|
||||
'(?<!\\d)(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[-\\s.]?\\d{4}[-\\s.]?\\d{4}[-\\s.]?\\d{4}(?!\\d)',
|
||||
replacement: '[Credit Card]',
|
||||
enabled: false,
|
||||
builtin: true,
|
||||
locales: [],
|
||||
},
|
||||
{
|
||||
id: 'ipv4',
|
||||
label: 'desensitize.rules.ipv4',
|
||||
pattern:
|
||||
'(?<!\\d)(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)(?!\\d)',
|
||||
replacement: '[IP]',
|
||||
enabled: false,
|
||||
builtin: true,
|
||||
locales: [],
|
||||
},
|
||||
{
|
||||
id: 'url',
|
||||
label: 'desensitize.rules.url',
|
||||
pattern: 'https?://[^\\s<>"]+',
|
||||
replacement: '[URL]',
|
||||
enabled: false,
|
||||
builtin: true,
|
||||
locales: [],
|
||||
},
|
||||
]
|
||||
|
||||
/**
|
||||
* 获取指定 locale 的默认规则(当前 locale 特定 + 通用规则)
|
||||
*/
|
||||
export function getDefaultRulesForLocale(locale: string): DesensitizeRule[] {
|
||||
return BUILTIN_DESENSITIZE_RULES.filter((rule) => rule.locales.length === 0 || rule.locales.includes(locale)).map(
|
||||
(rule) => ({ ...rule })
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* 合并新 locale 的规则到现有规则列表
|
||||
* 不重复添加已有 id,不修改已有规则的 enabled 状态
|
||||
*/
|
||||
export function mergeRulesForLocale(existing: DesensitizeRule[], locale: string): DesensitizeRule[] {
|
||||
const existingIds = new Set(existing.map((r) => r.id))
|
||||
const newRules = BUILTIN_DESENSITIZE_RULES.filter(
|
||||
(rule) => !existingIds.has(rule.id) && (rule.locales.length === 0 || rule.locales.includes(locale))
|
||||
).map((rule) => ({ ...rule }))
|
||||
|
||||
return [...existing, ...newRules]
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
export type { PreprocessConfig, PreprocessableMessage, DesensitizeRule } from './types'
|
||||
export { preprocessMessages } from './pipeline'
|
||||
export { BUILTIN_DESENSITIZE_RULES, getDefaultRulesForLocale, mergeRulesForLocale } from './builtin-rules'
|
||||
@@ -0,0 +1,272 @@
|
||||
/**
|
||||
* 预处理管道
|
||||
* 执行顺序:数据清洗 → 黑名单 → 去噪 → 合并 → 脱敏
|
||||
*/
|
||||
|
||||
import type { PreprocessConfig, PreprocessableMessage, DesensitizeRule } from './types'
|
||||
import { aiLogger } from '../logger'
|
||||
|
||||
const MERGE_WINDOW_DEFAULT = 180
|
||||
|
||||
/**
|
||||
* 对消息数组执行预处理管道
|
||||
*/
|
||||
export function preprocessMessages<T extends PreprocessableMessage>(messages: T[], config?: PreprocessConfig): T[] {
|
||||
if (!config || !hasAnyEnabled(config)) return messages
|
||||
if (messages.length === 0) return messages
|
||||
|
||||
const inputCount = messages.length
|
||||
let result: T[] = [...messages]
|
||||
const applied: string[] = []
|
||||
|
||||
if (config.dataCleaning !== false) {
|
||||
const cleaned = applyDataCleaning(result)
|
||||
if (cleaned.changed > 0) {
|
||||
result = cleaned.messages
|
||||
applied.push(`dataCleaning: ${cleaned.changed} messages cleaned`)
|
||||
}
|
||||
}
|
||||
|
||||
if (config.blacklistKeywords.length > 0) {
|
||||
const before = result.length
|
||||
result = applyBlacklistFilter(result, config.blacklistKeywords)
|
||||
applied.push(`blacklist: ${before} → ${result.length} (-${before - result.length})`)
|
||||
}
|
||||
|
||||
if (config.denoise) {
|
||||
const before = result.length
|
||||
result = applyDenoise(result)
|
||||
applied.push(`denoise: ${before} → ${result.length} (-${before - result.length})`)
|
||||
}
|
||||
|
||||
if (config.mergeConsecutive) {
|
||||
const before = result.length
|
||||
result = applyMergeConsecutive(result, config.mergeWindowSeconds ?? MERGE_WINDOW_DEFAULT)
|
||||
applied.push(`merge: ${before} → ${result.length} (-${before - result.length})`)
|
||||
}
|
||||
|
||||
if (config.desensitize) {
|
||||
const enabledRules = (config.desensitizeRules || []).filter((r) => r.enabled)
|
||||
if (enabledRules.length > 0) {
|
||||
result = applyDesensitize(result, enabledRules)
|
||||
applied.push(`desensitize: ${enabledRules.length} rules applied`)
|
||||
}
|
||||
}
|
||||
|
||||
aiLogger.info('Preprocess', `Pipeline: ${inputCount} → ${result.length} messages`, {
|
||||
strategies: applied,
|
||||
})
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
function hasAnyEnabled(config: PreprocessConfig): boolean {
|
||||
return (
|
||||
config.dataCleaning !== false ||
|
||||
config.mergeConsecutive ||
|
||||
config.blacklistKeywords.length > 0 ||
|
||||
config.denoise ||
|
||||
config.desensitize
|
||||
)
|
||||
}
|
||||
|
||||
// ==================== 策略实现 ====================
|
||||
|
||||
/**
|
||||
* 数据清洗:将 XML 卡片消息(美团分享、音乐、公众号等)提取为简洁文本
|
||||
* 微信 XML 消息以 <?xml 或 <msg> 或 <msg>< 开头
|
||||
*/
|
||||
function applyDataCleaning<T extends PreprocessableMessage>(messages: T[]): { messages: T[]; changed: number } {
|
||||
let changed = 0
|
||||
const result = messages.map((msg) => {
|
||||
if (!msg.content) return msg
|
||||
const cleaned = cleanXmlContent(msg.content)
|
||||
if (cleaned === msg.content) return msg
|
||||
changed++
|
||||
return { ...msg, content: cleaned }
|
||||
})
|
||||
return { messages: result, changed }
|
||||
}
|
||||
|
||||
const XML_START = /^<\?xml\s|^<msg[\s>]/
|
||||
|
||||
function cleanXmlContent(content: string): string {
|
||||
const trimmed = content.trim()
|
||||
if (!XML_START.test(trimmed)) return content
|
||||
|
||||
const title = extractXmlTag(trimmed, 'title')
|
||||
const des = extractXmlTag(trimmed, 'des')
|
||||
|
||||
if (title) {
|
||||
return des ? `[分享] ${title} - ${des}` : `[分享] ${title}`
|
||||
}
|
||||
return '[应用消息]'
|
||||
}
|
||||
|
||||
function extractXmlTag(xml: string, tag: string): string | null {
|
||||
const openTag = `<${tag}>`
|
||||
const closeTag = `</${tag}>`
|
||||
const start = xml.indexOf(openTag)
|
||||
if (start === -1) return null
|
||||
const contentStart = start + openTag.length
|
||||
const end = xml.indexOf(closeTag, contentStart)
|
||||
if (end === -1) return null
|
||||
const value = xml.slice(contentStart, end).trim()
|
||||
return value.length > 0 ? value : null
|
||||
}
|
||||
|
||||
/**
|
||||
* 黑名单过滤:消息内容包含任一关键词则整条移除
|
||||
*/
|
||||
function applyBlacklistFilter<T extends PreprocessableMessage>(messages: T[], keywords: string[]): T[] {
|
||||
if (keywords.length === 0) return messages
|
||||
const lowerKeywords = keywords.map((k) => k.toLowerCase())
|
||||
return messages.filter((msg) => {
|
||||
if (!msg.content) return true
|
||||
const lower = msg.content.toLowerCase()
|
||||
return !lowerKeywords.some((kw) => lower.includes(kw))
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* 智能去噪:过滤无意义消息
|
||||
* - 内容长度 < 2 的纯文本(如 "嗯"、"哦")
|
||||
* - 纯表情消息
|
||||
* - 系统占位符([图片]、[视频]、[语音] 等)
|
||||
* 回复保护:有 replyToMessageId 的消息不过滤
|
||||
*/
|
||||
function applyDenoise<T extends PreprocessableMessage>(messages: T[]): T[] {
|
||||
return messages.filter((msg) => {
|
||||
if (!msg.content) return false
|
||||
|
||||
if (msg.replyToMessageId) return true
|
||||
|
||||
const content = msg.content.trim()
|
||||
if (content.length === 0) return false
|
||||
|
||||
if (content.length < 2) return false
|
||||
|
||||
if (SYSTEM_PLACEHOLDERS.has(content)) return false
|
||||
|
||||
if (isPureEmoji(content)) return false
|
||||
|
||||
return true
|
||||
})
|
||||
}
|
||||
|
||||
const SYSTEM_PLACEHOLDERS = new Set([
|
||||
'[图片]',
|
||||
'[视频]',
|
||||
'[语音]',
|
||||
'[文件]',
|
||||
'[动画表情]',
|
||||
'[表情]',
|
||||
'[链接]',
|
||||
'[位置]',
|
||||
'[名片]',
|
||||
'[红包]',
|
||||
'[转账]',
|
||||
'[音乐]',
|
||||
'[Image]',
|
||||
'[Video]',
|
||||
'[Voice]',
|
||||
'[File]',
|
||||
'[Sticker]',
|
||||
'[Link]',
|
||||
])
|
||||
|
||||
function isPureEmoji(str: string): boolean {
|
||||
const stripped = str
|
||||
.replace(/\p{Emoji_Presentation}/gu, '')
|
||||
.replace(/\p{Extended_Pictographic}/gu, '')
|
||||
.replace(/\u200d/g, '')
|
||||
.replace(/\ufe0f/g, '')
|
||||
.replace(/\u20e3/g, '')
|
||||
.replace(/\s/g, '')
|
||||
return stripped.length === 0
|
||||
}
|
||||
|
||||
/**
|
||||
* 合并连续发言:同一发送者在时间窗口内的连续消息合并为一条
|
||||
* 使用 senderPlatformId(如可用),否则 fallback 到 senderName
|
||||
*/
|
||||
function applyMergeConsecutive<T extends PreprocessableMessage>(messages: T[], windowSeconds: number): T[] {
|
||||
if (messages.length <= 1) return messages
|
||||
|
||||
const merged: T[] = []
|
||||
let current: T | null = null
|
||||
|
||||
for (const msg of messages) {
|
||||
if (!current) {
|
||||
current = { ...msg }
|
||||
continue
|
||||
}
|
||||
|
||||
const sameSender = isSameSender(current, msg)
|
||||
const withinWindow = Math.abs(msg.timestamp - current.timestamp) <= windowSeconds
|
||||
|
||||
if (sameSender && withinWindow) {
|
||||
current = {
|
||||
...current,
|
||||
content: [current.content, msg.content].filter(Boolean).join('\n'),
|
||||
}
|
||||
} else {
|
||||
merged.push(current)
|
||||
current = { ...msg }
|
||||
}
|
||||
}
|
||||
|
||||
if (current) merged.push(current)
|
||||
return merged
|
||||
}
|
||||
|
||||
function isSameSender(a: PreprocessableMessage, b: PreprocessableMessage): boolean {
|
||||
if (a.senderPlatformId && b.senderPlatformId) {
|
||||
return a.senderPlatformId === b.senderPlatformId
|
||||
}
|
||||
return a.senderName === b.senderName
|
||||
}
|
||||
|
||||
/**
|
||||
* 数据脱敏:按规则列表顺序依次替换敏感信息
|
||||
* 规则按列表优先级排序,先匹配的先替换
|
||||
* 自定义正则有超时保护(单条规则 50ms 上限)
|
||||
*/
|
||||
function applyDesensitize<T extends PreprocessableMessage>(messages: T[], rules: DesensitizeRule[]): T[] {
|
||||
const compiledRules = compileRules(rules)
|
||||
if (compiledRules.length === 0) return messages
|
||||
|
||||
return messages.map((msg) => {
|
||||
if (!msg.content) return msg
|
||||
let content = msg.content
|
||||
for (const { regex, replacement } of compiledRules) {
|
||||
regex.lastIndex = 0
|
||||
content = content.replace(regex, replacement)
|
||||
}
|
||||
if (content === msg.content) return msg
|
||||
return { ...msg, content }
|
||||
})
|
||||
}
|
||||
|
||||
const regexCache = new Map<string, RegExp | null>()
|
||||
|
||||
function compileRules(rules: DesensitizeRule[]): Array<{ regex: RegExp; replacement: string }> {
|
||||
const result: Array<{ regex: RegExp; replacement: string }> = []
|
||||
for (const rule of rules) {
|
||||
let regex = regexCache.get(rule.pattern)
|
||||
if (regex === undefined) {
|
||||
try {
|
||||
regex = new RegExp(rule.pattern, 'g')
|
||||
regexCache.set(rule.pattern, regex)
|
||||
} catch {
|
||||
aiLogger.warn('Preprocess', `Invalid regex in desensitize rule "${rule.id}": ${rule.pattern}`)
|
||||
regexCache.set(rule.pattern, null)
|
||||
continue
|
||||
}
|
||||
}
|
||||
if (regex) {
|
||||
result.push({ regex, replacement: rule.replacement })
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
/** 单条脱敏规则 */
|
||||
export interface DesensitizeRule {
|
||||
/** 唯一标识(预置规则用固定 id,自定义规则用 uuid) */
|
||||
id: string
|
||||
/** 显示名称 */
|
||||
label: string
|
||||
/** 正则表达式字符串(运行时 new RegExp(pattern, 'g')) */
|
||||
pattern: string
|
||||
/** 替换文本 */
|
||||
replacement: string
|
||||
/** 是否启用 */
|
||||
enabled: boolean
|
||||
/** 是否为预置规则(预置规则不可删除,仅可启用/禁用) */
|
||||
builtin: boolean
|
||||
/** 适用的 locale 列表(空数组表示通用) */
|
||||
locales: string[]
|
||||
}
|
||||
|
||||
/** 预处理配置 */
|
||||
export interface PreprocessConfig {
|
||||
/** 数据清洗:清理 XML 卡片消息等非纯文本内容(默认开启) */
|
||||
dataCleaning: boolean
|
||||
/** 合并连续发言(同发送者 + 时间间隔 < mergeWindowSeconds) */
|
||||
mergeConsecutive: boolean
|
||||
/** 合并窗口(秒),默认 180 */
|
||||
mergeWindowSeconds?: number
|
||||
/** 自定义黑名单关键词,包含任一关键词的消息将被整条过滤 */
|
||||
blacklistKeywords: string[]
|
||||
/** 智能去噪(过滤纯语气词、纯表情、系统占位符) */
|
||||
denoise: boolean
|
||||
/** 数据脱敏总开关 */
|
||||
desensitize: boolean
|
||||
/** 脱敏规则列表(预置 + 自定义,按优先级排序) */
|
||||
desensitizeRules: DesensitizeRule[]
|
||||
/** 昵称匿名化:用 U{id} 替代真实昵称,减少 AI 幻觉 */
|
||||
anonymizeNames: boolean
|
||||
}
|
||||
|
||||
/** 预处理管道可接受的消息结构(兼容 SearchMessageResult / SessionMessagesResult.messages) */
|
||||
export interface PreprocessableMessage {
|
||||
id?: number
|
||||
senderId?: number
|
||||
senderName: string
|
||||
senderPlatformId?: string
|
||||
content: string | null
|
||||
timestamp: number
|
||||
replyToMessageId?: string | null
|
||||
}
|
||||
@@ -4,7 +4,7 @@ import type { ToolContext } from '../types'
|
||||
import { timeParamProperties } from '../utils/schemas'
|
||||
import * as workerManager from '../../../worker/workerManager'
|
||||
import { parseExtendedTimeParams } from '../utils/time-params'
|
||||
import { formatTimeRange, formatMessageCompact, t } from '../utils/format'
|
||||
import { formatTimeRange, t } from '../utils/format'
|
||||
|
||||
const schema = Type.Object({
|
||||
member_id_1: Type.Number({ description: 'ai.tools.get_conversation_between.params.member_id_1' }),
|
||||
@@ -51,7 +51,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
|
||||
member1: result.member1Name,
|
||||
member2: result.member2Name,
|
||||
timeRange: formatTimeRange(effectiveTimeFilter, locale),
|
||||
conversation: result.messages.map((m) => formatMessageCompact(m, locale)),
|
||||
rawMessages: result.messages,
|
||||
}
|
||||
|
||||
return {
|
||||
|
||||
@@ -2,7 +2,7 @@ import { Type } from '@mariozechner/pi-ai'
|
||||
import type { AgentTool } from '@mariozechner/pi-agent-core'
|
||||
import type { ToolContext } from '../types'
|
||||
import * as workerManager from '../../../worker/workerManager'
|
||||
import { formatMessageCompact, t } from '../utils/format'
|
||||
import { t } from '../utils/format'
|
||||
|
||||
const schema = Type.Object({
|
||||
message_ids: Type.Array(Type.Number(), { description: 'ai.tools.get_message_context.params.message_ids' }),
|
||||
@@ -37,7 +37,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
|
||||
totalMessages: messages.length,
|
||||
contextSize: contextSize,
|
||||
requestedMessageIds: params.message_ids,
|
||||
messages: messages.map((m) => formatMessageCompact(m, locale)),
|
||||
rawMessages: messages,
|
||||
}
|
||||
|
||||
return {
|
||||
|
||||
@@ -3,7 +3,7 @@ import type { AgentTool } from '@mariozechner/pi-agent-core'
|
||||
import type { ToolContext } from '../types'
|
||||
import * as workerManager from '../../../worker/workerManager'
|
||||
import { parseExtendedTimeParams } from '../utils/time-params'
|
||||
import { formatTimeRange, formatMessageCompact } from '../utils/format'
|
||||
import { formatTimeRange } from '../utils/format'
|
||||
import { timeParamProperties } from '../utils/schemas'
|
||||
|
||||
const schema = Type.Object({
|
||||
@@ -29,7 +29,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
|
||||
total: result.total,
|
||||
returned: result.messages.length,
|
||||
timeRange: formatTimeRange(effectiveTimeFilter, locale),
|
||||
messages: result.messages.map((m) => formatMessageCompact(m, locale)),
|
||||
rawMessages: result.messages,
|
||||
}
|
||||
|
||||
return {
|
||||
|
||||
@@ -2,7 +2,7 @@ import { Type } from '@mariozechner/pi-ai'
|
||||
import type { AgentTool } from '@mariozechner/pi-agent-core'
|
||||
import type { ToolContext } from '../types'
|
||||
import * as workerManager from '../../../worker/workerManager'
|
||||
import { isChineseLocale, formatMessageCompact } from '../utils/format'
|
||||
import { isChineseLocale } from '../utils/format'
|
||||
|
||||
const schema = Type.Object({
|
||||
session_id: Type.Number({ description: 'ai.tools.get_session_messages.params.session_id' }),
|
||||
@@ -38,7 +38,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
|
||||
messageCount: result.messageCount,
|
||||
returnedCount: result.returnedCount,
|
||||
participants: result.participants,
|
||||
messages: result.messages.map((m) => formatMessageCompact(m, locale)),
|
||||
rawMessages: result.messages,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import type { AgentTool } from '@mariozechner/pi-agent-core'
|
||||
import type { ToolContext } from '../types'
|
||||
import * as workerManager from '../../../worker/workerManager'
|
||||
import { parseExtendedTimeParams } from '../utils/time-params'
|
||||
import { formatTimeRange, formatMessageCompact } from '../utils/format'
|
||||
import { formatTimeRange } from '../utils/format'
|
||||
import { timeParamProperties } from '../utils/schemas'
|
||||
|
||||
const schema = Type.Object({
|
||||
@@ -38,7 +38,7 @@ export function createTool(context: ToolContext): AgentTool<typeof schema> {
|
||||
total: result.total,
|
||||
returned: result.messages.length,
|
||||
timeRange: formatTimeRange(effectiveTimeFilter, locale),
|
||||
messages: result.messages.map((m) => formatMessageCompact(m, locale)),
|
||||
rawMessages: result.messages,
|
||||
}
|
||||
|
||||
return {
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
/**
|
||||
* AI Tools 模块入口
|
||||
* 工具创建与管理
|
||||
* 工具创建、预处理管道与管理
|
||||
*
|
||||
* 架构:工具返回结构化数据(rawMessages) → 处理层执行预处理 + 格式化 → 生成 LLM 内容
|
||||
*/
|
||||
|
||||
import type { AgentTool } from '@mariozechner/pi-agent-core'
|
||||
@@ -21,6 +23,8 @@ import {
|
||||
} from './definitions'
|
||||
import { isEmbeddingEnabled } from '../rag'
|
||||
import { t as i18nT } from '../../i18n'
|
||||
import { preprocessMessages, type PreprocessableMessage } from '../preprocessor'
|
||||
import { formatMessageCompact } from './utils/format'
|
||||
|
||||
// 导出类型
|
||||
export * from './types'
|
||||
@@ -41,6 +45,57 @@ const coreFactories: ToolFactory[] = [
|
||||
createGetSessionSummaries,
|
||||
]
|
||||
|
||||
/**
|
||||
* 将工具返回的结构化数据格式化为 LLM 友好的纯文本
|
||||
*
|
||||
* 从 JSON.stringify 改为纯文本,节省 token 且更易于 LLM 理解。
|
||||
* 元数据作为头部,消息逐行排列。
|
||||
*/
|
||||
function formatToolResultAsText(details: Record<string, unknown>): string {
|
||||
const lines: string[] = []
|
||||
const messages = details.messages as string[] | undefined
|
||||
|
||||
for (const [key, value] of Object.entries(details)) {
|
||||
if (key === 'messages') continue
|
||||
if (value === undefined || value === null) continue
|
||||
|
||||
if (typeof value === 'object') {
|
||||
if ('start' in (value as Record<string, unknown>) && 'end' in (value as Record<string, unknown>)) {
|
||||
const range = value as { start: string; end: string }
|
||||
lines.push(`${key}: ${range.start} ~ ${range.end}`)
|
||||
} else if (Array.isArray(value)) {
|
||||
lines.push(`${key}: ${value.join(', ')}`)
|
||||
} else {
|
||||
lines.push(`${key}: ${JSON.stringify(value)}`)
|
||||
}
|
||||
} else {
|
||||
lines.push(`${key}: ${value}`)
|
||||
}
|
||||
}
|
||||
|
||||
if (messages && messages.length > 0) {
|
||||
lines.push('')
|
||||
let lastDate = ''
|
||||
for (const msg of messages) {
|
||||
const spaceIdx = msg.indexOf(' ')
|
||||
const secondSpaceIdx = msg.indexOf(' ', spaceIdx + 1)
|
||||
if (spaceIdx > 0 && secondSpaceIdx > 0) {
|
||||
const date = msg.slice(0, spaceIdx)
|
||||
const rest = msg.slice(spaceIdx + 1)
|
||||
if (date !== lastDate) {
|
||||
lines.push(`--- ${date} ---`)
|
||||
lastDate = date
|
||||
}
|
||||
lines.push(rest)
|
||||
} else {
|
||||
lines.push(msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return lines.join('\n')
|
||||
}
|
||||
|
||||
/**
|
||||
* 翻译 AgentTool 的描述(工具级 + 参数级)
|
||||
*
|
||||
@@ -71,11 +126,87 @@ function translateTool(tool: AgentTool<any>): AgentTool<any> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 预处理包装层
|
||||
* 拦截工具的 execute 结果:如果 details 中包含 rawMessages,
|
||||
* 则执行预处理管道 + 格式化,替换为最终的 LLM 内容
|
||||
*
|
||||
* 工具约定:返回消息的工具在 details 中放置 rawMessages 字段(结构化消息数组),
|
||||
* 处理层负责 preprocess + formatMessageCompact,工具无需感知预处理逻辑。
|
||||
*/
|
||||
function wrapWithPreprocessing(tool: AgentTool<any>, context: ToolContext): AgentTool<any> {
|
||||
const originalExecute = tool.execute
|
||||
return {
|
||||
...tool,
|
||||
execute: async (toolCallId: string, params: any) => {
|
||||
const result = await originalExecute(toolCallId, params)
|
||||
|
||||
const details = result.details as Record<string, unknown> | undefined
|
||||
if (!details?.rawMessages || !Array.isArray(details.rawMessages)) {
|
||||
return result
|
||||
}
|
||||
|
||||
const raw = details.rawMessages as PreprocessableMessage[]
|
||||
const processed = preprocessMessages(raw, context.preprocessConfig)
|
||||
|
||||
let nameMapLine = ''
|
||||
if (context.preprocessConfig?.anonymizeNames) {
|
||||
nameMapLine = anonymizeMessageNames(processed, context.ownerInfo?.platformId)
|
||||
}
|
||||
|
||||
const formatted = processed.map((m) => formatMessageCompact(m, context.locale))
|
||||
|
||||
const finalDetails = { ...details, messages: formatted, returned: processed.length }
|
||||
delete finalDetails.rawMessages
|
||||
|
||||
let textContent = formatToolResultAsText(finalDetails)
|
||||
if (nameMapLine) {
|
||||
textContent = nameMapLine + '\n' + textContent
|
||||
}
|
||||
|
||||
return {
|
||||
content: [{ type: 'text' as const, text: textContent }],
|
||||
details: finalDetails,
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 昵称匿名化:用 U{senderId} 替代真实昵称
|
||||
* 就地修改 messages 的 senderName,返回映射表文本行
|
||||
*/
|
||||
function anonymizeMessageNames(messages: PreprocessableMessage[], ownerPlatformId?: string): string {
|
||||
const nameMap = new Map<number, { name: string; platformId?: string }>()
|
||||
for (const msg of messages) {
|
||||
if (msg.senderId != null && !nameMap.has(msg.senderId)) {
|
||||
nameMap.set(msg.senderId, { name: msg.senderName, platformId: msg.senderPlatformId })
|
||||
}
|
||||
}
|
||||
|
||||
if (nameMap.size === 0) return ''
|
||||
|
||||
for (const msg of messages) {
|
||||
if (msg.senderId != null) {
|
||||
msg.senderName = `U${msg.senderId}`
|
||||
}
|
||||
}
|
||||
|
||||
const entries: string[] = []
|
||||
for (const [id, { name, platformId }] of nameMap) {
|
||||
const isOwner = ownerPlatformId && platformId === ownerPlatformId
|
||||
entries.push(`U${id}=${name}${isOwner ? '(owner)' : ''}`)
|
||||
}
|
||||
|
||||
return `[Name Map] ${entries.join(' | ')}`
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取所有可用的 AgentTool
|
||||
*
|
||||
* 根据配置动态过滤工具(如:语义搜索工具仅在启用 Embedding 时可用)
|
||||
* 根据当前 locale 动态翻译工具描述
|
||||
* 统一包装预处理层
|
||||
*/
|
||||
export function getAllTools(context: ToolContext): AgentTool<any>[] {
|
||||
const tools: AgentTool<any>[] = coreFactories.map((f) => f(context))
|
||||
@@ -84,5 +215,5 @@ export function getAllTools(context: ToolContext): AgentTool<any>[] {
|
||||
tools.push(createSemanticSearchMessages(context))
|
||||
}
|
||||
|
||||
return tools.map(translateTool)
|
||||
return tools.map(translateTool).map((t) => wrapWithPreprocessing(t, context))
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
* AI Tools 类型定义
|
||||
*/
|
||||
|
||||
import type { PreprocessConfig } from '../preprocessor'
|
||||
|
||||
/** Owner 信息(当前用户在对话中的身份) */
|
||||
export interface OwnerInfo {
|
||||
/** Owner 的 platformId */
|
||||
@@ -30,4 +32,6 @@ export interface ToolContext {
|
||||
ownerInfo?: OwnerInfo
|
||||
/** 语言环境(用于工具返回结果的国际化) */
|
||||
locale?: string
|
||||
/** 聊天记录预处理配置(全局) */
|
||||
preprocessConfig?: PreprocessConfig
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ import { ensureAvatarColumn } from './basic'
|
||||
*/
|
||||
export interface MessageResult {
|
||||
id: number
|
||||
senderId: number
|
||||
senderName: string
|
||||
senderPlatformId: string
|
||||
senderAliases: string[]
|
||||
@@ -49,6 +50,7 @@ export interface MessagesWithTotal {
|
||||
*/
|
||||
interface DbMessageRow {
|
||||
id: number
|
||||
senderId: number
|
||||
senderName: string
|
||||
senderPlatformId: string
|
||||
aliases: string | null
|
||||
@@ -78,6 +80,7 @@ function sanitizeMessageRow(row: DbMessageRow): MessageResult {
|
||||
|
||||
return {
|
||||
id: Number(row.id),
|
||||
senderId: Number(row.senderId),
|
||||
senderName: String(row.senderName || ''),
|
||||
senderPlatformId: String(row.senderPlatformId || ''),
|
||||
senderAliases: aliases,
|
||||
@@ -155,6 +158,7 @@ export function getRecentMessages(sessionId: string, filter?: TimeFilter, limit:
|
||||
const sql = `
|
||||
SELECT
|
||||
msg.id,
|
||||
m.id as senderId,
|
||||
COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
|
||||
m.platform_id as senderPlatformId,
|
||||
m.aliases,
|
||||
@@ -218,6 +222,7 @@ export function getAllRecentMessages(sessionId: string, filter?: TimeFilter, lim
|
||||
const sql = `
|
||||
SELECT
|
||||
msg.id,
|
||||
m.id as senderId,
|
||||
COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
|
||||
m.platform_id as senderPlatformId,
|
||||
m.aliases,
|
||||
@@ -301,6 +306,7 @@ export function searchMessages(
|
||||
const sql = `
|
||||
SELECT
|
||||
msg.id,
|
||||
m.id as senderId,
|
||||
COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
|
||||
m.platform_id as senderPlatformId,
|
||||
m.aliases,
|
||||
@@ -391,6 +397,7 @@ export function getMessageContext(
|
||||
const sql = `
|
||||
SELECT
|
||||
msg.id,
|
||||
m.id as senderId,
|
||||
COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
|
||||
m.platform_id as senderPlatformId,
|
||||
m.aliases,
|
||||
@@ -450,6 +457,7 @@ export function getMessagesBefore(
|
||||
const sql = `
|
||||
SELECT
|
||||
msg.id,
|
||||
m.id as senderId,
|
||||
COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
|
||||
m.platform_id as senderPlatformId,
|
||||
m.aliases,
|
||||
@@ -522,6 +530,7 @@ export function getMessagesAfter(
|
||||
const sql = `
|
||||
SELECT
|
||||
msg.id,
|
||||
m.id as senderId,
|
||||
COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
|
||||
m.platform_id as senderPlatformId,
|
||||
m.aliases,
|
||||
@@ -622,6 +631,7 @@ export function getConversationBetween(
|
||||
const sql = `
|
||||
SELECT
|
||||
msg.id,
|
||||
m.id as senderId,
|
||||
COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
|
||||
m.platform_id as senderPlatformId,
|
||||
m.aliases,
|
||||
|
||||
@@ -78,7 +78,9 @@ export function searchSessions(
|
||||
const previewSql = `
|
||||
SELECT
|
||||
m.id,
|
||||
mb.id as senderId,
|
||||
COALESCE(mb.group_nickname, mb.account_name, mb.platform_id) as senderName,
|
||||
mb.platform_id as senderPlatformId,
|
||||
m.content,
|
||||
m.ts as timestamp
|
||||
FROM message_context mc
|
||||
@@ -93,7 +95,9 @@ export function searchSessions(
|
||||
for (const session of sessions) {
|
||||
const previewMessages = db.prepare(previewSql).all(session.id, previewCount) as Array<{
|
||||
id: number
|
||||
senderId: number
|
||||
senderName: string
|
||||
senderPlatformId: string
|
||||
content: string | null
|
||||
timestamp: number
|
||||
}>
|
||||
@@ -164,7 +168,9 @@ export function getSessionMessages(
|
||||
const messagesSql = `
|
||||
SELECT
|
||||
m.id,
|
||||
mb.id as senderId,
|
||||
COALESCE(mb.group_nickname, mb.account_name, mb.platform_id) as senderName,
|
||||
mb.platform_id as senderPlatformId,
|
||||
m.content,
|
||||
m.ts as timestamp
|
||||
FROM message_context mc
|
||||
@@ -176,7 +182,9 @@ export function getSessionMessages(
|
||||
`
|
||||
const messages = db.prepare(messagesSql).all(chatSessionId, limit) as Array<{
|
||||
id: number
|
||||
senderId: number
|
||||
senderName: string
|
||||
senderPlatformId: string
|
||||
content: string | null
|
||||
timestamp: number
|
||||
}>
|
||||
|
||||
@@ -100,10 +100,37 @@ export interface AgentResult {
|
||||
toolRounds: number
|
||||
}
|
||||
|
||||
/** 单条脱敏规则 */
|
||||
export interface DesensitizeRule {
|
||||
id: string
|
||||
label: string
|
||||
pattern: string
|
||||
replacement: string
|
||||
enabled: boolean
|
||||
builtin: boolean
|
||||
locales: string[]
|
||||
}
|
||||
|
||||
/** 聊天记录预处理配置 */
|
||||
export interface PreprocessConfig {
|
||||
dataCleaning: boolean
|
||||
mergeConsecutive: boolean
|
||||
mergeWindowSeconds?: number
|
||||
blacklistKeywords: string[]
|
||||
denoise: boolean
|
||||
desensitize: boolean
|
||||
desensitizeRules: DesensitizeRule[]
|
||||
anonymizeNames: boolean
|
||||
}
|
||||
|
||||
export interface ToolContext {
|
||||
sessionId: string
|
||||
conversationId?: string
|
||||
timeFilter?: { startTs: number; endTs: number }
|
||||
maxMessagesLimit?: number
|
||||
ownerInfo?: { platformId: string; displayName: string }
|
||||
locale?: string
|
||||
preprocessConfig?: PreprocessConfig
|
||||
}
|
||||
|
||||
// AI 服务配置类型(前端用)
|
||||
@@ -463,6 +490,14 @@ export const aiApi = {
|
||||
showAiLogFile: (): Promise<{ success: boolean; path?: string; error?: string }> => {
|
||||
return ipcRenderer.invoke('ai:showLogFile')
|
||||
},
|
||||
|
||||
getDefaultDesensitizeRules: (locale: string): Promise<DesensitizeRule[]> => {
|
||||
return ipcRenderer.invoke('ai:getDefaultDesensitizeRules', locale)
|
||||
},
|
||||
|
||||
mergeDesensitizeRules: (existingRules: DesensitizeRule[], locale: string): Promise<DesensitizeRule[]> => {
|
||||
return ipcRenderer.invoke('ai:mergeDesensitizeRules', existingRules, locale)
|
||||
},
|
||||
}
|
||||
|
||||
// ==================== LLM API ====================
|
||||
@@ -647,12 +682,34 @@ export const agentApi = {
|
||||
locale?: string,
|
||||
maxHistoryRounds?: number
|
||||
): { requestId: string; promise: Promise<{ success: boolean; result?: AgentResult; error?: string }> } => {
|
||||
// 防御性处理:确保传给 IPC 的 context 是“可结构化克隆”的纯对象
|
||||
// 避免调用方误传入响应式 Proxy(例如 Pinia/Vue state)导致 invoke 失败
|
||||
const sanitizedContext: ToolContext = {
|
||||
sessionId: context.sessionId,
|
||||
conversationId: context.conversationId,
|
||||
timeFilter: context.timeFilter
|
||||
? {
|
||||
startTs: context.timeFilter.startTs,
|
||||
endTs: context.timeFilter.endTs,
|
||||
}
|
||||
: undefined,
|
||||
maxMessagesLimit: context.maxMessagesLimit,
|
||||
ownerInfo: context.ownerInfo
|
||||
? {
|
||||
platformId: context.ownerInfo.platformId,
|
||||
displayName: context.ownerInfo.displayName,
|
||||
}
|
||||
: undefined,
|
||||
locale: context.locale,
|
||||
preprocessConfig: context.preprocessConfig,
|
||||
}
|
||||
|
||||
const requestId = `agent_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`
|
||||
console.log(
|
||||
'[preload] Agent runStream 开始,requestId:',
|
||||
requestId,
|
||||
'conversationId:',
|
||||
context.conversationId ?? 'none',
|
||||
sanitizedContext.conversationId ?? 'none',
|
||||
'chatType:',
|
||||
chatType ?? 'group',
|
||||
'hasPromptConfig:',
|
||||
@@ -694,7 +751,16 @@ export const agentApi = {
|
||||
ipcRenderer.on('agent:complete', completeHandler)
|
||||
|
||||
ipcRenderer
|
||||
.invoke('agent:runStream', requestId, userMessage, context, chatType, promptConfig, locale, maxHistoryRounds)
|
||||
.invoke(
|
||||
'agent:runStream',
|
||||
requestId,
|
||||
userMessage,
|
||||
sanitizedContext,
|
||||
chatType,
|
||||
promptConfig,
|
||||
locale,
|
||||
maxHistoryRounds
|
||||
)
|
||||
.then((result) => {
|
||||
console.log('[preload] Agent invoke 返回:', result)
|
||||
if (!result.success) {
|
||||
|
||||
Vendored
+31
@@ -373,6 +373,8 @@ interface AiApi {
|
||||
getMessages: (conversationId: string) => Promise<AIMessage[]>
|
||||
deleteMessage: (messageId: string) => Promise<boolean>
|
||||
showAiLogFile: () => Promise<{ success: boolean; path?: string; error?: string }>
|
||||
getDefaultDesensitizeRules: (locale: string) => Promise<DesensitizeRule[]>
|
||||
mergeDesensitizeRules: (existingRules: DesensitizeRule[], locale: string) => Promise<DesensitizeRule[]>
|
||||
// 自定义筛选(支持分页)
|
||||
filterMessagesWithContext: (
|
||||
sessionId: string,
|
||||
@@ -617,6 +619,29 @@ interface OwnerInfo {
|
||||
displayName: string
|
||||
}
|
||||
|
||||
/** 单条脱敏规则 */
|
||||
interface DesensitizeRule {
|
||||
id: string
|
||||
label: string
|
||||
pattern: string
|
||||
replacement: string
|
||||
enabled: boolean
|
||||
builtin: boolean
|
||||
locales: string[]
|
||||
}
|
||||
|
||||
/** 聊天记录预处理配置 */
|
||||
interface PreprocessConfig {
|
||||
dataCleaning: boolean
|
||||
mergeConsecutive: boolean
|
||||
mergeWindowSeconds?: number
|
||||
blacklistKeywords: string[]
|
||||
denoise: boolean
|
||||
desensitize: boolean
|
||||
desensitizeRules: DesensitizeRule[]
|
||||
anonymizeNames: boolean
|
||||
}
|
||||
|
||||
interface ToolContext {
|
||||
sessionId: string
|
||||
conversationId?: string
|
||||
@@ -625,6 +650,10 @@ interface ToolContext {
|
||||
maxMessagesLimit?: number
|
||||
/** Owner 信息(当前用户在对话中的身份) */
|
||||
ownerInfo?: OwnerInfo
|
||||
/** 语言环境 */
|
||||
locale?: string
|
||||
/** 聊天记录预处理配置 */
|
||||
preprocessConfig?: PreprocessConfig
|
||||
}
|
||||
|
||||
// 用户自定义提示词配置
|
||||
@@ -871,6 +900,8 @@ export {
|
||||
AgentRuntimeStatus,
|
||||
AgentResult,
|
||||
ToolContext,
|
||||
DesensitizeRule,
|
||||
PreprocessConfig,
|
||||
PromptConfig,
|
||||
TokenUsage,
|
||||
CacheDirectoryInfo,
|
||||
|
||||
Reference in New Issue
Block a user