import { Type } from '@mariozechner/pi-ai'
import type { AgentTool } from '@mariozechner/pi-agent-core'
import type { ToolContext } from '../types'
import * as workerManager from '../../../worker/workerManager'
import { parseExtendedTimeParams } from '../utils/time-params'
import { formatTimeRange } from '../utils/format'
import { timeParamProperties } from '../utils/schemas'

/** Limit used when neither the context nor the tool call supplies a usable one. */
const DEFAULT_LIMIT = 1000
/** Hard upper bound on the number of messages a single call may return. */
const MAX_LIMIT = 50000

const schema = Type.Object({
  keywords: Type.Array(Type.String(), { description: 'ai.tools.deep_search_messages.params.keywords' }),
  sender_id: Type.Optional(Type.Number({ description: 'ai.tools.deep_search_messages.params.sender_id' })),
  limit: Type.Optional(Type.Number({ description: 'ai.tools.deep_search_messages.params.limit' })),
  ...timeParamProperties,
})

/**
 * Picks the effective row limit for a search.
 *
 * Precedence is unchanged: the context-level limit wins over the limit requested
 * in the tool call, which wins over the default. The result is always a whole
 * number in [1, MAX_LIMIT]. This matters because SQLite treats a negative LIMIT
 * as "no upper bound" and rejects a non-integer LIMIT, so raw model-supplied
 * values must not reach the query.
 *
 * @param configuredLimit limit taken from the tool context, if any
 * @param requestedLimit limit supplied by the tool call, if any
 * @returns a safe integer limit
 */
function resolveLimit(
  configuredLimit: number | null | undefined,
  requestedLimit: number | null | undefined
): number {
  const candidate = configuredLimit || requestedLimit || DEFAULT_LIMIT
  if (candidate < 1) return DEFAULT_LIMIT
  return Math.min(Math.floor(candidate), MAX_LIMIT)
}

/**
 * Creates the `deep_search_messages` agent tool.
 *
 * The tool forwards the keywords, the effective time filter, the resolved limit
 * and the optional sender id to `workerManager.deepSearchMessages` and returns
 * the result both as JSON text and as structured `details`.
 *
 * @param context tool context providing the session id, the context time filter,
 *   the context-level message limit and the locale
 * @returns the tool definition
 */
export function createTool(context: ToolContext): AgentTool {
  return {
    name: 'deep_search_messages',
    label: 'deep_search_messages',
    description: 'ai.tools.deep_search_messages.desc',
    parameters: schema,
    execute: async (_toolCallId, params) => {
      const { sessionId, timeFilter: contextTimeFilter, maxMessagesLimit, locale } = context
      const limit = resolveLimit(maxMessagesLimit, params.limit)
      // An empty keyword would become LIKE '%%' and match every message, hiding
      // the real keywords it is OR-ed with. Whitespace is kept as-is because it
      // is a legitimate substring to search for.
      const keywords = params.keywords.filter((keyword: string) => keyword.length > 0)
      const effectiveTimeFilter = parseExtendedTimeParams(params, contextTimeFilter)

      const result = await workerManager.deepSearchMessages(
        sessionId,
        keywords,
        effectiveTimeFilter,
        limit,
        0,
        params.sender_id
      )

      const data = {
        total: result.total,
        returned: result.messages.length,
        timeRange: formatTimeRange(effectiveTimeFilter, locale),
        rawMessages: result.messages,
      }

      return {
        content: [{ type: 'text', text: JSON.stringify(data) }],
        details: data,
      }
    },
  }
}
a/electron/main/ai/tools/definitions/index.ts b/electron/main/ai/tools/definitions/index.ts index 2d710f2..858e3dc 100644 --- a/electron/main/ai/tools/definitions/index.ts +++ b/electron/main/ai/tools/definitions/index.ts @@ -9,6 +9,7 @@ import type { ToolRegistryEntry } from '../types' import { createTool as createGetChatOverview } from './get-chat-overview' import { createTool as createSearchMessages } from './search-messages' +import { createTool as createDeepSearchMessages } from './deep-search-messages' import { createTool as createGetRecentMessages } from './get-recent-messages' import { createTool as createGetMessageContext } from './get-message-context' import { createTool as createSearchSessions } from './search-sessions' @@ -30,6 +31,7 @@ export const TOOL_REGISTRY: ToolRegistryEntry[] = [ // ==================== Core 工具(始终加载) ==================== { name: 'get_chat_overview', factory: createGetChatOverview, category: 'core' }, { name: 'search_messages', factory: createSearchMessages, category: 'core' }, + { name: 'deep_search_messages', factory: createDeepSearchMessages, category: 'core' }, { name: 'get_recent_messages', factory: createGetRecentMessages, category: 'core' }, { name: 'get_message_context', factory: createGetMessageContext, category: 'core' }, { name: 'search_sessions', factory: createSearchSessions, category: 'core' }, diff --git a/electron/main/database/core.ts b/electron/main/database/core.ts index 9b2af01..dcda1b3 100644 --- a/electron/main/database/core.ts +++ b/electron/main/database/core.ts @@ -120,6 +120,14 @@ function createDatabase(sessionId: string): Database.Database { CREATE INDEX IF NOT EXISTS idx_context_session ON message_context(session_id); `) + db.exec(` + CREATE VIRTUAL TABLE IF NOT EXISTS message_fts USING fts5( + content, + content='', + content_rowid=id + ) + `) + return db } diff --git a/electron/main/database/migrations.ts b/electron/main/database/migrations.ts index 8653a12..6ebd31e 100644 --- 
{
  version: 4,
  descriptionKey: 'database.migrationV4Desc',
  userMessageKey: 'database.migrationV4Message',
  /**
   * Creates the contentless FTS5 table `message_fts` and fills it with the
   * pre-tokenized content of every text message (type = 0, non-empty content).
   *
   * Does nothing when the table already exists. Because of that early return,
   * table creation and population run in one transaction: a failure part-way
   * rolls the table back, so a later run rebuilds it instead of skipping a
   * half-built index.
   */
  up: (db) => {
    const hasTable = db
      .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='message_fts'")
      .get()
    if (hasTable) return

    const BATCH_SIZE = 5000

    const buildIndex = db.transaction(() => {
      db.exec(`
        CREATE VIRTUAL TABLE IF NOT EXISTS message_fts USING fts5(
          content,
          content='',
          content_rowid=id
        )
      `)

      // Prepared once and reused for every batch. Paging by "id > last seen id"
      // avoids re-scanning all skipped rows, which LIMIT/OFFSET does per batch.
      const selectBatch = db.prepare(
        `SELECT id, content FROM message
         WHERE type = 0 AND content IS NOT NULL AND content != '' AND id > ?
         ORDER BY id ASC LIMIT ?`
      )
      const insertFts = db.prepare('INSERT INTO message_fts(rowid, content) VALUES (?, ?)')

      // Start below any possible id so the first batch begins at the smallest one.
      let lastId = Number.MIN_SAFE_INTEGER
      for (;;) {
        const rows = selectBatch.all(lastId, BATCH_SIZE) as Array<{ id: number; content: string }>
        if (rows.length === 0) break

        for (const row of rows) {
          const tokens = tokenizeForFts(row.content)
          // Content that yields no tokens is not indexed.
          if (tokens) {
            insertFts.run(row.id, tokens)
          }
        }

        lastId = rows[rows.length - 1].id
      }
    })

    buildIndex()
  },
},
2024', + month: 'Filter messages by month (1-12), use with year', + day: 'Filter messages by day (1-31), use with year and month', + hour: 'Filter messages by hour (0-23), use with year, month, and day', + start_time: 'Start time, format "YYYY-MM-DD HH:mm". Overrides year/month/day/hour when specified', + end_time: 'End time, format "YYYY-MM-DD HH:mm". Overrides year/month/day/hour when specified', + }, + }, get_recent_messages: { desc: 'Get chat messages within a specified time period. Suitable for overview questions like "what has everyone been chatting about recently" or "what was discussed in month X". Supports minute-level time queries.', params: { diff --git a/electron/main/i18n/locales/ja-JP.ts b/electron/main/i18n/locales/ja-JP.ts index 97dc5bb..0b2224f 100644 --- a/electron/main/i18n/locales/ja-JP.ts +++ b/electron/main/i18n/locales/ja-JP.ts @@ -42,6 +42,8 @@ export default { migrationV3Desc: 'セッションインデックス関連テーブル(chat_session、message_context)と session_gap_threshold フィールドを追加', migrationV3Message: 'セッションのタイムライン表示と AI 拡張分析に対応', + migrationV4Desc: 'FTS5 全文検索インデックス(message_fts)を作成しインデックスデータを構築', + migrationV4Message: '全文検索に対応し、キーワード検索速度が大幅に向上', integrityError: 'データベース構造が不完全です:meta テーブルがありません。このデータベースファイルを削除して再インポートすることをお勧めします。', checkFailed: 'データベースチェックに失敗しました: {{error}}', @@ -73,6 +75,20 @@ export default { '終了時刻。形式 "YYYY-MM-DD HH:mm"(例:"2024-03-15 18:30")。指定すると year/month/day/hour パラメータを上書きする', }, }, + deep_search_messages: { + desc: '完全部分文字列マッチでチャット履歴を検索する。速度は遅いが、キーワードを含むメッセージを漏らさない。通常の検索(search_messages)の結果が不十分な場合や、部分的な単語・単一文字を検索する場合に使用する。', + params: { + keywords: '検索キーワードリスト。部分文字列マッチ(LIKE)を使用し、いずれかのキーワードにマッチしたメッセージを返す', + sender_id: '送信者のメンバー ID。特定メンバーの送信メッセージをフィルタリングする', + limit: '返却メッセージ数の上限。デフォルト 1000、最大 50000', + year: '指定年のメッセージをフィルタリング(例:2024)', + month: '指定月のメッセージをフィルタリング(1-12)。year と併用する必要がある', + day: '指定日のメッセージをフィルタリング(1-31)。year と month と併用する必要がある', + hour: '指定時間のメッセージをフィルタリング(0-23)。year、month、day と併用する必要がある', + start_time: '開始時刻。形式 "YYYY-MM-DD 
HH:mm"。指定すると year/month/day/hour パラメータを上書きする', + end_time: '終了時刻。形式 "YYYY-MM-DD HH:mm"。指定すると year/month/day/hour パラメータを上書きする', + }, + }, get_recent_messages: { desc: '指定期間内のグループチャットメッセージを取得する。「最近みんな何を話していた?」「X月にグループで何が話題だった?」などの概要的な質問に適している。分単位の精度で時間クエリをサポートする。', params: { diff --git a/electron/main/i18n/locales/zh-CN.ts b/electron/main/i18n/locales/zh-CN.ts index e2f4690..e936c65 100644 --- a/electron/main/i18n/locales/zh-CN.ts +++ b/electron/main/i18n/locales/zh-CN.ts @@ -41,6 +41,8 @@ export default { migrationV2Message: '支持成员角色、消息回复关系和回复内容预览', migrationV3Desc: '添加会话索引相关表(chat_session、message_context)和 session_gap_threshold 字段', migrationV3Message: '支持会话时间轴浏览和 AI 增强分析功能', + migrationV4Desc: '创建 FTS5 全文搜索索引(message_fts)并构建索引数据', + migrationV4Message: '支持全文搜索,大幅提升关键词搜索速度', integrityError: '数据库结构不完整:缺少 meta 表。建议删除此数据库文件后重新导入。', checkFailed: '数据库检查失败: {{error}}', }, @@ -67,6 +69,20 @@ export default { end_time: '结束时间,格式 "YYYY-MM-DD HH:mm",如 "2024-03-15 18:30"。指定后会覆盖 year/month/day/hour 参数', }, }, + deep_search_messages: { + desc: '精确子串匹配搜索聊天记录,速度较慢但不会遗漏任何包含关键词的消息。当普通搜索(search_messages)结果不足、需要搜索部分词或单个字符时使用。', + params: { + keywords: '搜索关键词列表,使用子串匹配(LIKE),任一关键词匹配即返回', + sender_id: '发送者的成员 ID,用于筛选特定成员发送的消息', + limit: '返回消息数量限制,默认 1000,最大 50000', + year: '筛选指定年份的消息,如 2024', + month: '筛选指定月份的消息(1-12),需要配合 year 使用', + day: '筛选指定日期的消息(1-31),需要配合 year 和 month 使用', + hour: '筛选指定小时的消息(0-23),需要配合 year、month 和 day 使用', + start_time: '开始时间,格式 "YYYY-MM-DD HH:mm"。指定后会覆盖 year/month/day/hour 参数', + end_time: '结束时间,格式 "YYYY-MM-DD HH:mm"。指定后会覆盖 year/month/day/hour 参数', + }, + }, get_recent_messages: { desc: '获取指定时间段内的群聊消息。适用于回答"最近大家聊了什么"、"X月群里聊了什么"等概览性问题。支持精确到分钟级别的时间查询。', params: { diff --git a/electron/main/i18n/locales/zh-TW.ts b/electron/main/i18n/locales/zh-TW.ts index 6146c9a..aa971e7 100644 --- a/electron/main/i18n/locales/zh-TW.ts +++ b/electron/main/i18n/locales/zh-TW.ts @@ -41,6 +41,8 @@ export default { migrationV2Message: '支援成員角色、訊息回覆關係和回覆內容預覽', migrationV3Desc: 
/**
 * Tokenizer dedicated to the FTS5 index.
 *
 * Unlike the NLP word-frequency analysis, FTS tokenization applies no
 * part-of-speech or stop-word filtering: search needs every token to keep
 * recall high.
 *
 * Text is segmented with jieba. When jieba cannot be loaded or fails, both the
 * indexing path and the query path fall back to splitting on whitespace, so the
 * index and the queries stay consistent with each other.
 */

interface JiebaInstance {
  cut: (text: string, hmm?: boolean) => string[]
}

let jiebaInstance: JiebaInstance | null = null

/**
 * Set after the first failed load so that later calls neither retry `require`
 * nor log the same error again (indexing calls the tokenizer once per message).
 */
let jiebaLoadFailed = false

/**
 * Returns the shared jieba instance, creating it on first use.
 *
 * @throws Error when the jieba module cannot be loaded; the failure is
 *   remembered and logged only once
 */
function getJieba(): JiebaInstance {
  if (jiebaInstance) return jiebaInstance
  if (jiebaLoadFailed) throw new Error('jieba 模块加载失败')

  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const { Jieba } = require('@node-rs/jieba')
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const { dict } = require('@node-rs/jieba/dict')
    const instance: JiebaInstance = Jieba.withDict(dict)
    jiebaInstance = instance
    return instance
  } catch (error) {
    jiebaLoadFailed = true
    console.error('[FTS] Failed to load jieba module:', error)
    throw new Error('jieba 模块加载失败')
  }
}

/** Trims and lower-cases raw tokens and drops the ones that end up empty. */
function normalizeTokens(rawTokens: readonly string[]): string[] {
  return rawTokens.map((t) => t.trim().toLowerCase()).filter((t) => t.length > 0)
}

/** Fallback segmentation used when jieba is unavailable: split on whitespace. */
function fallbackTokenize(text: string): string[] {
  return normalizeTokens(text.split(/\s+/))
}

/**
 * Segments text into normalized tokens, using jieba when possible and the
 * whitespace fallback when loading or cutting throws.
 */
function segment(text: string): string[] {
  try {
    return normalizeTokens(getJieba().cut(text, false))
  } catch {
    return fallbackTokenize(text)
  }
}

/**
 * Tokenizes text for storage in the FTS5 index.
 *
 * @param text message content; null, undefined and blank text are accepted
 * @returns the lower-cased tokens joined by single spaces, or '' when there is
 *   nothing to index
 */
export function tokenizeForFts(text: string | null | undefined): string {
  if (!text || text.trim().length === 0) return ''
  return segment(text).join(' ')
}

/**
 * Makes a single token safe inside an FTS5 MATCH expression by wrapping it in
 * double quotes and doubling any double quote it contains.
 */
function escapeToken(token: string): string {
  return `"${token.replace(/"/g, '""')}"`
}

/**
 * Converts a list of user keywords into an FTS5 MATCH expression.
 *
 * Semantics:
 * - each keyword is segmented; its tokens are combined with AND (all must occur)
 * - different keywords are combined with OR (any may match)
 *
 * Examples:
 * - ["今天开心"]          -> '"今天" "开心"'
 * - ["今天开心", "难过"]  -> '("今天" "开心") OR "难过"'
 * - ["hello"]             -> '"hello"'
 *
 * @param keywords raw keywords; blank entries are ignored
 * @returns the MATCH expression, or '' when no keyword yields a token
 */
export function tokenizeQueryForFts(keywords: string[]): string {
  const groups: Array<{ expr: string; tokenCount: number }> = []

  for (const keyword of keywords) {
    const trimmed = keyword.trim()
    if (!trimmed) continue

    const tokens = segment(trimmed)
    if (tokens.length === 0) continue

    groups.push({ expr: tokens.map(escapeToken).join(' '), tokenCount: tokens.length })
  }

  if (groups.length === 0) return ''
  if (groups.length === 1) return groups[0].expr

  // Parenthesize multi-token groups so their implicit AND stays grouped under OR.
  return groups.map((g) => (g.tokenCount > 1 ? `(${g.expr})` : g.expr)).join(' OR ')
}
/**
 * FTS5 full-text index management.
 *
 * A contentless FTS5 virtual table provides an inverted index over
 * message.content. Text is tokenized in the application layer
 * (see ../../nlp/ftsTokenizer) and stored as space-separated tokens.
 *
 * Delete strategy: FTS entries are not deleted in step with messages; stale
 * entries are expected to be filtered out by joining against `message`.
 * rebuildFtsIndex() clears them explicitly.
 */

import Database from 'better-sqlite3'
import { getDbPath, openDatabase } from '../core'
import { tokenizeForFts, tokenizeQueryForFts } from '../../nlp/ftsTokenizer'

const BATCH_SIZE = 5000

const CREATE_FTS_TABLE_SQL = `
  CREATE VIRTUAL TABLE IF NOT EXISTS message_fts USING fts5(
    content,
    content='',
    content_rowid=id
  )
`

const FTS_TABLE_EXISTS_SQL = "SELECT name FROM sqlite_master WHERE type='table' AND name='message_fts'"

/**
 * Opens a dedicated writable connection (not taken from the cached pool) and
 * switches it to WAL journaling.
 *
 * @returns the connection, or null when opening or configuring it fails
 */
function openWritableDb(sessionId: string): Database.Database | null {
  const dbPath = getDbPath(sessionId)
  let db: Database.Database | null = null
  try {
    db = new Database(dbPath)
    db.pragma('journal_mode = WAL')
    return db
  } catch (error) {
    console.error('[FTS] Failed to open writable database:', error)
    // The handle may already be open when the pragma fails; do not leak it.
    if (db) db.close()
    return null
  }
}

/** Reports whether the `message_fts` table exists on the given connection. */
function ftsTableExists(db: Database.Database): boolean {
  return !!db.prepare(FTS_TABLE_EXISTS_SQL).get()
}

/**
 * Indexes every text message (type = 0, non-empty content) into `message_fts`,
 * one transaction per batch of BATCH_SIZE rows.
 *
 * Batches are paged by "id > last seen id" instead of OFFSET, so earlier rows
 * are not re-scanned for each batch.
 *
 * @returns the number of entries actually written; content that yields no
 *   tokens is skipped and not counted
 */
function populateFtsIndex(db: Database.Database): number {
  const selectBatch = db.prepare(
    `SELECT id, content FROM message
     WHERE type = 0 AND content IS NOT NULL AND content != '' AND id > ?
     ORDER BY id ASC
     LIMIT ?`
  )
  const insertFts = db.prepare('INSERT INTO message_fts(rowid, content) VALUES (?, ?)')

  const insertBatch = db.transaction((rows: Array<{ id: number; content: string }>): number => {
    let inserted = 0
    for (const row of rows) {
      const tokens = tokenizeForFts(row.content)
      if (tokens) {
        insertFts.run(row.id, tokens)
        inserted += 1
      }
    }
    return inserted
  })

  let indexed = 0
  // Start below any possible id so the first batch begins at the smallest one.
  let lastId = Number.MIN_SAFE_INTEGER

  for (;;) {
    const rows = selectBatch.all(lastId, BATCH_SIZE) as Array<{ id: number; content: string }>
    if (rows.length === 0) break

    indexed += insertBatch(rows)
    lastId = rows[rows.length - 1].id
  }

  return indexed
}

/**
 * Checks whether the session database already has an FTS index table.
 *
 * @returns false when the database cannot be opened or the lookup fails
 */
export function hasFtsIndex(sessionId: string): boolean {
  const db = openDatabase(sessionId)
  if (!db) return false
  try {
    const row = db.prepare(FTS_TABLE_EXISTS_SQL).get() as { name: string } | undefined
    return !!row
  } catch {
    return false
  }
}

/** Creates the FTS virtual table if it does not exist yet. */
export function createFtsTable(db: Database.Database): void {
  db.exec(CREATE_FTS_TABLE_SQL)
}

/**
 * Builds the FTS index in batches; meant for migration or for the full build
 * after a first import.
 *
 * NOTE(review): entries are inserted without checking for existing rowids, so
 * this is presumably meant for an index with no entries yet — use
 * rebuildFtsIndex() to refresh a populated one.
 *
 * @returns `{ indexed }`, the number of entries written; `{ indexed: 0 }` when
 *   the database cannot be opened
 * @throws any database error raised while creating or filling the table
 */
export function buildFtsIndex(sessionId: string): { indexed: number } {
  const db = openWritableDb(sessionId)
  if (!db) return { indexed: 0 }

  try {
    createFtsTable(db)
    return { indexed: populateFtsIndex(db) }
  } finally {
    db.close()
  }
}

/**
 * Rebuilds the FTS index from scratch: drops the table, recreates it and
 * indexes all messages again, all on one connection. Used to clear stale
 * entries (for example after a member was deleted) or to repair the index.
 *
 * Never throws: failures are logged and reported as `{ indexed: 0 }`.
 */
export function rebuildFtsIndex(sessionId: string): { indexed: number } {
  const db = openWritableDb(sessionId)
  if (!db) return { indexed: 0 }

  try {
    db.exec('DROP TABLE IF EXISTS message_fts')
    createFtsTable(db)
    return { indexed: populateFtsIndex(db) }
  } catch (error) {
    console.error('[FTS] Rebuild failed:', error)
    return { indexed: 0 }
  } finally {
    db.close()
  }
}

/**
 * Writes FTS entries for the given messages in one transaction; intended for
 * keeping the index in step during incremental import.
 *
 * Does nothing when the database cannot be opened or has no FTS table.
 * Entries without content, or whose content yields no tokens, are skipped.
 */
export function insertFtsEntries(
  sessionId: string,
  entries: Array<{ id: number; content: string | null }>
): void {
  const db = openWritableDb(sessionId)
  if (!db) return

  try {
    if (!ftsTableExists(db)) return

    const insertFts = db.prepare('INSERT INTO message_fts(rowid, content) VALUES (?, ?)')

    const batchInsert = db.transaction(() => {
      for (const entry of entries) {
        if (!entry.content) continue
        const tokens = tokenizeForFts(entry.content)
        if (tokens) {
          insertFts.run(entry.id, tokens)
        }
      }
    })
    batchInsert()
  } finally {
    db.close()
  }
}

/**
 * Searches the FTS index and returns the rowids of matching entries ordered by
 * rank, together with the total number of matches.
 *
 * @param keywords raw keywords, converted with tokenizeQueryForFts
 * @param limit maximum number of rowids to return
 * @param offset number of ranked matches to skip
 * @returns empty result when there are no keywords, the database cannot be
 *   opened, the keywords yield no query, or the query fails (logged)
 */
export function searchByFts(
  sessionId: string,
  keywords: string[],
  limit: number = 1000,
  offset: number = 0
): { rowids: number[]; total: number } {
  if (keywords.length === 0) return { rowids: [], total: 0 }

  const db = openDatabase(sessionId)
  if (!db) return { rowids: [], total: 0 }

  const matchQuery = tokenizeQueryForFts(keywords)
  if (!matchQuery) return { rowids: [], total: 0 }

  try {
    const countRow = db
      .prepare('SELECT COUNT(*) as total FROM message_fts WHERE content MATCH ?')
      .get(matchQuery) as { total: number }

    const rows = db
      .prepare(`SELECT rowid FROM message_fts WHERE content MATCH ? ORDER BY rank LIMIT ? OFFSET ?`)
      .all(matchQuery, limit, offset) as Array<{ rowid: number }>

    return {
      rowids: rows.map((r) => r.rowid),
      total: countRow.total,
    }
  } catch (error) {
    console.error('[FTS] Search failed, query:', matchQuery, error)
    return { rowids: [], total: 0 }
  }
}
/**
 * FTS5 search path: finds messages whose id is matched by the FTS index,
 * applies the optional time and sender filters, and returns one page ordered by
 * timestamp descending together with the total match count.
 *
 * @param db open database handle
 * @param _sessionId unused; kept so existing call sites keep working
 * @param matchQuery FTS5 MATCH expression
 * @param filter optional time filter
 * @param limit page size
 * @param offset number of rows to skip
 * @param senderId optional sender filter
 * @param fallbackKeywords original keywords, used for the LIKE search if the
 *   FTS query fails. When omitted there is nothing to search with, so a failure
 *   yields an empty result rather than a LIKE search with no keyword condition,
 *   which would return every message as if it had matched.
 * @returns matching messages and their total count
 */
function searchMessagesWithFts(
  db: ReturnType<typeof openDatabase> & object,
  _sessionId: string,
  matchQuery: string,
  filter?: TimeFilter,
  limit: number = 20,
  offset: number = 0,
  senderId?: number,
  fallbackKeywords: string[] = []
): MessagesWithTotal {
  const { clause: timeClause, params: timeParams } = buildTimeFilter(filter, 'msg')
  // buildTimeFilter yields a WHERE clause; here it is appended to an existing one.
  const timeCondition = timeClause ? timeClause.replace('WHERE', 'AND') : ''
  const { condition: senderCondition, params: senderParams } = buildSenderCondition(senderId)

  try {
    const countSql = `
      SELECT COUNT(*) as total
      FROM message msg
      JOIN member m ON msg.sender_id = m.id
      WHERE msg.id IN (SELECT rowid FROM message_fts WHERE content MATCH ?)
      ${timeCondition}
      ${senderCondition}
    `
    const totalRow = db.prepare(countSql).get(matchQuery, ...timeParams, ...senderParams) as { total: number }
    const total = totalRow?.total || 0

    const sql = `
      SELECT
        msg.id,
        m.id as senderId,
        COALESCE(m.group_nickname, m.account_name, m.platform_id) as senderName,
        m.platform_id as senderPlatformId,
        m.aliases,
        m.avatar,
        msg.content,
        msg.ts as timestamp,
        msg.type,
        msg.reply_to_message_id,
        reply_msg.content as replyToContent,
        COALESCE(reply_m.group_nickname, reply_m.account_name, reply_m.platform_id) as replyToSenderName
      FROM message msg
      JOIN member m ON msg.sender_id = m.id
      LEFT JOIN message reply_msg ON msg.reply_to_message_id = reply_msg.platform_message_id
      LEFT JOIN member reply_m ON reply_msg.sender_id = reply_m.id
      WHERE msg.id IN (SELECT rowid FROM message_fts WHERE content MATCH ?)
      ${timeCondition}
      ${senderCondition}
      ORDER BY msg.ts DESC
      LIMIT ? OFFSET ?
    `

    const rows = db
      .prepare(sql)
      .all(matchQuery, ...timeParams, ...senderParams, limit, offset) as DbMessageRow[]

    return {
      messages: rows.map(sanitizeMessageRow),
      total,
    }
  } catch (error) {
    if (fallbackKeywords.length === 0) {
      console.error('[FTS] searchMessages FTS path failed, no keywords available for LIKE fallback:', error)
      return { messages: [], total: 0 }
    }
    console.error('[FTS] searchMessages FTS path failed, falling back to LIKE:', error)
    return searchMessagesWithLike(db, fallbackKeywords, filter, limit, offset, senderId)
  }
}
/**
 * Deep message search using LIKE substring matching.
 *
 * Always goes through the LIKE path and never consults the FTS5 index, so it is
 * slower than the regular search but matches any substring.
 *
 * @param sessionId session whose database is searched
 * @param keywords substrings to look for (any may match)
 * @param filter optional time filter
 * @param limit page size
 * @param offset number of rows to skip
 * @param senderId optional sender filter
 * @returns matching messages and their total count; an empty result when the
 *   database cannot be opened
 */
export function deepSearchMessages(
  sessionId: string,
  keywords: string[],
  filter?: TimeFilter,
  limit: number = 20,
  offset: number = 0,
  senderId?: number
): MessagesWithTotal {
  ensureAvatarColumn(sessionId)

  const database = openDatabase(sessionId)
  return database
    ? searchMessagesWithLike(database, keywords, filter, limit, offset, senderId)
    : { messages: [], total: 0 }
}
/**
 * Deep message search (LIKE substring matching; slower, but matches any
 * substring).
 *
 * Packs the arguments into a single payload and sends it to the worker as a
 * `deepSearchMessages` request.
 *
 * @param sessionId session to search
 * @param keywords substrings to look for
 * @param filter optional time filter, passed through unchanged
 * @param limit optional page size
 * @param offset optional number of rows to skip
 * @param senderId optional sender filter
 * @returns the matching messages and their total count
 */
export async function deepSearchMessages(
  sessionId: string,
  keywords: string[],
  filter?: any,
  limit?: number,
  offset?: number,
  senderId?: number
): Promise<{ messages: SearchMessageResult[]; total: number }> {
  const payload = { sessionId, keywords, filter, limit, offset, senderId }
  return sendToWorker('deepSearchMessages', payload)
}
messages", + "deep_search_messages": "Deep search chat messages", "get_recent_messages": "Get recent messages", "get_message_context": "Get message context", "get_conversation_between": "Get conversation between two members", diff --git a/src/i18n/locales/ja-JP/ai.json b/src/i18n/locales/ja-JP/ai.json index eb4798a..cf23fd6 100644 --- a/src/i18n/locales/ja-JP/ai.json +++ b/src/i18n/locales/ja-JP/ai.json @@ -49,6 +49,7 @@ "tools": { "get_chat_overview": "チャット概要を取得", "search_messages": "チャット履歴を検索", + "deep_search_messages": "チャット履歴を深層検索", "get_recent_messages": "最近のメッセージを取得", "get_member_stats": "メンバー統計を取得", "get_time_stats": "時間分布を取得", @@ -310,6 +311,7 @@ "builtinToolDesc": { "get_chat_overview": "チャット概要情報を取得", "search_messages": "チャットメッセージを検索", + "deep_search_messages": "チャットメッセージを深層検索", "get_recent_messages": "最近のメッセージを取得", "get_message_context": "メッセージのコンテキストを取得", "get_conversation_between": "2人のやり取りを取得", diff --git a/src/i18n/locales/zh-CN/ai.json b/src/i18n/locales/zh-CN/ai.json index 3138fcc..97c2922 100644 --- a/src/i18n/locales/zh-CN/ai.json +++ b/src/i18n/locales/zh-CN/ai.json @@ -49,6 +49,7 @@ "tools": { "get_chat_overview": "获取聊天概览", "search_messages": "搜索聊天记录", + "deep_search_messages": "深度搜索聊天记录", "get_recent_messages": "获取最近消息", "get_member_stats": "获取成员统计", "get_time_stats": "获取时间分布", @@ -310,6 +311,7 @@ "builtinToolDesc": { "get_chat_overview": "获取聊天概览信息", "search_messages": "搜索聊天消息", + "deep_search_messages": "深度搜索聊天消息", "get_recent_messages": "获取最近消息", "get_message_context": "获取消息上下文", "get_conversation_between": "获取两人对话", diff --git a/src/i18n/locales/zh-TW/ai.json b/src/i18n/locales/zh-TW/ai.json index 39ad636..e598061 100644 --- a/src/i18n/locales/zh-TW/ai.json +++ b/src/i18n/locales/zh-TW/ai.json @@ -49,6 +49,7 @@ "tools": { "get_chat_overview": "取得聊天概覽", "search_messages": "搜尋聊天紀錄", + "deep_search_messages": "深度搜尋聊天紀錄", "get_recent_messages": "取得最近訊息", "get_member_stats": "取得成員統計", "get_time_stats": "取得時間分佈", @@ -310,6 +311,7 @@ 
"builtinToolDesc": { "get_chat_overview": "取得聊天概覽資訊", "search_messages": "搜尋聊天訊息", + "deep_search_messages": "深度搜尋聊天訊息", "get_recent_messages": "取得最近訊息", "get_message_context": "取得訊息上下文", "get_conversation_between": "取得兩人對話",