feat: 引入分词能力,并新增词云子Tab

This commit is contained in:
digua
2026-01-28 00:24:44 +08:00
committed by digua
parent 923c424efe
commit 036141bcb0
28 changed files with 1914 additions and 69 deletions
+74
View File
@@ -0,0 +1,74 @@
/**
* NLP 功能 IPC 处理器
* 提供词频统计、分词等 NLP 功能
*/
import { ipcMain } from 'electron'
import * as worker from '../worker/workerManager'
import type { IpcContext } from './types'
import type { WordFrequencyParams, WordFrequencyResult, SupportedLocale, PosTagInfo } from '../nlp'
/**
 * Registers the NLP-related IPC handlers on the main process.
 *
 * Each handler forwards its arguments to the worker through `worker.query`
 * and, if the query rejects, logs the error and resolves with an empty
 * fallback value instead of propagating the failure to the renderer.
 *
 * @param _ctx IPC context; not used by these handlers.
 */
export function registerNlpHandlers(_ctx: IpcContext): void {
  // Word-frequency statistics (used by the word cloud view).
  ipcMain.handle(
    'nlp:getWordFrequency',
    async (_event, params: WordFrequencyParams): Promise<WordFrequencyResult> => {
      try {
        return await worker.query<WordFrequencyResult>('getWordFrequency', params)
      } catch (err) {
        console.error('[NLP] 获取词频统计失败:', err)
        const emptyResult: WordFrequencyResult = {
          words: [],
          totalWords: 0,
          totalMessages: 0,
          uniqueWords: 0,
        }
        return emptyResult
      }
    }
  )

  // Segmentation of a single text (debugging and ad-hoc use).
  ipcMain.handle(
    'nlp:segmentText',
    async (_event, text: string, locale: SupportedLocale, minLength?: number): Promise<string[]> => {
      const payload = { text, locale, minLength }
      try {
        return await worker.query<string[]>('segmentText', payload)
      } catch (err) {
        console.error('[NLP] 分词失败:', err)
        return []
      }
    }
  )

  // Part-of-speech tag definitions.
  ipcMain.handle('nlp:getPosTags', async (): Promise<PosTagInfo[]> => {
    try {
      return await worker.query<PosTagInfo[]>('getPosTags', {})
    } catch (err) {
      console.error('[NLP] 获取词性标签失败:', err)
      return []
    }
  })
}
+2
View File
@@ -13,6 +13,7 @@ import { registerAIHandlers } from './ipc/ai'
import { registerMessagesHandlers } from './ipc/messages'
import { registerCacheHandlers } from './ipc/cache'
import { registerNetworkHandlers } from './ipc/network'
import { registerNlpHandlers } from './ipc/nlp'
import { registerAnalyticsHandlers } from './analytics'
// 导入 Worker 模块(用于异步分析查询和流式导入)
import * as worker from './worker/workerManager'
@@ -45,6 +46,7 @@ const mainIpcMain = (win: BrowserWindow) => {
registerMessagesHandlers(context)
registerCacheHandlers(context)
registerNetworkHandlers(context)
registerNlpHandlers(context)
registerAnalyticsHandlers()
console.log('[IpcMain] All IPC handlers registered successfully')
+7
View File
@@ -0,0 +1,7 @@
/**
* NLP 模块统一导出
*/
export * from './types'
export * from './stopwords'
export * from './segmenter'
+338
View File
@@ -0,0 +1,338 @@
/**
* 分词器模块
* 中文使用 @node-rs/jieba,其他语言使用 Intl.Segmenter
*/
import type { SupportedLocale, PosFilterMode, PosTagInfo } from './types'
import { isStopword } from './stopwords'
/**
 * Shape of the jieba instance as used by this module.
 * Only the two methods called in this file are declared.
 */
interface JiebaInstance {
/** Splits a text into a list of words. */
cut: (text: string, hmm?: boolean) => string[]
/** Splits a text into words, each paired with a part-of-speech tag. */
tag: (text: string) => Array<{ tag: string; word: string }>
}
// Cached jieba instance; stays null until the first successful load.
let jiebaInstance: JiebaInstance | null = null

/**
 * Returns the shared jieba instance, loading the native module and its
 * dictionary on first use.
 *
 * @throws Error when the module or dictionary cannot be loaded.
 */
function getJieba(): JiebaInstance {
  if (jiebaInstance) {
    return jiebaInstance
  }

  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const jiebaModule = require('@node-rs/jieba')
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const dictModule = require('@node-rs/jieba/dict')
    const loaded: JiebaInstance = jiebaModule.Jieba.withDict(dictModule.dict)
    jiebaInstance = loaded
    console.log('[NLP] jieba 模块加载成功')
    return loaded
  } catch (loadError) {
    console.error('[NLP] jieba 模块加载失败:', loadError)
    throw new Error('jieba 模块加载失败')
  }
}
/**
 * Part-of-speech tag definitions.
 * Each entry gives the tag, its display name and description, and whether
 * words carrying that tag are treated as "meaningful" by the
 * 'meaningful' filter mode.
 */
export const POS_TAG_DEFINITIONS: PosTagInfo[] = [
// Nouns
{ tag: 'n', name: '名词', description: '普通名词', meaningful: true },
{ tag: 'nr', name: '人名', description: '人名', meaningful: true },
{ tag: 'ns', name: '地名', description: '地名', meaningful: true },
{ tag: 'nt', name: '机构名', description: '机构团体名', meaningful: true },
{ tag: 'nz', name: '其他专名', description: '其他专有名词', meaningful: true },
{ tag: 'nw', name: '作品名', description: '作品名', meaningful: true },
// Verbs
{ tag: 'v', name: '动词', description: '普通动词', meaningful: true },
{ tag: 'vn', name: '动名词', description: '动名词', meaningful: true },
{ tag: 'vd', name: '副动词', description: '副动词', meaningful: true },
{ tag: 'vg', name: '动语素', description: '动词性语素', meaningful: true },
// Adjectives
{ tag: 'a', name: '形容词', description: '普通形容词', meaningful: true },
{ tag: 'an', name: '名形词', description: '名形词', meaningful: true },
{ tag: 'ad', name: '副形词', description: '副形词', meaningful: true },
{ tag: 'ag', name: '形语素', description: '形容词性语素', meaningful: true },
// Other meaningful categories
{ tag: 'i', name: '成语', description: '成语', meaningful: true },
{ tag: 'l', name: '习用语', description: '习用语', meaningful: true },
{ tag: 'j', name: '简称', description: '简称略语', meaningful: true },
// Adverbs, prepositions, etc. (usually not very meaningful)
{ tag: 'd', name: '副词', description: '副词', meaningful: false },
{ tag: 'p', name: '介词', description: '介词', meaningful: false },
{ tag: 'c', name: '连词', description: '连词', meaningful: false },
{ tag: 'u', name: '助词', description: '助词', meaningful: false },
{ tag: 'r', name: '代词', description: '代词', meaningful: false },
{ tag: 'm', name: '数词', description: '数词', meaningful: false },
{ tag: 'q', name: '量词', description: '量词', meaningful: false },
{ tag: 'f', name: '方位词', description: '方位词', meaningful: false },
{ tag: 't', name: '时间词', description: '时间词', meaningful: false },
{ tag: 'e', name: '叹词', description: '叹词', meaningful: false },
{ tag: 'y', name: '语气词', description: '语气词', meaningful: false },
{ tag: 'o', name: '拟声词', description: '拟声词', meaningful: false },
{ tag: 'x', name: '非语素字', description: '非语素字', meaningful: false },
{ tag: 'w', name: '标点符号', description: '标点符号', meaningful: false },
]
/**
 * Set of part-of-speech tags whose definition is flagged `meaningful`.
 * Derived from POS_TAG_DEFINITIONS so the two cannot drift apart.
 */
export const MEANINGFUL_POS_TAGS = new Set(
  POS_TAG_DEFINITIONS.flatMap((definition) => (definition.meaningful ? [definition.tag] : []))
)
/**
 * Returns all part-of-speech tag definitions.
 * The returned array is the module-level POS_TAG_DEFINITIONS itself, not a copy.
 */
export function getPosTagDefinitions(): PosTagInfo[] {
return POS_TAG_DEFINITIONS
}
// Regular expressions used for text cleaning and word filtering.
// Matches characters in a fixed list of emoji/symbol code-point ranges.
const EMOJI_REGEX = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]/gu
// Matches ASCII punctuation, common full-width punctuation, and whitespace.
const PUNCTUATION_REGEX = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~,。!?、;:""''()【】《》…—~·\s]/g
// Matches http/https URLs up to the next whitespace character.
const URL_REGEX = /https?:\/\/[^\s]+/g
// Matches "@" followed by a run of characters that are neither whitespace nor "@".
const MENTION_REGEX = /@[^\s@]+/g
// Matches strings consisting only of digits.
const PURE_NUMBER_REGEX = /^\d+$/
/**
 * Normalizes a raw message before segmentation.
 *
 * URLs, @mentions, emoji and punctuation are each replaced with a space
 * (in that order), runs of whitespace are collapsed to a single space, and
 * the result is trimmed.
 *
 * @param text Raw text.
 * @returns The cleaned text; may be an empty string.
 */
function cleanText(text: string): string {
  const withoutUrls = text.replace(URL_REGEX, ' ')
  const withoutMentions = withoutUrls.replace(MENTION_REGEX, ' ')
  const withoutEmoji = withoutMentions.replace(EMOJI_REGEX, ' ')
  const withoutPunctuation = withoutEmoji.replace(PUNCTUATION_REGEX, ' ')
  const collapsed = withoutPunctuation.replace(/\s+/g, ' ')
  return collapsed.trim()
}
/**
 * Decides whether a segmented token should be kept.
 *
 * A token is rejected when it is empty or whitespace-only, consists only of
 * digits, is shorter than `minLength`, or (when stop-word filtering is
 * enabled) is a stop word for `locale`.
 *
 * @param word Candidate token.
 * @param locale Locale used for the stop-word lookup.
 * @param minLength Minimum accepted `word.length`.
 * @param enableStopwords Whether to apply stop-word filtering (default true).
 * @returns true when the token passes every check.
 */
function isValidWord(word: string, locale: SupportedLocale, minLength: number, enableStopwords: boolean = true): boolean {
  const isBlank = !word || word.trim().length === 0
  if (isBlank || PURE_NUMBER_REGEX.test(word) || word.length < minLength) {
    return false
  }
  return !(enableStopwords && isStopword(word, locale))
}
/**
 * Options for Chinese segmentation.
 */
interface ChineseSegmentOptions {
/** Part-of-speech filter mode. */
posFilterMode?: PosFilterMode
/** Custom list of part-of-speech tags to keep (used in 'custom' mode). */
customPosTags?: string[]
}
/**
 * Counts, per part-of-speech tag, how many valid words occur in the texts.
 *
 * Every text is cleaned and tagged with jieba using the Chinese ('zh-CN')
 * validity rules; English has no part-of-speech tagging here. Each valid
 * tagged occurrence increments the counter of its tag.
 *
 * If jieba cannot be loaded or tagging throws, the error is logged and the
 * statistics collected so far (possibly empty) are returned.
 *
 * @param texts Raw message texts.
 * @param minWordLength Minimum word length for a word to be counted (default 2).
 * @param enableStopwords Whether stop words are excluded (default true).
 * @returns Map from part-of-speech tag to number of valid word occurrences.
 */
export function collectPosTagStats(
  texts: string[],
  minWordLength: number = 2,
  enableStopwords: boolean = true
): Map<string, number> {
  const posStats = new Map<string, number>()
  try {
    const jieba = getJieba()
    for (const text of texts) {
      const cleaned = cleanText(text)
      if (!cleaned) continue
      for (const item of jieba.tag(cleaned)) {
        // isValidWord expects (word, locale, minLength, enableStopwords).
        // Locale and minimum length were previously passed in swapped order,
        // which disabled the length check and used the wrong stop-word list.
        if (!isValidWord(item.word, 'zh-CN', minWordLength, enableStopwords)) {
          continue
        }
        posStats.set(item.tag, (posStats.get(item.tag) ?? 0) + 1)
      }
    }
  } catch (error) {
    console.error('[NLP] 收集词性统计失败:', error)
  }
  return posStats
}
/**
 * Segments Chinese text with jieba, optionally filtering by part of speech.
 *
 * - 'all' mode: plain `cut`, no part-of-speech filtering.
 * - 'custom' mode with `customPosTags` supplied: keep words whose tag is in that list.
 * - otherwise: keep words whose tag is in MEANINGFUL_POS_TAGS.
 *
 * If segmentation throws, the error is logged and a plain `cut` is retried;
 * if that fails as well, the cleaned text is split into single characters.
 *
 * @param text Raw text.
 * @param options Segmentation options.
 * @returns Segmented words (not yet length/stop-word filtered).
 */
function segmentChinese(text: string, options: ChineseSegmentOptions = {}): string[] {
  const { posFilterMode = 'meaningful', customPosTags } = options

  const cleaned = cleanText(text)
  if (!cleaned) {
    return []
  }

  try {
    const jieba = getJieba()

    if (posFilterMode === 'all') {
      return jieba.cut(cleaned, false)
    }

    const tagged = jieba.tag(cleaned)
    const useCustomTags = posFilterMode === 'custom' && customPosTags
    const allowedTags: Set<string> = useCustomTags ? new Set(customPosTags) : MEANINGFUL_POS_TAGS

    const keptWords: string[] = []
    for (const { tag, word } of tagged) {
      if (allowedTags.has(tag)) {
        keptWords.push(word)
      }
    }
    return keptWords
  } catch (segmentError) {
    console.error('[NLP] 中文分词失败:', segmentError)
    // Fallback: plain cut, then per-character split as a last resort.
    try {
      return getJieba().cut(cleaned, false)
    } catch {
      return cleaned.split('')
    }
  }
}
/**
 * Segments English text using Intl.Segmenter with word granularity.
 *
 * Only word-like segments are kept, and each is lowercased. If
 * Intl.Segmenter throws, the cleaned text is lowercased and split on
 * whitespace instead.
 *
 * @param text Raw text.
 * @returns Lowercased words (not yet length/stop-word filtered).
 */
function segmentEnglish(text: string): string[] {
  const cleaned = cleanText(text)
  if (!cleaned) {
    return []
  }

  try {
    const wordSegmenter = new Intl.Segmenter('en', { granularity: 'word' })
    const tokens: string[] = []
    for (const piece of wordSegmenter.segment(cleaned)) {
      if (piece.isWordLike) {
        tokens.push(piece.segment.toLowerCase())
      }
    }
    return tokens
  } catch {
    // Fallback: simple whitespace split.
    const lowered = cleaned.toLowerCase()
    return lowered.split(/\s+/).filter((token) => token.length > 0)
  }
}
/**
 * Segmentation options.
 */
export interface SegmentOptions {
/** Minimum word length (optional; defaults to 2 for 'zh-CN' and 3 otherwise). */
minLength?: number
/** Part-of-speech filter mode (only applied to Chinese). */
posFilterMode?: PosFilterMode
/** Custom list of part-of-speech tags to keep. */
customPosTags?: string[]
/** Whether stop-word filtering is enabled. */
enableStopwords?: boolean
}
/**
 * Generic segmentation entry point.
 *
 * Dispatches to the Chinese segmenter for 'zh-CN' and to the English
 * segmenter for every other locale, then drops tokens rejected by
 * isValidWord.
 *
 * @param text Text to segment.
 * @param locale Language of the text.
 * @param options Segmentation options; `minLength` defaults to 2 for
 *   'zh-CN' and 3 otherwise.
 * @returns The filtered list of words.
 */
export function segment(
  text: string,
  locale: SupportedLocale,
  options: SegmentOptions = {}
): string[] {
  const { minLength, posFilterMode = 'meaningful', customPosTags, enableStopwords = true } = options

  const isChinese = locale === 'zh-CN'
  const effectiveMinLength = minLength ?? (isChinese ? 2 : 3)

  const candidates = isChinese
    ? segmentChinese(text, { posFilterMode, customPosTags })
    : segmentEnglish(text)

  // Keep only tokens that pass the validity checks.
  const accepted: string[] = []
  for (const candidate of candidates) {
    if (isValidWord(candidate, locale, effectiveMinLength, enableStopwords)) {
      accepted.push(candidate)
    }
  }
  return accepted
}
/**
 * Options for batch segmentation with frequency counting.
 */
export interface BatchSegmentOptions extends SegmentOptions {
/** Minimum number of occurrences for a word to be kept. */
minCount?: number
/** Maximum number of words to return, taken from the most frequent. */
topN?: number
}
/**
 * Segments a batch of texts and counts how often each word occurs.
 *
 * Words occurring fewer than `minCount` times (default 2) are discarded;
 * the remainder is sorted by descending count and truncated to `topN`
 * entries (default 100).
 *
 * @param texts Texts to segment.
 * @param locale Language of the texts.
 * @param options Segmentation and frequency options.
 * @returns Map from word to count, in descending count order.
 */
export function batchSegmentWithFrequency(
  texts: string[],
  locale: SupportedLocale,
  options: BatchSegmentOptions = {}
): Map<string, number> {
  const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords } = options
  const segmentOptions: SegmentOptions = { minLength, posFilterMode, customPosTags, enableStopwords }

  // Tally every word produced by the segmenter.
  const counts = new Map<string, number>()
  for (const text of texts) {
    for (const word of segment(text, locale, segmentOptions)) {
      counts.set(word, (counts.get(word) ?? 0) + 1)
    }
  }

  // Drop low-frequency words, rank by count, keep the top N.
  const ranked = [...counts]
    .filter(([, count]) => count >= minCount)
    .sort((left, right) => right[1] - left[1])
    .slice(0, topN)

  return new Map(ranked)
}
+119
View File
@@ -0,0 +1,119 @@
/**
* 停用词表
* 用于过滤无意义的高频词
*/
/**
 * Chinese stop words.
 * Some entries are listed under more than one category; duplicates are
 * harmless because the values are stored in a Set.
 */
export const CHINESE_STOPWORDS = new Set([
// Pronouns
'我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '它们',
'自己', '别人', '大家', '谁', '什么', '哪', '哪里', '哪儿', '这', '那',
'这个', '那个', '这些', '那些', '这里', '那里', '这儿', '那儿', '这样', '那样',
// Particles
'的', '地', '得', '了', '着', '过', '吗', '呢', '吧', '啊',
'呀', '哇', '哦', '嗯', '噢', '喔', '呃', '唉', '哎', '嘛',
// Prepositions
'在', '从', '到', '向', '往', '把', '被', '给', '跟', '和',
'与', '对', '比', '为', '因', '由', '以', '按', '用', '让',
// Conjunctions
'和', '与', '或', '或者', '而', '并', '并且', '但', '但是', '可是',
'然而', '不过', '只是', '如果', '要是', '假如', '虽然', '尽管', '即使', '所以',
'因此', '于是', '那么', '因为', '由于', '既然', '为了', '以便',
// Adverbs
'不', '没', '没有', '很', '太', '最', '更', '也', '都', '就',
'才', '又', '再', '还', '却', '只', '只是', '已', '已经', '曾',
'曾经', '正', '正在', '将', '将要', '会', '能', '可以', '可能', '应该',
'必须', '一定', '大概', '也许', '或许', '其实', '确实', '真的', '当然', '一直',
'总是', '经常', '常常', '往往', '偶尔', '几乎', '差不多', '简直', '反正', '终于',
// Measure words
'个', '只', '条', '件', '位', '种', '些', '点', '下', '次',
// Numerals
'一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
'百', '千', '万', '亿', '两', '几', '多', '少', '第', '每',
// Verbs (common verbs that carry little meaning)
'是', '有', '在', '做', '去', '来', '说', '看', '想', '要',
'能', '会', '让', '给', '叫', '用', '打', '把', '被', '到',
// Other common words
'上', '下', '前', '后', '里', '外', '中', '内', '左', '右',
'东', '南', '西', '北', '时', '时候', '现在', '今天', '明天', '昨天',
'年', '月', '日', '号', '点', '分', '秒', '周', '星期',
// Common filler words in online chat
'好', '好的', '行', '可以', '嗯嗯', '哈', '呵', '额', '恩', '昂',
'ok', 'OK', '好吧', '知道', '知道了', '谢谢', '感谢', '抱歉', '不好意思',
// Modal and degree words (tagged as nouns/verbs, but carrying no real meaning in chat)
'感觉', '有点', '可能', '应该', '好像', '觉得', '认为', '看看', '看到',
'说', '问', '找', '弄', '搞', '搞定', '整', '干', '做', '来', '去',
'有', '没有', '没', '是不是', '有没有', '能不能', '会不会', '要不要',
'怎样', '如何', '为何', '为什么', '怎么', '怎么样', '怎么办',
'东西', '事情', '事', '问题', '时候', '地方', '情况', '样子', '意思',
'一下', '一点', '一些', '一样', '一起', '一直', '一般', '一定', '差不多',
])
/**
 * English stop words, all stored in lowercase.
 * A few entries appear under more than one category; duplicates are
 * harmless because the values are stored in a Set.
 */
export const ENGLISH_STOPWORDS = new Set([
// Articles
'a', 'an', 'the',
// Pronouns
'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
'you', 'your', 'yours', 'yourself', 'yourselves',
'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
// Prepositions
'in', 'on', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
'from', 'up', 'down', 'out', 'off', 'over', 'under', 'again', 'further',
// Conjunctions
'and', 'but', 'or', 'nor', 'so', 'yet', 'both', 'either', 'neither',
'not', 'only', 'own', 'same', 'than', 'too', 'very', 'just',
// Be verbs
'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
// Have verbs
'have', 'has', 'had', 'having',
// Do verbs
'do', 'does', 'did', 'doing',
// Modal verbs
'will', 'would', 'shall', 'should', 'can', 'could', 'may', 'might', 'must',
// Other common words
'if', 'then', 'else', 'when', 'where', 'why', 'how', 'all', 'each',
'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
'any', 'now', 'here', 'there', 'of', 'as',
// Contractions (without apostrophe)
'dont', 'doesnt', 'didnt', 'wont', 'wouldnt', 'cant', 'couldnt',
'shouldnt', 'isnt', 'arent', 'wasnt', 'werent', 'havent', 'hasnt', 'hadnt',
// Chat common words
'ok', 'okay', 'yes', 'no', 'yeah', 'yep', 'nope', 'sure', 'thanks',
'thank', 'please', 'sorry', 'hi', 'hello', 'hey', 'bye', 'goodbye',
'well', 'like', 'know', 'think', 'want', 'need', 'get', 'got', 'go',
'going', 'come', 'coming', 'make', 'made', 'take', 'took', 'see', 'saw',
'look', 'looking', 'say', 'said', 'tell', 'told', 'ask', 'asked',
'let', 'put', 'keep', 'give', 'gave', 'find', 'found', 'try', 'tried',
// Time words
'today', 'tomorrow', 'yesterday', 'now', 'then', 'always', 'never',
'sometimes', 'often', 'usually', 'still', 'already', 'soon', 'later',
])
/**
 * Selects the stop-word set for a locale.
 *
 * @param locale Locale identifier.
 * @returns The Chinese set for 'zh-CN'; the English set for any other value.
 */
export function getStopwords(locale: string): Set<string> {
  return locale === 'zh-CN' ? CHINESE_STOPWORDS : ENGLISH_STOPWORDS
}
/**
 * Checks whether a word is a stop word for the given locale.
 *
 * The English list is stored in lowercase and is used for every locale
 * other than 'zh-CN', so the word is lowercased whenever that list applies,
 * not only for the exact locale 'en-US'. Chinese lookups use the word
 * unchanged.
 *
 * @param word Word to test.
 * @param locale Locale identifier.
 * @returns true when the word is in the locale's stop-word set.
 */
export function isStopword(word: string, locale: string): boolean {
  const stopwords = getStopwords(locale)
  // Mirror getStopwords: anything that is not 'zh-CN' resolves to the English list.
  const normalizedWord = locale === 'zh-CN' ? word : word.toLowerCase()
  return stopwords.has(normalizedWord)
}
+96
View File
@@ -0,0 +1,96 @@
/**
* NLP 模块类型定义
*/
/** Supported languages. */
export type SupportedLocale = 'zh-CN' | 'en-US'
/** Segmentation result. */
export interface SegmentResult {
/** Words produced by segmentation. */
words: string[]
/** Original text. */
original: string
}
/** A single word-frequency entry. */
export interface WordFrequencyItem {
/** The word. */
word: string
/** Number of occurrences. */
count: number
/** Share of the total, as a percentage. */
percentage: number
}
/** Statistics entry for one part-of-speech tag. */
export interface PosTagStat {
/** Part-of-speech tag. */
tag: string
/** Number of words with this tag. */
count: number
}
/** Word-frequency statistics result. */
export interface WordFrequencyResult {
/** Word-frequency list (descending by occurrence count). */
words: WordFrequencyItem[]
/** Total word count. */
totalWords: number
/** Total message count. */
totalMessages: number
/** Number of unique words. */
uniqueWords: number
/** Part-of-speech statistics (number of words per tag). */
posTagStats?: PosTagStat[]
}
/** Part-of-speech filter mode. */
export type PosFilterMode = 'all' | 'meaningful' | 'custom'
/** Parameters for a word-frequency query. */
export interface WordFrequencyParams {
/** Session ID. */
sessionId: string
/** User language setting. */
locale: SupportedLocale
/** Time filter. */
timeFilter?: {
startTs?: number
endTs?: number
}
/** Member ID (restricts the query to one member). */
memberId?: number
/** Return the top N most frequent words; defaults to 100. */
topN?: number
/** Minimum word length; defaults to 2 for Chinese and 3 for English. */
minWordLength?: number
/** Minimum number of occurrences; defaults to 2. */
minCount?: number
/** Part-of-speech filter mode: all = everything, meaningful = keep only meaningful words, custom = user-defined. */
posFilterMode?: PosFilterMode
/** Custom part-of-speech tag list (used when posFilterMode = 'custom'). */
customPosTags?: string[]
/** Whether stop-word filtering is enabled; defaults to true. */
enableStopwords?: boolean
}
/** Information about a part-of-speech tag. */
export interface PosTagInfo {
/** Part-of-speech tag. */
tag: string
/** Tag name (in Chinese). */
name: string
/** Tag description. */
description: string
/** Whether this part of speech is considered meaningful. */
meaningful: boolean
}
/** Segmenter configuration. */
export interface SegmenterConfig {
/** Language. */
locale: SupportedLocale
/** Custom dictionary path (optional; reserved for future extension). */
customDictPath?: string
}
+9
View File
@@ -62,6 +62,10 @@ import {
// 自定义筛选
filterMessagesWithContext,
getMultipleSessionsMessages,
// NLP 查询
getWordFrequency,
segmentText,
getPosTags,
} from './query'
import { streamImport, streamParseFileInfo, analyzeIncrementalImport, incrementalImport } from './import'
@@ -148,6 +152,11 @@ const syncHandlers: Record<string, (payload: any) => any> = {
filterMessagesWithContext: (p) =>
filterMessagesWithContext(p.sessionId, p.keywords, p.timeFilter, p.senderIds, p.contextSize),
getMultipleSessionsMessages: (p) => getMultipleSessionsMessages(p.sessionId, p.chatSessionIds),
// NLP 查询
getWordFrequency: (p) => getWordFrequency(p),
segmentText: (p) => segmentText(p.text, p.locale, p.minLength),
getPosTags: () => getPosTags(),
}
// 异步消息处理器(流式操作)
+3
View File
@@ -80,3 +80,6 @@ export type {
FilterResult,
FilterMessage,
} from './session'
// NLP 查询
export { getWordFrequency, segmentText, getPosTags } from './nlp'
+137
View File
@@ -0,0 +1,137 @@
/**
* NLP 查询模块
* 提供词频统计等 NLP 相关查询功能
*/
import { openDatabase, buildTimeFilter, type TimeFilter } from '../core'
import { segment, batchSegmentWithFrequency, getPosTagDefinitions, collectPosTagStats } from '../../nlp'
import type { SupportedLocale, WordFrequencyResult, WordFrequencyParams, PosTagInfo, PosTagStat } from '../../nlp'
/**
 * Computes word-frequency statistics for a session (used by the word cloud).
 *
 * Loads message contents matching the time/member filter, excluding rows
 * whose sender account name is '系统消息', rows whose type is not 0, and rows
 * with null or blank content. The texts are then segmented and counted.
 * For 'zh-CN' the per-tag word counts are collected as well.
 *
 * `totalWords` is the sum of the counts of the returned words, and each
 * `percentage` is relative to that sum, rounded to two decimals.
 *
 * @param params Query parameters; `topN` defaults to 100, `minCount` to 2,
 *   `posFilterMode` to 'meaningful' and `enableStopwords` to true.
 * @returns The statistics, or an all-empty result when the database cannot
 *   be opened or no message matches.
 */
export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResult {
  const {
    sessionId,
    locale,
    timeFilter,
    memberId,
    topN = 100,
    minWordLength,
    minCount = 2,
    posFilterMode = 'meaningful',
    customPosTags,
    enableStopwords = true,
  } = params

  const emptyResult = (): WordFrequencyResult => ({
    words: [],
    totalWords: 0,
    totalMessages: 0,
    uniqueWords: 0,
  })

  const db = openDatabase(sessionId)
  if (!db) {
    return emptyResult()
  }

  // Time and member filter.
  const filter: TimeFilter = { ...timeFilter, memberId }
  const { clause, params: filterParams } = buildTimeFilter(filter, 'msg')

  // Extend (or create) the WHERE clause with the content conditions.
  const contentCondition =
    "COALESCE(m.account_name, '') != '系统消息' AND msg.type = 0 AND msg.content IS NOT NULL AND TRIM(msg.content) != ''"
  const whereClause = clause.includes('WHERE')
    ? `${clause} AND ${contentCondition}`
    : ` WHERE ${contentCondition}`

  const sql = [
    '',
    'SELECT msg.content',
    'FROM message msg',
    'JOIN member m ON msg.sender_id = m.id',
    whereClause,
    '',
  ].join('\n')

  const rows = db.prepare(sql).all(...filterParams) as Array<{ content: string }>
  if (rows.length === 0) {
    return emptyResult()
  }

  const texts = rows.map((row) => row.content)

  // Part-of-speech statistics are only collected for Chinese.
  let posTagStats: PosTagStat[] | undefined
  if (locale === 'zh-CN') {
    const statsByTag = collectPosTagStats(texts, minWordLength ?? 2, enableStopwords)
    posTagStats = Array.from(statsByTag, ([tag, count]) => ({ tag, count }))
  }

  const wordFrequency = batchSegmentWithFrequency(texts, locale, {
    minLength: minWordLength,
    minCount,
    topN,
    posFilterMode,
    customPosTags,
    enableStopwords,
  })

  // Sum of the returned counts; denominator for the percentages.
  const totalWords = [...wordFrequency.values()].reduce((sum, count) => sum + count, 0)

  const words = Array.from(wordFrequency, ([word, count]) => ({
    word,
    count,
    percentage: totalWords > 0 ? Math.round((count / totalWords) * 10000) / 100 : 0,
  }))

  return {
    words,
    totalWords,
    totalMessages: rows.length,
    uniqueWords: wordFrequency.size,
    posTagStats,
  }
}
/**
 * Segments a single text (for debugging or other ad-hoc use).
 * Delegates to `segment`, passing only `minLength`; the remaining options
 * take `segment`'s defaults.
 *
 * @param text Text to segment.
 * @param locale Language of the text.
 * @param minLength Optional minimum word length.
 * @returns The filtered list of words.
 */
export function segmentText(
text: string,
locale: SupportedLocale,
minLength?: number
): string[] {
return segment(text, locale, { minLength })
}
/**
 * Returns the part-of-speech tag definitions.
 * Delegates to `getPosTagDefinitions`.
 */
export function getPosTags(): PosTagInfo[] {
return getPosTagDefinitions()
}
+9
View File
@@ -199,6 +199,15 @@ export function closeWorker(): void {
}
}
// ==================== Generic query API ====================
/**
 * Generic query function (for newly added query types).
 * Forwards the message type and payload to the worker via `sendToWorker`
 * and resolves with whatever that call resolves with.
 *
 * @param type Name of the query handler to invoke.
 * @param payload Payload passed to the handler.
 */
export async function query<T = any>(type: string, payload: any): Promise<T> {
return sendToWorker<T>(type, payload)
}
// ==================== 导出的异步 API ====================
export async function getAvailableYears(sessionId: string): Promise<number[]> {
+63
View File
@@ -632,6 +632,61 @@ interface NetworkApi {
testProxyConnection: (proxyUrl: string) => Promise<{ success: boolean; error?: string }>
}
// NLP API types - natural language processing features
/** Languages supported by the NLP API. */
type SupportedLocale = 'zh-CN' | 'en-US'
/** Part-of-speech filter mode. */
type PosFilterMode = 'all' | 'meaningful' | 'custom'
/** A single word-frequency entry: the word, its count, and its percentage share. */
interface WordFrequencyItem {
word: string
count: number
percentage: number
}
/** Word count for one part-of-speech tag. */
interface PosTagStat {
tag: string
count: number
}
/** Result of a word-frequency query. */
interface WordFrequencyResult {
words: WordFrequencyItem[]
totalWords: number
totalMessages: number
uniqueWords: number
/** Optional per-tag word counts. */
posTagStats?: PosTagStat[]
}
/** Parameters for a word-frequency query. */
interface WordFrequencyParams {
sessionId: string
locale: SupportedLocale
timeFilter?: { startTs?: number; endTs?: number }
memberId?: number
topN?: number
minWordLength?: number
minCount?: number
/** Part-of-speech filter mode: all = everything, meaningful = keep only meaningful words, custom = user-defined. */
posFilterMode?: PosFilterMode
/** Custom part-of-speech tag list (used when posFilterMode = 'custom'). */
customPosTags?: string[]
/** Whether stop-word filtering is enabled; defaults to true. */
enableStopwords?: boolean
}
/** Information about a part-of-speech tag. */
interface PosTagInfo {
tag: string
name: string
description: string
meaningful: boolean
}
/** NLP functions declared on `window.nlpApi`. */
interface NlpApi {
/** Fetches word-frequency statistics. */
getWordFrequency: (params: WordFrequencyParams) => Promise<WordFrequencyResult>
/** Segments a single text. */
segmentText: (text: string, locale: SupportedLocale, minLength?: number) => Promise<string[]>
/** Fetches the part-of-speech tag definitions. */
getPosTags: () => Promise<PosTagInfo[]>
}
// Session Index API 类型 - 会话索引功能
interface SessionStats {
sessionCount: number
@@ -716,6 +771,7 @@ declare global {
cacheApi: CacheApi
networkApi: NetworkApi
sessionApi: SessionApi
nlpApi: NlpApi
}
}
@@ -731,6 +787,7 @@ export {
AgentApi,
CacheApi,
NetworkApi,
NlpApi,
ProxyConfig,
SearchMessageResult,
AIConversation,
@@ -754,4 +811,10 @@ export {
EmbeddingConfig,
VectorStoreConfig,
RerankConfig,
WordFrequencyItem,
WordFrequencyResult,
WordFrequencyParams,
SupportedLocale,
PosFilterMode,
PosTagInfo,
}
+65
View File
@@ -1180,6 +1180,68 @@ const agentApi = {
},
}
// NLP API - natural language processing features
/** A single word-frequency entry: the word, its count, and its percentage share. */
interface WordFrequencyItem {
word: string
count: number
percentage: number
}
/**
 * Result of a word-frequency query as received over IPC.
 * Kept in step with the worker-side result shape, which also carries the
 * optional per-tag statistics.
 */
interface WordFrequencyResult {
  words: WordFrequencyItem[]
  totalWords: number
  totalMessages: number
  uniqueWords: number
  /** Optional per-tag word counts; the worker only fills this for 'zh-CN'. */
  posTagStats?: Array<{ tag: string; count: number }>
}
/** Part-of-speech filter mode. */
type PosFilterMode = 'all' | 'meaningful' | 'custom'
/** Parameters for a word-frequency query. */
interface WordFrequencyParams {
sessionId: string
locale: 'zh-CN' | 'en-US'
timeFilter?: { startTs?: number; endTs?: number }
memberId?: number
topN?: number
minWordLength?: number
minCount?: number
/** Part-of-speech filter mode: all = everything, meaningful = keep only meaningful words, custom = user-defined. */
posFilterMode?: PosFilterMode
/** Custom part-of-speech tag list (used when posFilterMode = 'custom'). */
customPosTags?: string[]
/** Whether stop-word filtering is enabled; defaults to true. */
enableStopwords?: boolean
}
/** Information about a part-of-speech tag. */
interface PosTagInfo {
tag: string
name: string
description: string
meaningful: boolean
}
/**
 * Renderer-side NLP API. Every function invokes the IPC channel of the
 * matching name and returns the resulting promise.
 */
const nlpApi = {
  /** Fetches word-frequency statistics (used by the word cloud). */
  getWordFrequency: (params: WordFrequencyParams): Promise<WordFrequencyResult> =>
    ipcRenderer.invoke('nlp:getWordFrequency', params),

  /** Segments a single text. */
  segmentText: (text: string, locale: 'zh-CN' | 'en-US', minLength?: number): Promise<string[]> =>
    ipcRenderer.invoke('nlp:segmentText', text, locale, minLength),

  /** Fetches the part-of-speech tag definitions. */
  getPosTags: (): Promise<PosTagInfo[]> => ipcRenderer.invoke('nlp:getPosTags'),
}
// Network API - 网络设置
type ProxyMode = 'off' | 'system' | 'manual'
@@ -1567,6 +1629,7 @@ if (process.contextIsolated) {
contextBridge.exposeInMainWorld('cacheApi', cacheApi)
contextBridge.exposeInMainWorld('networkApi', networkApi)
contextBridge.exposeInMainWorld('sessionApi', sessionApi)
contextBridge.exposeInMainWorld('nlpApi', nlpApi)
} catch (error) {
console.error(error)
}
@@ -1593,4 +1656,6 @@ if (process.contextIsolated) {
window.networkApi = networkApi
// @ts-ignore (define in dts)
window.sessionApi = sessionApi
// @ts-ignore (define in dts)
window.nlpApi = nlpApi
}