mirror of
https://github.com/hellodigua/ChatLab.git
synced 2026-05-16 11:29:24 +08:00
feat: 引入分词能力,并新增词云子Tab
This commit is contained in:
@@ -0,0 +1,74 @@
|
||||
/**
|
||||
* NLP 功能 IPC 处理器
|
||||
* 提供词频统计、分词等 NLP 功能
|
||||
*/
|
||||
|
||||
import { ipcMain } from 'electron'
|
||||
import * as worker from '../worker/workerManager'
|
||||
import type { IpcContext } from './types'
|
||||
import type { WordFrequencyParams, WordFrequencyResult, SupportedLocale, PosTagInfo } from '../nlp'
|
||||
|
||||
/**
 * Register the IPC handlers that expose NLP features to the renderer.
 *
 * Each handler forwards its request to the worker through `worker.query`.
 * When the query rejects, the error is logged and an empty result is
 * returned instead of propagating the failure to the caller.
 *
 * @param _ctx IPC context; not used by these handlers.
 */
export function registerNlpHandlers(_ctx: IpcContext): void {
  // Word-frequency statistics (data source for the word cloud).
  ipcMain.handle('nlp:getWordFrequency', async (_event, params: WordFrequencyParams): Promise<WordFrequencyResult> => {
    try {
      return (await worker.query('getWordFrequency', params)) as WordFrequencyResult
    } catch (error) {
      console.error('[NLP] 获取词频统计失败:', error)
      const emptyResult: WordFrequencyResult = {
        words: [],
        totalWords: 0,
        totalMessages: 0,
        uniqueWords: 0,
      }
      return emptyResult
    }
  })

  // Segment a single piece of text (debugging and ad-hoc use).
  ipcMain.handle(
    'nlp:segmentText',
    async (_event, text: string, locale: SupportedLocale, minLength?: number): Promise<string[]> => {
      try {
        return (await worker.query('segmentText', { text, locale, minLength })) as string[]
      } catch (error) {
        console.error('[NLP] 分词失败:', error)
        return []
      }
    }
  )

  // Part-of-speech tag definitions.
  ipcMain.handle('nlp:getPosTags', async (): Promise<PosTagInfo[]> => {
    try {
      return (await worker.query('getPosTags', {})) as PosTagInfo[]
    } catch (error) {
      console.error('[NLP] 获取词性标签失败:', error)
      return []
    }
  })
}
|
||||
@@ -13,6 +13,7 @@ import { registerAIHandlers } from './ipc/ai'
|
||||
import { registerMessagesHandlers } from './ipc/messages'
|
||||
import { registerCacheHandlers } from './ipc/cache'
|
||||
import { registerNetworkHandlers } from './ipc/network'
|
||||
import { registerNlpHandlers } from './ipc/nlp'
|
||||
import { registerAnalyticsHandlers } from './analytics'
|
||||
// 导入 Worker 模块(用于异步分析查询和流式导入)
|
||||
import * as worker from './worker/workerManager'
|
||||
@@ -45,6 +46,7 @@ const mainIpcMain = (win: BrowserWindow) => {
|
||||
registerMessagesHandlers(context)
|
||||
registerCacheHandlers(context)
|
||||
registerNetworkHandlers(context)
|
||||
registerNlpHandlers(context)
|
||||
registerAnalyticsHandlers()
|
||||
|
||||
console.log('[IpcMain] All IPC handlers registered successfully')
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
/**
|
||||
* NLP 模块统一导出
|
||||
*/
|
||||
|
||||
export * from './types'
|
||||
export * from './stopwords'
|
||||
export * from './segmenter'
|
||||
@@ -0,0 +1,338 @@
|
||||
/**
|
||||
* 分词器模块
|
||||
* 中文使用 @node-rs/jieba,其他语言使用 Intl.Segmenter
|
||||
*/
|
||||
|
||||
import type { SupportedLocale, PosFilterMode, PosTagInfo } from './types'
|
||||
import { isStopword } from './stopwords'
|
||||
|
||||
// Jieba 实例类型
|
||||
interface JiebaInstance {
|
||||
cut: (text: string, hmm?: boolean) => string[]
|
||||
tag: (text: string) => Array<{ tag: string; word: string }>
|
||||
}
|
||||
|
||||
// Jieba 实例(延迟初始化)
|
||||
let jiebaInstance: JiebaInstance | null = null
|
||||
|
||||
/**
 * Return the shared jieba instance, creating it on first use.
 *
 * `@node-rs/jieba` and its dictionary are loaded with `require` only when no
 * instance exists yet; the created instance is cached in the module-level
 * `jiebaInstance` variable.
 *
 * @returns The cached jieba instance.
 * @throws Error when loading the module or building the instance fails; the
 *   underlying error is logged before the new error is thrown.
 */
function getJieba(): JiebaInstance {
  if (jiebaInstance) return jiebaInstance

  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const jiebaModule = require('@node-rs/jieba')
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const dictModule = require('@node-rs/jieba/dict')
    jiebaInstance = jiebaModule.Jieba.withDict(dictModule.dict)
    console.log('[NLP] jieba 模块加载成功')
  } catch (error) {
    console.error('[NLP] jieba 模块加载失败:', error)
    throw new Error('jieba 模块加载失败')
  }

  return jiebaInstance
}
|
||||
|
||||
/**
|
||||
* 词性标签定义
|
||||
*/
|
||||
export const POS_TAG_DEFINITIONS: PosTagInfo[] = [
|
||||
// 名词类
|
||||
{ tag: 'n', name: '名词', description: '普通名词', meaningful: true },
|
||||
{ tag: 'nr', name: '人名', description: '人名', meaningful: true },
|
||||
{ tag: 'ns', name: '地名', description: '地名', meaningful: true },
|
||||
{ tag: 'nt', name: '机构名', description: '机构团体名', meaningful: true },
|
||||
{ tag: 'nz', name: '其他专名', description: '其他专有名词', meaningful: true },
|
||||
{ tag: 'nw', name: '作品名', description: '作品名', meaningful: true },
|
||||
// 动词类
|
||||
{ tag: 'v', name: '动词', description: '普通动词', meaningful: true },
|
||||
{ tag: 'vn', name: '动名词', description: '动名词', meaningful: true },
|
||||
{ tag: 'vd', name: '副动词', description: '副动词', meaningful: true },
|
||||
{ tag: 'vg', name: '动语素', description: '动词性语素', meaningful: true },
|
||||
// 形容词类
|
||||
{ tag: 'a', name: '形容词', description: '普通形容词', meaningful: true },
|
||||
{ tag: 'an', name: '名形词', description: '名形词', meaningful: true },
|
||||
{ tag: 'ad', name: '副形词', description: '副形词', meaningful: true },
|
||||
{ tag: 'ag', name: '形语素', description: '形容词性语素', meaningful: true },
|
||||
// 其他有意义
|
||||
{ tag: 'i', name: '成语', description: '成语', meaningful: true },
|
||||
{ tag: 'l', name: '习用语', description: '习用语', meaningful: true },
|
||||
{ tag: 'j', name: '简称', description: '简称略语', meaningful: true },
|
||||
// 副词、介词等(通常不太有意义)
|
||||
{ tag: 'd', name: '副词', description: '副词', meaningful: false },
|
||||
{ tag: 'p', name: '介词', description: '介词', meaningful: false },
|
||||
{ tag: 'c', name: '连词', description: '连词', meaningful: false },
|
||||
{ tag: 'u', name: '助词', description: '助词', meaningful: false },
|
||||
{ tag: 'r', name: '代词', description: '代词', meaningful: false },
|
||||
{ tag: 'm', name: '数词', description: '数词', meaningful: false },
|
||||
{ tag: 'q', name: '量词', description: '量词', meaningful: false },
|
||||
{ tag: 'f', name: '方位词', description: '方位词', meaningful: false },
|
||||
{ tag: 't', name: '时间词', description: '时间词', meaningful: false },
|
||||
{ tag: 'e', name: '叹词', description: '叹词', meaningful: false },
|
||||
{ tag: 'y', name: '语气词', description: '语气词', meaningful: false },
|
||||
{ tag: 'o', name: '拟声词', description: '拟声词', meaningful: false },
|
||||
{ tag: 'x', name: '非语素字', description: '非语素字', meaningful: false },
|
||||
{ tag: 'w', name: '标点符号', description: '标点符号', meaningful: false },
|
||||
]
|
||||
|
||||
/**
|
||||
* 有意义的词性标签集合
|
||||
*/
|
||||
export const MEANINGFUL_POS_TAGS = new Set(
|
||||
POS_TAG_DEFINITIONS.filter((t) => t.meaningful).map((t) => t.tag)
|
||||
)
|
||||
|
||||
/**
 * Return the full list of part-of-speech tag definitions.
 *
 * @returns The module-level `POS_TAG_DEFINITIONS` array itself (not a copy).
 */
export function getPosTagDefinitions(): PosTagInfo[] {
  const definitions: PosTagInfo[] = POS_TAG_DEFINITIONS
  return definitions
}
|
||||
|
||||
// 用于过滤的正则表达式
|
||||
const EMOJI_REGEX = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]/gu
|
||||
const PUNCTUATION_REGEX = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~,。!?、;:""''()【】《》…—~·\s]/g
|
||||
const URL_REGEX = /https?:\/\/[^\s]+/g
|
||||
const MENTION_REGEX = /@[^\s@]+/g
|
||||
const PURE_NUMBER_REGEX = /^\d+$/
|
||||
|
||||
/**
 * Strip content that should not reach the segmenter.
 *
 * URLs, @mentions, emoji and punctuation are each replaced with a space, in
 * that order. Whitespace runs are then collapsed to a single space and the
 * result is trimmed.
 *
 * @param text Raw message text.
 * @returns The cleaned text; may be an empty string.
 */
function cleanText(text: string): string {
  let stripped = text
  // URLs must go first so their punctuation is removed as part of the URL.
  for (const pattern of [URL_REGEX, MENTION_REGEX, EMOJI_REGEX, PUNCTUATION_REGEX]) {
    stripped = stripped.replace(pattern, ' ')
  }
  return stripped.replace(/\s+/g, ' ').trim()
}
|
||||
|
||||
/**
 * Decide whether a segmented token should be kept.
 *
 * A token is rejected when it is empty or whitespace-only, consists solely of
 * digits, is shorter than `minLength`, or (when `enableStopwords` is on) is a
 * stop word for `locale`.
 *
 * @param word Token to check.
 * @param locale Locale used for the stop-word lookup.
 * @param minLength Minimum accepted `word.length`.
 * @param enableStopwords Whether to apply stop-word filtering (default true).
 * @returns `true` when the token passes every check.
 */
function isValidWord(word: string, locale: SupportedLocale, minLength: number, enableStopwords: boolean = true): boolean {
  if (!word || word.trim().length === 0) return false

  const isNumeric = PURE_NUMBER_REGEX.test(word)
  const isTooShort = word.length < minLength
  if (isNumeric || isTooShort) return false

  return !(enableStopwords && isStopword(word, locale))
}
|
||||
|
||||
/**
|
||||
* 中文分词选项
|
||||
*/
|
||||
interface ChineseSegmentOptions {
|
||||
/** 词性过滤模式 */
|
||||
posFilterMode?: PosFilterMode
|
||||
/** 自定义词性过滤列表 */
|
||||
customPosTags?: string[]
|
||||
}
|
||||
|
||||
/**
 * Count how many valid tokens fall under each part-of-speech tag.
 *
 * Every text is cleaned, tagged with jieba, and each tagged token that passes
 * `isValidWord` (Chinese locale) increments the counter of its tag. Counts are
 * per occurrence, not per distinct word. Only meaningful for Chinese text,
 * since the tagging always goes through jieba.
 *
 * If jieba cannot be loaded or tagging throws, the error is logged and the
 * counts gathered so far (possibly none) are returned.
 *
 * @param texts Message texts to analyse.
 * @param minWordLength Minimum token length to count (default 2).
 * @param enableStopwords Whether stop words are excluded (default true).
 * @returns Map from part-of-speech tag to number of counted tokens.
 */
export function collectPosTagStats(
  texts: string[],
  minWordLength: number = 2,
  enableStopwords: boolean = true
): Map<string, number> {
  const posStats = new Map<string, number>()

  try {
    const jieba = getJieba()

    for (const text of texts) {
      const cleaned = cleanText(text)
      if (!cleaned) continue

      for (const { word, tag } of jieba.tag(cleaned)) {
        // Argument order is (word, locale, minLength, enableStopwords). The
        // previous call swapped locale and minLength, which disabled the
        // length filter and applied the wrong stop-word list.
        if (!isValidWord(word, 'zh-CN', minWordLength, enableStopwords)) continue

        posStats.set(tag, (posStats.get(tag) ?? 0) + 1)
      }
    }
  } catch (error) {
    console.error('[NLP] 收集词性统计失败:', error)
  }

  return posStats
}
|
||||
|
||||
/**
 * Segment Chinese text with jieba, optionally filtering by part of speech.
 *
 * - `posFilterMode === 'all'`: plain `cut` without any tag filtering.
 * - `posFilterMode === 'custom'` with `customPosTags` given: keep only tokens
 *   whose tag is in that list.
 * - otherwise: keep only tokens whose tag is in `MEANINGFUL_POS_TAGS`.
 *
 * If anything in the main path throws, the error is logged and the function
 * falls back to a plain `cut`; if that fails too, the cleaned text is split
 * into individual characters.
 *
 * @param text Raw text.
 * @param options Part-of-speech filter settings.
 * @returns Segmented tokens (not yet length/stop-word filtered).
 */
function segmentChinese(text: string, options: ChineseSegmentOptions = {}): string[] {
  const { posFilterMode = 'meaningful', customPosTags } = options

  const cleaned = cleanText(text)
  if (!cleaned) return []

  try {
    const jieba = getJieba()

    if (posFilterMode === 'all') {
      return jieba.cut(cleaned, false)
    }

    const tagged = jieba.tag(cleaned)
    const useCustomTags = posFilterMode === 'custom' && customPosTags
    const allowedTags: Set<string> = useCustomTags ? new Set(customPosTags) : MEANINGFUL_POS_TAGS

    const kept: string[] = []
    for (const { tag, word } of tagged) {
      if (allowedTags.has(tag)) kept.push(word)
    }
    return kept
  } catch (error) {
    console.error('[NLP] 中文分词失败:', error)

    // Degrade: plain cut first, then character-by-character as a last resort.
    try {
      return getJieba().cut(cleaned, false)
    } catch {
      return cleaned.split('')
    }
  }
}
|
||||
|
||||
/**
 * Segment English text into lower-cased words using `Intl.Segmenter`.
 *
 * Only segments flagged `isWordLike` are kept. If `Intl.Segmenter` throws
 * (for example when it is unavailable), the cleaned text is lower-cased and
 * split on whitespace instead.
 *
 * @param text Raw text.
 * @returns Lower-cased word tokens (not yet length/stop-word filtered).
 */
function segmentEnglish(text: string): string[] {
  const cleaned = cleanText(text)
  if (!cleaned) return []

  try {
    const wordSegmenter = new Intl.Segmenter('en', { granularity: 'word' })
    const tokens: string[] = []
    for (const part of wordSegmenter.segment(cleaned)) {
      if (part.isWordLike) tokens.push(part.segment.toLowerCase())
    }
    return tokens
  } catch {
    // Degrade: naive whitespace split.
    const lowered = cleaned.toLowerCase()
    return lowered.split(/\s+/).filter((word) => word.length > 0)
  }
}
|
||||
|
||||
/**
|
||||
* 分词选项
|
||||
*/
|
||||
export interface SegmentOptions {
|
||||
/** 最小词长(可选,默认中文2,英文3) */
|
||||
minLength?: number
|
||||
/** 词性过滤模式(仅中文有效) */
|
||||
posFilterMode?: PosFilterMode
|
||||
/** 自定义词性过滤列表 */
|
||||
customPosTags?: string[]
|
||||
/** 是否启用停用词过滤 */
|
||||
enableStopwords?: boolean
|
||||
}
|
||||
|
||||
/**
 * Segment text for the given locale and drop invalid tokens.
 *
 * 'zh-CN' goes through the jieba-based Chinese segmenter (honouring the
 * part-of-speech options); every other locale goes through the English
 * segmenter. The resulting tokens are then filtered with `isValidWord`.
 *
 * @param text Text to segment.
 * @param locale Language of the text.
 * @param options Segmentation options; `minLength` defaults to 2 for 'zh-CN'
 *   and 3 otherwise, `posFilterMode` to 'meaningful', `enableStopwords` to true.
 * @returns The filtered tokens.
 */
export function segment(text: string, locale: SupportedLocale, options: SegmentOptions = {}): string[] {
  const { minLength, posFilterMode = 'meaningful', customPosTags, enableStopwords = true } = options

  const isChinese = locale === 'zh-CN'
  const effectiveMinLength = minLength ?? (isChinese ? 2 : 3)

  const candidates = isChinese ? segmentChinese(text, { posFilterMode, customPosTags }) : segmentEnglish(text)

  return candidates.filter((candidate) => isValidWord(candidate, locale, effectiveMinLength, enableStopwords))
}
|
||||
|
||||
/**
|
||||
* 批量分词并统计词频选项
|
||||
*/
|
||||
export interface BatchSegmentOptions extends SegmentOptions {
|
||||
minCount?: number
|
||||
topN?: number
|
||||
}
|
||||
|
||||
/**
 * Segment many texts and return the most frequent words.
 *
 * Each text is segmented with `segment`, occurrences are counted across all
 * texts, words seen fewer than `minCount` times are dropped, and the rest are
 * sorted by count (descending) and truncated to `topN`.
 *
 * @param texts Texts to segment.
 * @param locale Language of the texts.
 * @param options Segmentation options plus `minCount` (default 2) and `topN`
 *   (default 100).
 * @returns Map of word to count, in descending count order.
 */
export function batchSegmentWithFrequency(
  texts: string[],
  locale: SupportedLocale,
  options: BatchSegmentOptions = {}
): Map<string, number> {
  const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords } = options
  const segmentOptions: SegmentOptions = { minLength, posFilterMode, customPosTags, enableStopwords }

  const counts = new Map<string, number>()
  for (const text of texts) {
    for (const token of segment(text, locale, segmentOptions)) {
      counts.set(token, (counts.get(token) ?? 0) + 1)
    }
  }

  const ranked = [...counts.entries()]
    .filter(([, count]) => count >= minCount)
    .sort(([, left], [, right]) => right - left)
    .slice(0, topN)

  return new Map(ranked)
}
|
||||
@@ -0,0 +1,119 @@
|
||||
/**
|
||||
* 停用词表
|
||||
* 用于过滤无意义的高频词
|
||||
*/
|
||||
|
||||
/** 中文停用词 */
|
||||
export const CHINESE_STOPWORDS = new Set([
|
||||
// 代词
|
||||
'我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '它们',
|
||||
'自己', '别人', '大家', '谁', '什么', '哪', '哪里', '哪儿', '这', '那',
|
||||
'这个', '那个', '这些', '那些', '这里', '那里', '这儿', '那儿', '这样', '那样',
|
||||
// 助词
|
||||
'的', '地', '得', '了', '着', '过', '吗', '呢', '吧', '啊',
|
||||
'呀', '哇', '哦', '嗯', '噢', '喔', '呃', '唉', '哎', '嘛',
|
||||
// 介词
|
||||
'在', '从', '到', '向', '往', '把', '被', '给', '跟', '和',
|
||||
'与', '对', '比', '为', '因', '由', '以', '按', '用', '让',
|
||||
// 连词
|
||||
'和', '与', '或', '或者', '而', '并', '并且', '但', '但是', '可是',
|
||||
'然而', '不过', '只是', '如果', '要是', '假如', '虽然', '尽管', '即使', '所以',
|
||||
'因此', '于是', '那么', '因为', '由于', '既然', '为了', '以便',
|
||||
// 副词
|
||||
'不', '没', '没有', '很', '太', '最', '更', '也', '都', '就',
|
||||
'才', '又', '再', '还', '却', '只', '只是', '已', '已经', '曾',
|
||||
'曾经', '正', '正在', '将', '将要', '会', '能', '可以', '可能', '应该',
|
||||
'必须', '一定', '大概', '也许', '或许', '其实', '确实', '真的', '当然', '一直',
|
||||
'总是', '经常', '常常', '往往', '偶尔', '几乎', '差不多', '简直', '反正', '终于',
|
||||
// 量词
|
||||
'个', '只', '条', '件', '位', '种', '些', '点', '下', '次',
|
||||
// 数词
|
||||
'一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
|
||||
'百', '千', '万', '亿', '两', '几', '多', '少', '第', '每',
|
||||
// 动词(常见无实意动词)
|
||||
'是', '有', '在', '做', '去', '来', '说', '看', '想', '要',
|
||||
'能', '会', '让', '给', '叫', '用', '打', '把', '被', '到',
|
||||
// 其他常见词
|
||||
'上', '下', '前', '后', '里', '外', '中', '内', '左', '右',
|
||||
'东', '南', '西', '北', '时', '时候', '现在', '今天', '明天', '昨天',
|
||||
'年', '月', '日', '号', '点', '分', '秒', '周', '星期',
|
||||
// 网络聊天常见无意义词
|
||||
'好', '好的', '行', '可以', '嗯嗯', '哈', '呵', '额', '恩', '昂',
|
||||
'ok', 'OK', '好吧', '知道', '知道了', '谢谢', '感谢', '抱歉', '不好意思',
|
||||
// 语气词和程度词(虽然词性是名词/动词,但在聊天中无实际意义)
|
||||
'感觉', '有点', '可能', '应该', '好像', '觉得', '认为', '看看', '看到',
|
||||
'说', '问', '找', '弄', '搞', '搞定', '整', '干', '做', '来', '去',
|
||||
'有', '没有', '没', '是不是', '有没有', '能不能', '会不会', '要不要',
|
||||
'怎样', '如何', '为何', '为什么', '怎么', '怎么样', '怎么办',
|
||||
'东西', '事情', '事', '问题', '时候', '地方', '情况', '样子', '意思',
|
||||
'一下', '一点', '一些', '一样', '一起', '一直', '一般', '一定', '差不多',
|
||||
])
|
||||
|
||||
/** 英文停用词 */
|
||||
export const ENGLISH_STOPWORDS = new Set([
|
||||
// Articles
|
||||
'a', 'an', 'the',
|
||||
// Pronouns
|
||||
'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
|
||||
'you', 'your', 'yours', 'yourself', 'yourselves',
|
||||
'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
|
||||
'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
|
||||
'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
|
||||
// Prepositions
|
||||
'in', 'on', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
|
||||
'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
|
||||
'from', 'up', 'down', 'out', 'off', 'over', 'under', 'again', 'further',
|
||||
// Conjunctions
|
||||
'and', 'but', 'or', 'nor', 'so', 'yet', 'both', 'either', 'neither',
|
||||
'not', 'only', 'own', 'same', 'than', 'too', 'very', 'just',
|
||||
// Be verbs
|
||||
'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
|
||||
// Have verbs
|
||||
'have', 'has', 'had', 'having',
|
||||
// Do verbs
|
||||
'do', 'does', 'did', 'doing',
|
||||
// Modal verbs
|
||||
'will', 'would', 'shall', 'should', 'can', 'could', 'may', 'might', 'must',
|
||||
// Other common words
|
||||
'if', 'then', 'else', 'when', 'where', 'why', 'how', 'all', 'each',
|
||||
'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
|
||||
'any', 'now', 'here', 'there', 'of', 'as',
|
||||
// Contractions (without apostrophe)
|
||||
'dont', 'doesnt', 'didnt', 'wont', 'wouldnt', 'cant', 'couldnt',
|
||||
'shouldnt', 'isnt', 'arent', 'wasnt', 'werent', 'havent', 'hasnt', 'hadnt',
|
||||
// Chat common words
|
||||
'ok', 'okay', 'yes', 'no', 'yeah', 'yep', 'nope', 'sure', 'thanks',
|
||||
'thank', 'please', 'sorry', 'hi', 'hello', 'hey', 'bye', 'goodbye',
|
||||
'well', 'like', 'know', 'think', 'want', 'need', 'get', 'got', 'go',
|
||||
'going', 'come', 'coming', 'make', 'made', 'take', 'took', 'see', 'saw',
|
||||
'look', 'looking', 'say', 'said', 'tell', 'told', 'ask', 'asked',
|
||||
'let', 'put', 'keep', 'give', 'gave', 'find', 'found', 'try', 'tried',
|
||||
// Time words
|
||||
'today', 'tomorrow', 'yesterday', 'now', 'then', 'always', 'never',
|
||||
'sometimes', 'often', 'usually', 'still', 'already', 'soon', 'later',
|
||||
])
|
||||
|
||||
/**
 * Select the stop-word set for a locale.
 *
 * @param locale Locale identifier.
 * @returns `CHINESE_STOPWORDS` for 'zh-CN'; `ENGLISH_STOPWORDS` for any other
 *   value.
 */
export function getStopwords(locale: string): Set<string> {
  return locale === 'zh-CN' ? CHINESE_STOPWORDS : ENGLISH_STOPWORDS
}
|
||||
|
||||
/**
 * Check whether a word is a stop word for the given locale.
 *
 * The English stop-word list is stored in lower case, so the word is
 * lower-cased whenever that list is in use. `getStopwords` returns the English
 * list for every locale except 'zh-CN', so the lower-casing follows the same
 * rule instead of applying only to 'en-US'. Chinese lookups compare the word
 * as-is.
 *
 * @param word Word to look up.
 * @param locale Locale identifier.
 * @returns `true` when the word is in the locale's stop-word set.
 */
export function isStopword(word: string, locale: string): boolean {
  const stopwords = getStopwords(locale)
  // Keep normalisation in step with getStopwords' fallback to English.
  const normalizedWord = locale === 'zh-CN' ? word : word.toLowerCase()
  return stopwords.has(normalizedWord)
}
|
||||
@@ -0,0 +1,96 @@
|
||||
/**
|
||||
* NLP 模块类型定义
|
||||
*/
|
||||
|
||||
/** 支持的语言 */
|
||||
export type SupportedLocale = 'zh-CN' | 'en-US'
|
||||
|
||||
/** 分词结果 */
|
||||
export interface SegmentResult {
|
||||
/** 分词后的词语列表 */
|
||||
words: string[]
|
||||
/** 原始文本 */
|
||||
original: string
|
||||
}
|
||||
|
||||
/** 词频项 */
|
||||
export interface WordFrequencyItem {
|
||||
/** 词语 */
|
||||
word: string
|
||||
/** 出现次数 */
|
||||
count: number
|
||||
/** 占比百分比 */
|
||||
percentage: number
|
||||
}
|
||||
|
||||
/** 词性统计项 */
|
||||
export interface PosTagStat {
|
||||
/** 词性标签 */
|
||||
tag: string
|
||||
/** 该词性的词语数量 */
|
||||
count: number
|
||||
}
|
||||
|
||||
/** 词频统计结果 */
|
||||
export interface WordFrequencyResult {
|
||||
/** 词频列表(按出现次数降序) */
|
||||
words: WordFrequencyItem[]
|
||||
/** 总词数 */
|
||||
totalWords: number
|
||||
/** 总消息数 */
|
||||
totalMessages: number
|
||||
/** 唯一词数 */
|
||||
uniqueWords: number
|
||||
/** 词性统计(每个词性的词语数量) */
|
||||
posTagStats?: PosTagStat[]
|
||||
}
|
||||
|
||||
/** 词性过滤模式 */
|
||||
export type PosFilterMode = 'all' | 'meaningful' | 'custom'
|
||||
|
||||
/** 词频统计参数 */
|
||||
export interface WordFrequencyParams {
|
||||
/** 会话 ID */
|
||||
sessionId: string
|
||||
/** 用户语言设置 */
|
||||
locale: SupportedLocale
|
||||
/** 时间过滤 */
|
||||
timeFilter?: {
|
||||
startTs?: number
|
||||
endTs?: number
|
||||
}
|
||||
/** 成员 ID(筛选特定成员) */
|
||||
memberId?: number
|
||||
/** 返回前 N 个高频词,默认 100 */
|
||||
topN?: number
|
||||
/** 最小词长,默认中文 2,英文 3 */
|
||||
minWordLength?: number
|
||||
/** 最小出现次数,默认 2 */
|
||||
minCount?: number
|
||||
/** 词性过滤模式:all=全部, meaningful=只保留有意义的词, custom=自定义 */
|
||||
posFilterMode?: PosFilterMode
|
||||
/** 自定义词性过滤列表(posFilterMode='custom' 时使用) */
|
||||
customPosTags?: string[]
|
||||
/** 是否启用停用词过滤,默认 true */
|
||||
enableStopwords?: boolean
|
||||
}
|
||||
|
||||
/** 词性标签信息 */
|
||||
export interface PosTagInfo {
|
||||
/** 词性标签 */
|
||||
tag: string
|
||||
/** 词性名称(中文) */
|
||||
name: string
|
||||
/** 词性描述 */
|
||||
description: string
|
||||
/** 是否为有意义的词性 */
|
||||
meaningful: boolean
|
||||
}
|
||||
|
||||
/** 分词器配置 */
|
||||
export interface SegmenterConfig {
|
||||
/** 语言 */
|
||||
locale: SupportedLocale
|
||||
/** 自定义词典路径(可选,为后期扩展预留) */
|
||||
customDictPath?: string
|
||||
}
|
||||
@@ -62,6 +62,10 @@ import {
|
||||
// 自定义筛选
|
||||
filterMessagesWithContext,
|
||||
getMultipleSessionsMessages,
|
||||
// NLP 查询
|
||||
getWordFrequency,
|
||||
segmentText,
|
||||
getPosTags,
|
||||
} from './query'
|
||||
import { streamImport, streamParseFileInfo, analyzeIncrementalImport, incrementalImport } from './import'
|
||||
|
||||
@@ -148,6 +152,11 @@ const syncHandlers: Record<string, (payload: any) => any> = {
|
||||
filterMessagesWithContext: (p) =>
|
||||
filterMessagesWithContext(p.sessionId, p.keywords, p.timeFilter, p.senderIds, p.contextSize),
|
||||
getMultipleSessionsMessages: (p) => getMultipleSessionsMessages(p.sessionId, p.chatSessionIds),
|
||||
|
||||
// NLP 查询
|
||||
getWordFrequency: (p) => getWordFrequency(p),
|
||||
segmentText: (p) => segmentText(p.text, p.locale, p.minLength),
|
||||
getPosTags: () => getPosTags(),
|
||||
}
|
||||
|
||||
// 异步消息处理器(流式操作)
|
||||
|
||||
@@ -80,3 +80,6 @@ export type {
|
||||
FilterResult,
|
||||
FilterMessage,
|
||||
} from './session'
|
||||
|
||||
// NLP 查询
|
||||
export { getWordFrequency, segmentText, getPosTags } from './nlp'
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
/**
|
||||
* NLP 查询模块
|
||||
* 提供词频统计等 NLP 相关查询功能
|
||||
*/
|
||||
|
||||
import { openDatabase, buildTimeFilter, type TimeFilter } from '../core'
|
||||
import { segment, batchSegmentWithFrequency, getPosTagDefinitions, collectPosTagStats } from '../../nlp'
|
||||
import type { SupportedLocale, WordFrequencyResult, WordFrequencyParams, PosTagInfo, PosTagStat } from '../../nlp'
|
||||
|
||||
/**
 * Compute word-frequency statistics for a session (word-cloud data source).
 *
 * Message contents are loaded from the session database, restricted by the
 * optional time range and member, and limited to non-empty rows with
 * `msg.type = 0` whose sender account name is not '系统消息' (system message).
 * The texts are then segmented and counted.
 *
 * Notes on the returned numbers:
 * - `totalWords` is the sum of the counts of the returned words only (after
 *   the `minCount` and `topN` cut), and `percentage` is relative to that sum.
 * - `uniqueWords` is the number of returned words.
 * - `posTagStats` is only populated for the 'zh-CN' locale.
 *
 * @param params Query parameters; `topN` defaults to 100, `minCount` to 2,
 *   `posFilterMode` to 'meaningful', `enableStopwords` to true.
 * @returns The statistics, or an all-zero result when the database cannot be
 *   opened or no message matches.
 */
export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResult {
  const {
    sessionId,
    locale,
    timeFilter,
    memberId,
    topN = 100,
    minWordLength,
    minCount = 2,
    posFilterMode = 'meaningful',
    customPosTags,
    enableStopwords = true,
  } = params

  const createEmptyResult = (): WordFrequencyResult => ({
    words: [],
    totalWords: 0,
    totalMessages: 0,
    uniqueWords: 0,
  })

  const db = openDatabase(sessionId)
  if (!db) return createEmptyResult()

  // Time-range and member filter.
  const filter: TimeFilter = { ...timeFilter, memberId }
  const { clause, params: filterParams } = buildTimeFilter(filter, 'msg')

  // Extra condition appended to (or used as) the WHERE clause.
  const contentCondition =
    "COALESCE(m.account_name, '') != '系统消息' AND msg.type = 0 AND msg.content IS NOT NULL AND TRIM(msg.content) != ''"
  const whereClause = clause.includes('WHERE') ? `${clause} AND ${contentCondition}` : ` WHERE ${contentCondition}`

  const rows = db
    .prepare(
      `
      SELECT msg.content
      FROM message msg
      JOIN member m ON msg.sender_id = m.id
      ${whereClause}
    `
    )
    .all(...filterParams) as Array<{ content: string }>

  if (rows.length === 0) return createEmptyResult()

  const texts = rows.map((row) => row.content)

  // Per-tag token counts; part-of-speech tagging exists for Chinese only.
  const posTagStats: PosTagStat[] | undefined =
    locale === 'zh-CN'
      ? Array.from(collectPosTagStats(texts, minWordLength ?? 2, enableStopwords), ([tag, count]) => ({ tag, count }))
      : undefined

  const wordFrequency = batchSegmentWithFrequency(texts, locale, {
    minLength: minWordLength,
    minCount,
    topN,
    posFilterMode,
    customPosTags,
    enableStopwords,
  })

  // Denominator for the percentages: counts of the returned words only.
  const totalWords = [...wordFrequency.values()].reduce((sum, count) => sum + count, 0)

  const words: WordFrequencyResult['words'] = []
  for (const [word, count] of wordFrequency) {
    // Percentage rounded to two decimal places.
    const percentage = totalWords > 0 ? Math.round((count / totalWords) * 10000) / 100 : 0
    words.push({ word, count, percentage })
  }

  return {
    words,
    totalWords,
    totalMessages: rows.length,
    uniqueWords: wordFrequency.size,
    posTagStats,
  }
}
|
||||
|
||||
/**
 * Segment a single text (debugging and ad-hoc use).
 *
 * Delegates to `segment`, passing only `minLength`; all other segmentation
 * options keep their defaults.
 *
 * @param text Text to segment.
 * @param locale Language of the text.
 * @param minLength Optional minimum token length.
 * @returns The filtered tokens.
 */
export function segmentText(text: string, locale: SupportedLocale, minLength?: number): string[] {
  const tokens = segment(text, locale, { minLength })
  return tokens
}
|
||||
|
||||
/**
 * Return the part-of-speech tag definitions.
 *
 * @returns Whatever `getPosTagDefinitions` returns.
 */
export function getPosTags(): PosTagInfo[] {
  const tagDefinitions = getPosTagDefinitions()
  return tagDefinitions
}
|
||||
@@ -199,6 +199,15 @@ export function closeWorker(): void {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== 通用查询 API ====================
|
||||
|
||||
/**
 * Generic worker query, used by newly added query types.
 *
 * Forwards `type` and `payload` to `sendToWorker` unchanged.
 *
 * @param type Name of the query.
 * @param payload Arguments for the query.
 * @returns The value `sendToWorker` resolves with.
 */
export async function query<T = any>(type: string, payload: any): Promise<T> {
  const response = await sendToWorker<T>(type, payload)
  return response
}
|
||||
|
||||
// ==================== 导出的异步 API ====================
|
||||
|
||||
export async function getAvailableYears(sessionId: string): Promise<number[]> {
|
||||
|
||||
Vendored
+63
@@ -632,6 +632,61 @@ interface NetworkApi {
|
||||
testProxyConnection: (proxyUrl: string) => Promise<{ success: boolean; error?: string }>
|
||||
}
|
||||
|
||||
// NLP API 类型 - 自然语言处理功能
|
||||
type SupportedLocale = 'zh-CN' | 'en-US'
|
||||
|
||||
/** 词性过滤模式 */
|
||||
type PosFilterMode = 'all' | 'meaningful' | 'custom'
|
||||
|
||||
interface WordFrequencyItem {
|
||||
word: string
|
||||
count: number
|
||||
percentage: number
|
||||
}
|
||||
|
||||
interface PosTagStat {
|
||||
tag: string
|
||||
count: number
|
||||
}
|
||||
|
||||
interface WordFrequencyResult {
|
||||
words: WordFrequencyItem[]
|
||||
totalWords: number
|
||||
totalMessages: number
|
||||
uniqueWords: number
|
||||
posTagStats?: PosTagStat[]
|
||||
}
|
||||
|
||||
interface WordFrequencyParams {
|
||||
sessionId: string
|
||||
locale: SupportedLocale
|
||||
timeFilter?: { startTs?: number; endTs?: number }
|
||||
memberId?: number
|
||||
topN?: number
|
||||
minWordLength?: number
|
||||
minCount?: number
|
||||
/** 词性过滤模式:all=全部, meaningful=只保留有意义的词, custom=自定义 */
|
||||
posFilterMode?: PosFilterMode
|
||||
/** 自定义词性过滤列表(posFilterMode='custom' 时使用) */
|
||||
customPosTags?: string[]
|
||||
/** 是否启用停用词过滤,默认 true */
|
||||
enableStopwords?: boolean
|
||||
}
|
||||
|
||||
/** 词性标签信息 */
|
||||
interface PosTagInfo {
|
||||
tag: string
|
||||
name: string
|
||||
description: string
|
||||
meaningful: boolean
|
||||
}
|
||||
|
||||
interface NlpApi {
|
||||
getWordFrequency: (params: WordFrequencyParams) => Promise<WordFrequencyResult>
|
||||
segmentText: (text: string, locale: SupportedLocale, minLength?: number) => Promise<string[]>
|
||||
getPosTags: () => Promise<PosTagInfo[]>
|
||||
}
|
||||
|
||||
// Session Index API 类型 - 会话索引功能
|
||||
interface SessionStats {
|
||||
sessionCount: number
|
||||
@@ -716,6 +771,7 @@ declare global {
|
||||
cacheApi: CacheApi
|
||||
networkApi: NetworkApi
|
||||
sessionApi: SessionApi
|
||||
nlpApi: NlpApi
|
||||
}
|
||||
}
|
||||
|
||||
@@ -731,6 +787,7 @@ export {
|
||||
AgentApi,
|
||||
CacheApi,
|
||||
NetworkApi,
|
||||
NlpApi,
|
||||
ProxyConfig,
|
||||
SearchMessageResult,
|
||||
AIConversation,
|
||||
@@ -754,4 +811,10 @@ export {
|
||||
EmbeddingConfig,
|
||||
VectorStoreConfig,
|
||||
RerankConfig,
|
||||
WordFrequencyItem,
|
||||
WordFrequencyResult,
|
||||
WordFrequencyParams,
|
||||
SupportedLocale,
|
||||
PosFilterMode,
|
||||
PosTagInfo,
|
||||
}
|
||||
|
||||
@@ -1180,6 +1180,68 @@ const agentApi = {
|
||||
},
|
||||
}
|
||||
|
||||
// NLP API - 自然语言处理功能
|
||||
/** One entry of the word-frequency list. */
interface WordFrequencyItem {
  /** The word. */
  word: string
  /** Number of occurrences. */
  count: number
  /** Share of the total, as a percentage. */
  percentage: number
}

/** Result of `nlp:getWordFrequency`. */
interface WordFrequencyResult {
  words: WordFrequencyItem[]
  totalWords: number
  totalMessages: number
  uniqueWords: number
  /**
   * Per part-of-speech token counts. Optional: the global type declaration
   * for this result also declares it as optional.
   */
  posTagStats?: Array<{ tag: string; count: number }>
}
|
||||
|
||||
type PosFilterMode = 'all' | 'meaningful' | 'custom'
|
||||
|
||||
interface WordFrequencyParams {
|
||||
sessionId: string
|
||||
locale: 'zh-CN' | 'en-US'
|
||||
timeFilter?: { startTs?: number; endTs?: number }
|
||||
memberId?: number
|
||||
topN?: number
|
||||
minWordLength?: number
|
||||
minCount?: number
|
||||
/** 词性过滤模式:all=全部, meaningful=只保留有意义的词, custom=自定义 */
|
||||
posFilterMode?: PosFilterMode
|
||||
/** 自定义词性过滤列表(posFilterMode='custom' 时使用) */
|
||||
customPosTags?: string[]
|
||||
/** 是否启用停用词过滤,默认 true */
|
||||
enableStopwords?: boolean
|
||||
}
|
||||
|
||||
interface PosTagInfo {
|
||||
tag: string
|
||||
name: string
|
||||
description: string
|
||||
meaningful: boolean
|
||||
}
|
||||
|
||||
const nlpApi = {
  /**
   * Request word-frequency statistics (word-cloud data) over the
   * 'nlp:getWordFrequency' channel.
   */
  getWordFrequency(params: WordFrequencyParams): Promise<WordFrequencyResult> {
    return ipcRenderer.invoke('nlp:getWordFrequency', params)
  },

  /**
   * Segment a single text over the 'nlp:segmentText' channel.
   */
  segmentText(text: string, locale: 'zh-CN' | 'en-US', minLength?: number): Promise<string[]> {
    return ipcRenderer.invoke('nlp:segmentText', text, locale, minLength)
  },

  /**
   * Fetch the part-of-speech tag definitions over the 'nlp:getPosTags' channel.
   */
  getPosTags(): Promise<PosTagInfo[]> {
    return ipcRenderer.invoke('nlp:getPosTags')
  },
}
|
||||
|
||||
// Network API - 网络设置
|
||||
type ProxyMode = 'off' | 'system' | 'manual'
|
||||
|
||||
@@ -1567,6 +1629,7 @@ if (process.contextIsolated) {
|
||||
contextBridge.exposeInMainWorld('cacheApi', cacheApi)
|
||||
contextBridge.exposeInMainWorld('networkApi', networkApi)
|
||||
contextBridge.exposeInMainWorld('sessionApi', sessionApi)
|
||||
contextBridge.exposeInMainWorld('nlpApi', nlpApi)
|
||||
} catch (error) {
|
||||
console.error(error)
|
||||
}
|
||||
@@ -1593,4 +1656,6 @@ if (process.contextIsolated) {
|
||||
window.networkApi = networkApi
|
||||
// @ts-ignore (define in dts)
|
||||
window.sessionApi = sessionApi
|
||||
// @ts-ignore (define in dts)
|
||||
window.nlpApi = nlpApi
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user