feat: 引入分词能力，并新增词云子Tab

2026-05-16 11:29:24 +08:00 · 2026-01-28 00:24:44 +08:00
parent 923c424efe
commit 036141bcb0
28 changed files with 1914 additions and 69 deletions
@@ -0,0 +1,74 @@
+/**
+ * NLP 功能 IPC 处理器
+ * 提供词频统计、分词等 NLP 功能
+ */
+
+import { ipcMain } from 'electron'
+import * as worker from '../worker/workerManager'
+import type { IpcContext } from './types'
+import type { WordFrequencyParams, WordFrequencyResult, SupportedLocale, PosTagInfo } from '../nlp'
+
+/**
+ * Registers the IPC handlers that expose NLP features to the renderer.
+ *
+ * Each handler forwards its request to the worker via `worker.query`. If that
+ * call rejects, the error is logged and an empty result is returned instead of
+ * propagating the failure to the caller.
+ *
+ * @param _ctx IPC context; not used by these handlers
+ */
+export function registerNlpHandlers(_ctx: IpcContext): void {
+  // Word-frequency statistics (used by the word cloud)
+  ipcMain.handle(
+    'nlp:getWordFrequency',
+    async (_event, params: WordFrequencyParams): Promise<WordFrequencyResult> => {
+      try {
+        return await worker.query<WordFrequencyResult>('getWordFrequency', params)
+      } catch (err) {
+        console.error('[NLP] 获取词频统计失败:', err)
+        return { words: [], totalWords: 0, totalMessages: 0, uniqueWords: 0 }
+      }
+    }
+  )
+
+  // Segmentation of a single text (debugging and other ad-hoc uses)
+  ipcMain.handle(
+    'nlp:segmentText',
+    async (_event, text: string, locale: SupportedLocale, minLength?: number): Promise<string[]> => {
+      try {
+        return await worker.query<string[]>('segmentText', { text, locale, minLength })
+      } catch (err) {
+        console.error('[NLP] 分词失败:', err)
+        return []
+      }
+    }
+  )
+
+  // Part-of-speech tag definitions
+  ipcMain.handle('nlp:getPosTags', async (): Promise<PosTagInfo[]> => {
+    try {
+      return await worker.query<PosTagInfo[]>('getPosTags', {})
+    } catch (err) {
+      console.error('[NLP] 获取词性标签失败:', err)
+      return []
+    }
+  })
+}
@@ -13,6 +13,7 @@ import { registerAIHandlers } from './ipc/ai'
 import { registerMessagesHandlers } from './ipc/messages'
 import { registerCacheHandlers } from './ipc/cache'
 import { registerNetworkHandlers } from './ipc/network'
+import { registerNlpHandlers } from './ipc/nlp'
 import { registerAnalyticsHandlers } from './analytics'
 // 导入 Worker 模块（用于异步分析查询和流式导入）
 import * as worker from './worker/workerManager'
@@ -45,6 +46,7 @@ const mainIpcMain = (win: BrowserWindow) => {
  registerMessagesHandlers(context)
  registerCacheHandlers(context)
  registerNetworkHandlers(context)
+  registerNlpHandlers(context)
  registerAnalyticsHandlers()

  console.log('[IpcMain] All IPC handlers registered successfully')
@@ -0,0 +1,7 @@
+/**
+ * NLP 模块统一导出
+ */
+
+export * from './types'
+export * from './stopwords'
+export * from './segmenter'
@@ -0,0 +1,338 @@
+/**
+ * 分词器模块
+ * 中文使用 @node-rs/jieba，其他语言使用 Intl.Segmenter
+ */
+
+import type { SupportedLocale, PosFilterMode, PosTagInfo } from './types'
+import { isStopword } from './stopwords'
+
+// Subset of the Jieba instance API that this module relies on
+interface JiebaInstance {
+  cut: (text: string, hmm?: boolean) => string[] // plain segmentation; NOTE(review): `hmm` presumably toggles jieba's HMM mode — confirm against @node-rs/jieba
+  tag: (text: string) => Array<{ tag: string; word: string }> // segmentation with a part-of-speech tag per word
+}
+
+// Cached Jieba instance; stays null until the first successful load
+let jiebaInstance: JiebaInstance | null = null
+
+/**
+ * Returns the shared Jieba instance, loading `@node-rs/jieba` and its
+ * dictionary on first use.
+ *
+ * @returns the cached (or freshly created) Jieba instance
+ * @throws Error when the module or its dictionary cannot be loaded
+ */
+function getJieba(): JiebaInstance {
+  if (jiebaInstance) {
+    return jiebaInstance
+  }
+
+  try {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    const { Jieba } = require('@node-rs/jieba')
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    const { dict } = require('@node-rs/jieba/dict')
+    const loaded: JiebaInstance = Jieba.withDict(dict)
+    jiebaInstance = loaded
+    console.log('[NLP] jieba 模块加载成功')
+    return loaded
+  } catch (error) {
+    console.error('[NLP] jieba 模块加载失败:', error)
+    throw new Error('jieba 模块加载失败')
+  }
+}
+
+/**
+ * Part-of-speech tag definitions.
+ * Each entry carries the tag, its display name and description, and whether
+ * the tag is treated as "meaningful" (entries flagged `meaningful: true` make
+ * up MEANINGFUL_POS_TAGS).
+ */
+export const POS_TAG_DEFINITIONS: PosTagInfo[] = [
+  // Nouns
+  { tag: 'n', name: '名词', description: '普通名词', meaningful: true },
+  { tag: 'nr', name: '人名', description: '人名', meaningful: true },
+  { tag: 'ns', name: '地名', description: '地名', meaningful: true },
+  { tag: 'nt', name: '机构名', description: '机构团体名', meaningful: true },
+  { tag: 'nz', name: '其他专名', description: '其他专有名词', meaningful: true },
+  { tag: 'nw', name: '作品名', description: '作品名', meaningful: true },
+  // Verbs
+  { tag: 'v', name: '动词', description: '普通动词', meaningful: true },
+  { tag: 'vn', name: '动名词', description: '动名词', meaningful: true },
+  { tag: 'vd', name: '副动词', description: '副动词', meaningful: true },
+  { tag: 'vg', name: '动语素', description: '动词性语素', meaningful: true },
+  // Adjectives
+  { tag: 'a', name: '形容词', description: '普通形容词', meaningful: true },
+  { tag: 'an', name: '名形词', description: '名形词', meaningful: true },
+  { tag: 'ad', name: '副形词', description: '副形词', meaningful: true },
+  { tag: 'ag', name: '形语素', description: '形容词性语素', meaningful: true },
+  // Other meaningful categories
+  { tag: 'i', name: '成语', description: '成语', meaningful: true },
+  { tag: 'l', name: '习用语', description: '习用语', meaningful: true },
+  { tag: 'j', name: '简称', description: '简称略语', meaningful: true },
+  // Adverbs, prepositions, etc. (usually not very meaningful)
+  { tag: 'd', name: '副词', description: '副词', meaningful: false },
+  { tag: 'p', name: '介词', description: '介词', meaningful: false },
+  { tag: 'c', name: '连词', description: '连词', meaningful: false },
+  { tag: 'u', name: '助词', description: '助词', meaningful: false },
+  { tag: 'r', name: '代词', description: '代词', meaningful: false },
+  { tag: 'm', name: '数词', description: '数词', meaningful: false },
+  { tag: 'q', name: '量词', description: '量词', meaningful: false },
+  { tag: 'f', name: '方位词', description: '方位词', meaningful: false },
+  { tag: 't', name: '时间词', description: '时间词', meaningful: false },
+  { tag: 'e', name: '叹词', description: '叹词', meaningful: false },
+  { tag: 'y', name: '语气词', description: '语气词', meaningful: false },
+  { tag: 'o', name: '拟声词', description: '拟声词', meaningful: false },
+  { tag: 'x', name: '非语素字', description: '非语素字', meaningful: false },
+  { tag: 'w', name: '标点符号', description: '标点符号', meaningful: false },
+]
+
+/**
+ * Tags of every definition flagged as meaningful, collected into a Set for
+ * fast membership checks.
+ */
+export const MEANINGFUL_POS_TAGS = new Set(
+  POS_TAG_DEFINITIONS.flatMap((definition) => (definition.meaningful ? [definition.tag] : []))
+)
+
+/**
+ * Returns the full list of part-of-speech tag definitions.
+ *
+ * @returns the shared POS_TAG_DEFINITIONS array itself (not a copy)
+ */
+export function getPosTagDefinitions(): PosTagInfo[] {
+  const definitions = POS_TAG_DEFINITIONS
+  return definitions
+}
+
+// Regular expressions used to clean and filter text.
+// EMOJI_REGEX keeps every range the pattern originally listed and additionally
+// matches any Extended_Pictographic code point, so emoji outside those ranges
+// (for example U+1F900–U+1F9FF) are stripped as well. A surrogate-pair emoji has
+// a string length of 2 and would otherwise pass a minimum word length of 2.
+const EMOJI_REGEX = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\p{Extended_Pictographic}]/gu
+const PUNCTUATION_REGEX = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~，。！？、；：""''（）【】《》…—～·\s]/g
+const URL_REGEX = /https?:\/\/[^\s]+/g
+const MENTION_REGEX = /@[^\s@]+/g
+const PURE_NUMBER_REGEX = /^\d+$/
+
+/**
+ * Strips noise from a text before segmentation.
+ * URLs, @mentions, emoji and punctuation are each replaced by a space (in that
+ * order), whitespace runs are collapsed to a single space, and the result is
+ * trimmed.
+ */
+function cleanText(text: string): string {
+  const noisePatterns = [URL_REGEX, MENTION_REGEX, EMOJI_REGEX, PUNCTUATION_REGEX, /\s+/g]
+  let result = text
+  for (const pattern of noisePatterns) {
+    result = result.replace(pattern, ' ')
+  }
+  return result.trim()
+}
+
+/**
+ * Decides whether a segmented token should be kept.
+ *
+ * A token is rejected when it is empty or whitespace-only, consists solely of
+ * digits, is shorter than `minLength`, or (when `enableStopwords` is on) is a
+ * stopword for `locale`.
+ */
+function isValidWord(word: string, locale: SupportedLocale, minLength: number, enableStopwords: boolean = true): boolean {
+  const isBlank = !word || word.trim().length === 0
+  if (isBlank || PURE_NUMBER_REGEX.test(word) || word.length < minLength) {
+    return false
+  }
+  return !(enableStopwords && isStopword(word, locale))
+}
+
+/**
+ * Options for Chinese segmentation
+ */
+interface ChineseSegmentOptions {
+  /** Part-of-speech filter mode */
+  posFilterMode?: PosFilterMode
+  /** Custom list of part-of-speech tags to keep */
+  customPosTags?: string[]
+}
+
+/**
+ * Counts, per part-of-speech tag, how many valid word occurrences the texts
+ * contain. The counts are used to show how many words each tag covers.
+ * Tagging always goes through jieba and validity is checked with the Chinese
+ * locale, so this is only meaningful for Chinese text.
+ *
+ * @param texts raw message texts
+ * @param minWordLength minimum word length for a word to be counted (default 2)
+ * @param enableStopwords whether stopwords are excluded (default true)
+ * @returns map from tag to occurrence count; if jieba fails, the error is
+ *          logged and whatever was counted so far is returned
+ */
+export function collectPosTagStats(
+  texts: string[],
+  minWordLength: number = 2,
+  enableStopwords: boolean = true
+): Map<string, number> {
+  const posStats = new Map<string, number>()
+
+  try {
+    const jieba = getJieba()
+
+    for (const text of texts) {
+      const cleaned = cleanText(text)
+      if (!cleaned) continue
+
+      for (const item of jieba.tag(cleaned)) {
+        // isValidWord expects (word, locale, minLength, enableStopwords). Passing
+        // the length before the locale disables the length check and makes the
+        // stopword lookup use the wrong word list.
+        if (!isValidWord(item.word, 'zh-CN', minWordLength, enableStopwords)) {
+          continue
+        }
+        posStats.set(item.tag, (posStats.get(item.tag) ?? 0) + 1)
+      }
+    }
+  } catch (error) {
+    console.error('[NLP] 收集词性统计失败:', error)
+  }
+
+  return posStats
+}
+
+/**
+ * Segments Chinese text with jieba.
+ *
+ * In `all` mode the text is cut without any part-of-speech filtering. In the
+ * other modes the text is tagged and only words whose tag is allowed are kept:
+ * the custom tag list for `custom` mode (when one is supplied), otherwise
+ * MEANINGFUL_POS_TAGS.
+ *
+ * @param text text to segment
+ * @param options segmentation options
+ * @returns the words that were kept; an empty array when nothing is left after cleaning
+ */
+function segmentChinese(text: string, options: ChineseSegmentOptions = {}): string[] {
+  const { posFilterMode: mode = 'meaningful', customPosTags: customTags } = options
+  const cleaned = cleanText(text)
+  if (!cleaned) return []
+
+  try {
+    const jieba = getJieba()
+
+    if (mode === 'all') {
+      return jieba.cut(cleaned, false)
+    }
+
+    const taggedWords = jieba.tag(cleaned)
+    const allowed: Set<string> = mode === 'custom' && customTags ? new Set(customTags) : MEANINGFUL_POS_TAGS
+
+    const kept: string[] = []
+    for (const { tag, word } of taggedWords) {
+      if (allowed.has(tag)) {
+        kept.push(word)
+      }
+    }
+    return kept
+  } catch (error) {
+    console.error('[NLP] 中文分词失败:', error)
+    // Fallback: plain cut; if jieba is unusable, split into single characters
+    try {
+      return getJieba().cut(cleaned, false)
+    } catch {
+      return cleaned.split('')
+    }
+  }
+}
+
+// Reused across calls so a batch does not construct one segmenter per message.
+// Stays null until the first successful construction.
+let englishSegmenter: Intl.Segmenter | null = null
+
+/**
+ * Segments English text with Intl.Segmenter (word granularity).
+ *
+ * Only word-like segments are kept and every word is lower-cased. If the
+ * segmenter cannot be created or used, the cleaned text is lower-cased and
+ * split on whitespace instead.
+ *
+ * @param text text to segment
+ * @returns lower-cased words; an empty array when nothing is left after cleaning
+ */
+function segmentEnglish(text: string): string[] {
+  const cleaned = cleanText(text)
+  if (!cleaned) return []
+
+  try {
+    if (!englishSegmenter) {
+      englishSegmenter = new Intl.Segmenter('en', { granularity: 'word' })
+    }
+
+    const words: string[] = []
+    for (const part of englishSegmenter.segment(cleaned)) {
+      if (part.isWordLike) {
+        words.push(part.segment.toLowerCase())
+      }
+    }
+    return words
+  } catch {
+    // Fallback: simple whitespace split
+    return cleaned
+      .toLowerCase()
+      .split(/\s+/)
+      .filter((word) => word.length > 0)
+  }
+}
+
+/**
+ * Segmentation options
+ */
+export interface SegmentOptions {
+  /** Minimum word length (optional; defaults to 2 for Chinese, 3 for English) */
+  minLength?: number
+  /** Part-of-speech filter mode (only effective for Chinese) */
+  posFilterMode?: PosFilterMode
+  /** Custom list of part-of-speech tags to keep */
+  customPosTags?: string[]
+  /** Whether stopword filtering is enabled */
+  enableStopwords?: boolean
+}
+
+/**
+ * General segmentation entry point.
+ *
+ * `zh-CN` text goes through the jieba-based segmenter; every other locale goes
+ * through the English segmenter. The resulting tokens are then filtered with
+ * `isValidWord`.
+ *
+ * @param text text to segment
+ * @param locale language of the text
+ * @param options segmentation options
+ * @returns the segmented words that pass filtering
+ */
+export function segment(
+  text: string,
+  locale: SupportedLocale,
+  options: SegmentOptions = {}
+): string[] {
+  const { minLength, posFilterMode = 'meaningful', customPosTags, enableStopwords = true } = options
+  const isChinese = locale === 'zh-CN'
+  // Default minimum length: 2 for Chinese, 3 otherwise
+  const threshold = minLength ?? (isChinese ? 2 : 3)
+
+  const candidates = isChinese ? segmentChinese(text, { posFilterMode, customPosTags }) : segmentEnglish(text)
+
+  return candidates.filter((candidate) => isValidWord(candidate, locale, threshold, enableStopwords))
+}
+
+/**
+ * Options for batch segmentation with word-frequency counting
+ */
+export interface BatchSegmentOptions extends SegmentOptions {
+  minCount?: number // minimum occurrence count for a word to be kept
+  topN?: number // maximum number of words returned
+}
+
+/**
+ * Segments a batch of texts and counts how often each word occurs.
+ *
+ * Words seen fewer than `minCount` times (default 2) are dropped; the rest are
+ * ordered by descending count and cut off after `topN` entries (default 100).
+ *
+ * @param texts texts to segment
+ * @param locale language of the texts
+ * @param options segmentation and frequency options
+ * @returns map from word to count, in descending count order
+ */
+export function batchSegmentWithFrequency(
+  texts: string[],
+  locale: SupportedLocale,
+  options: BatchSegmentOptions = {}
+): Map<string, number> {
+  const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords } = options
+  const segmentOptions = { minLength, posFilterMode, customPosTags, enableStopwords }
+
+  const counts = new Map<string, number>()
+  for (const text of texts) {
+    for (const word of segment(text, locale, segmentOptions)) {
+      counts.set(word, (counts.get(word) ?? 0) + 1)
+    }
+  }
+
+  // Drop rare words, order by count, keep the first topN
+  const ranked = [...counts]
+    .filter(([, count]) => count >= minCount)
+    .sort(([, left], [, right]) => right - left)
+    .slice(0, topN)
+
+  return new Map(ranked)
+}
@@ -0,0 +1,119 @@
+/**
+ * 停用词表
+ * 用于过滤无意义的高频词
+ */
+
+/** Chinese stopwords (some words appear in more than one group; the Set ignores duplicates) */
+export const CHINESE_STOPWORDS = new Set([
+  // Pronouns
+  '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '它们',
+  '自己', '别人', '大家', '谁', '什么', '哪', '哪里', '哪儿', '这', '那',
+  '这个', '那个', '这些', '那些', '这里', '那里', '这儿', '那儿', '这样', '那样',
+  // Particles
+  '的', '地', '得', '了', '着', '过', '吗', '呢', '吧', '啊',
+  '呀', '哇', '哦', '嗯', '噢', '喔', '呃', '唉', '哎', '嘛',
+  // Prepositions
+  '在', '从', '到', '向', '往', '把', '被', '给', '跟', '和',
+  '与', '对', '比', '为', '因', '由', '以', '按', '用', '让',
+  // Conjunctions
+  '和', '与', '或', '或者', '而', '并', '并且', '但', '但是', '可是',
+  '然而', '不过', '只是', '如果', '要是', '假如', '虽然', '尽管', '即使', '所以',
+  '因此', '于是', '那么', '因为', '由于', '既然', '为了', '以便',
+  // Adverbs
+  '不', '没', '没有', '很', '太', '最', '更', '也', '都', '就',
+  '才', '又', '再', '还', '却', '只', '只是', '已', '已经', '曾',
+  '曾经', '正', '正在', '将', '将要', '会', '能', '可以', '可能', '应该',
+  '必须', '一定', '大概', '也许', '或许', '其实', '确实', '真的', '当然', '一直',
+  '总是', '经常', '常常', '往往', '偶尔', '几乎', '差不多', '简直', '反正', '终于',
+  // Measure words
+  '个', '只', '条', '件', '位', '种', '些', '点', '下', '次',
+  // Numerals
+  '一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
+  '百', '千', '万', '亿', '两', '几', '多', '少', '第', '每',
+  // Verbs (common verbs with little meaning of their own)
+  '是', '有', '在', '做', '去', '来', '说', '看', '想', '要',
+  '能', '会', '让', '给', '叫', '用', '打', '把', '被', '到',
+  // Other common words
+  '上', '下', '前', '后', '里', '外', '中', '内', '左', '右',
+  '东', '南', '西', '北', '时', '时候', '现在', '今天', '明天', '昨天',
+  '年', '月', '日', '号', '点', '分', '秒', '周', '星期',
+  // Common filler words in online chat
+  '好', '好的', '行', '可以', '嗯嗯', '哈', '呵', '额', '恩', '昂',
+  'ok', 'OK', '好吧', '知道', '知道了', '谢谢', '感谢', '抱歉', '不好意思',
+  // Modal and degree words (tagged as nouns/verbs, but carrying no real meaning in chat)
+  '感觉', '有点', '可能', '应该', '好像', '觉得', '认为', '看看', '看到',
+  '说', '问', '找', '弄', '搞', '搞定', '整', '干', '做', '来', '去',
+  '有', '没有', '没', '是不是', '有没有', '能不能', '会不会', '要不要',
+  '怎样', '如何', '为何', '为什么', '怎么', '怎么样', '怎么办',
+  '东西', '事情', '事', '问题', '时候', '地方', '情况', '样子', '意思',
+  '一下', '一点', '一些', '一样', '一起', '一直', '一般', '一定', '差不多',
+])
+
+/** English stopwords (all lower-case; some words appear in more than one group) */
+export const ENGLISH_STOPWORDS = new Set([
+  // Articles
+  'a', 'an', 'the',
+  // Pronouns
+  'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
+  'you', 'your', 'yours', 'yourself', 'yourselves',
+  'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
+  'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
+  'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
+  // Prepositions
+  'in', 'on', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
+  'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
+  'from', 'up', 'down', 'out', 'off', 'over', 'under', 'again', 'further',
+  // Conjunctions
+  'and', 'but', 'or', 'nor', 'so', 'yet', 'both', 'either', 'neither',
+  'not', 'only', 'own', 'same', 'than', 'too', 'very', 'just',
+  // Be verbs
+  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
+  // Have verbs
+  'have', 'has', 'had', 'having',
+  // Do verbs
+  'do', 'does', 'did', 'doing',
+  // Modal verbs
+  'will', 'would', 'shall', 'should', 'can', 'could', 'may', 'might', 'must',
+  // Other common words
+  'if', 'then', 'else', 'when', 'where', 'why', 'how', 'all', 'each',
+  'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
+  'any', 'now', 'here', 'there', 'of', 'as',
+  // Contractions (without apostrophe)
+  'dont', 'doesnt', 'didnt', 'wont', 'wouldnt', 'cant', 'couldnt',
+  'shouldnt', 'isnt', 'arent', 'wasnt', 'werent', 'havent', 'hasnt', 'hadnt',
+  // Contraction stems: the segmenter's text cleaning replaces the apostrophe
+  // with a space, so "don't" arrives here as "don" + "t" and the joined forms
+  // above only match when the user typed the word without an apostrophe.
+  // "won" (from "won't") is left out because it is also an ordinary word.
+  'don', 'doesn', 'didn', 'wouldn', 'couldn', 'shouldn',
+  'isn', 'aren', 'wasn', 'weren', 'haven', 'hasn', 'hadn',
+  // Chat common words
+  'ok', 'okay', 'yes', 'no', 'yeah', 'yep', 'nope', 'sure', 'thanks',
+  'thank', 'please', 'sorry', 'hi', 'hello', 'hey', 'bye', 'goodbye',
+  'well', 'like', 'know', 'think', 'want', 'need', 'get', 'got', 'go',
+  'going', 'come', 'coming', 'make', 'made', 'take', 'took', 'see', 'saw',
+  'look', 'looking', 'say', 'said', 'tell', 'told', 'ask', 'asked',
+  'let', 'put', 'keep', 'give', 'gave', 'find', 'found', 'try', 'tried',
+  // Time words
+  'today', 'tomorrow', 'yesterday', 'now', 'then', 'always', 'never',
+  'sometimes', 'often', 'usually', 'still', 'already', 'soon', 'later',
+])
+
+/**
+ * Picks the stopword set for a language.
+ * @param locale language; `zh-CN` selects the Chinese set, anything else the English set
+ * @returns the stopword set
+ */
+export function getStopwords(locale: string): Set<string> {
+  return locale === 'zh-CN' ? CHINESE_STOPWORDS : ENGLISH_STOPWORDS
+}
+
+/**
+ * Checks whether a word is a stopword for the given language.
+ *
+ * The word is looked up both as given and lower-cased. Every locale other than
+ * `zh-CN` uses the all-lower-case English list, so restricting the lower-casing
+ * to `en-US` would miss capitalised words for those locales. The Chinese list
+ * also contains Latin entries such as "ok".
+ *
+ * @param word word to check
+ * @param locale language
+ * @returns true when the word is a stopword
+ */
+export function isStopword(word: string, locale: string): boolean {
+  const stopwords = getStopwords(locale)
+  return stopwords.has(word) || stopwords.has(word.toLowerCase())
+}
@@ -0,0 +1,96 @@
+/**
+ * NLP 模块类型定义
+ */
+
+/** Supported languages */
+export type SupportedLocale = 'zh-CN' | 'en-US'
+
+/** Segmentation result */
+export interface SegmentResult {
+  /** Words produced by segmentation */
+  words: string[]
+  /** Original text */
+  original: string
+}
+
+/** Word-frequency entry */
+export interface WordFrequencyItem {
+  /** The word */
+  word: string
+  /** Number of occurrences */
+  count: number
+  /** Share as a percentage */
+  percentage: number
+}
+
+/** Part-of-speech statistics entry */
+export interface PosTagStat {
+  /** Part-of-speech tag */
+  tag: string
+  /** Number of words with this tag */
+  count: number
+}
+
+/** Word-frequency statistics result */
+export interface WordFrequencyResult {
+  /** Word-frequency list (descending by occurrence count) */
+  words: WordFrequencyItem[]
+  /** Total word count */
+  totalWords: number
+  /** Total message count */
+  totalMessages: number
+  /** Unique word count */
+  uniqueWords: number
+  /** Part-of-speech statistics (number of words per tag) */
+  posTagStats?: PosTagStat[]
+}
+
+/** Part-of-speech filter mode */
+export type PosFilterMode = 'all' | 'meaningful' | 'custom'
+
+/** Word-frequency query parameters */
+export interface WordFrequencyParams {
+  /** Session ID */
+  sessionId: string
+  /** User language setting */
+  locale: SupportedLocale
+  /** Time filter */
+  timeFilter?: {
+    startTs?: number
+    endTs?: number
+  }
+  /** Member ID (restricts the query to one member) */
+  memberId?: number
+  /** Return the top N most frequent words; defaults to 100 */
+  topN?: number
+  /** Minimum word length; defaults to 2 for Chinese, 3 for English */
+  minWordLength?: number
+  /** Minimum occurrence count; defaults to 2 */
+  minCount?: number
+  /** Part-of-speech filter mode: all = everything, meaningful = keep only meaningful words, custom = user-defined */
+  posFilterMode?: PosFilterMode
+  /** Custom part-of-speech tag list (used when posFilterMode = 'custom') */
+  customPosTags?: string[]
+  /** Whether stopword filtering is enabled; defaults to true */
+  enableStopwords?: boolean
+}
+
+/** Part-of-speech tag information */
+export interface PosTagInfo {
+  /** Part-of-speech tag */
+  tag: string
+  /** Tag name (in Chinese) */
+  name: string
+  /** Tag description */
+  description: string
+  /** Whether the tag counts as meaningful */
+  meaningful: boolean
+}
+
+/** Segmenter configuration */
+export interface SegmenterConfig {
+  /** Language */
+  locale: SupportedLocale
+  /** Custom dictionary path (optional; reserved for future extension) */
+  customDictPath?: string
+}
@@ -62,6 +62,10 @@ import {
  // 自定义筛选
  filterMessagesWithContext,
  getMultipleSessionsMessages,
+  // NLP 查询
+  getWordFrequency,
+  segmentText,
+  getPosTags,
 } from './query'
 import { streamImport, streamParseFileInfo, analyzeIncrementalImport, incrementalImport } from './import'

@@ -148,6 +152,11 @@ const syncHandlers: Record<string, (payload: any) => any> = {
  filterMessagesWithContext: (p) =>
    filterMessagesWithContext(p.sessionId, p.keywords, p.timeFilter, p.senderIds, p.contextSize),
  getMultipleSessionsMessages: (p) => getMultipleSessionsMessages(p.sessionId, p.chatSessionIds),
+
+  // NLP 查询
+  getWordFrequency: (p) => getWordFrequency(p),
+  segmentText: (p) => segmentText(p.text, p.locale, p.minLength),
+  getPosTags: () => getPosTags(),
 }

 // 异步消息处理器（流式操作）
@@ -80,3 +80,6 @@ export type {
  FilterResult,
  FilterMessage,
 } from './session'
+
+// NLP 查询
+export { getWordFrequency, segmentText, getPosTags } from './nlp'
@@ -0,0 +1,137 @@
+/**
+ * NLP 查询模块
+ * 提供词频统计等 NLP 相关查询功能
+ */
+
+import { openDatabase, buildTimeFilter, type TimeFilter } from '../core'
+import { segment, batchSegmentWithFrequency, getPosTagDefinitions, collectPosTagStats } from '../../nlp'
+import type { SupportedLocale, WordFrequencyResult, WordFrequencyParams, PosTagInfo, PosTagStat } from '../../nlp'
+
+/**
+ * Computes word-frequency statistics for a session (used by the word cloud).
+ *
+ * Loads the content of matching messages, optionally restricted by time range
+ * and member, segments them and counts the words. For `zh-CN` it also collects
+ * per-tag part-of-speech statistics.
+ *
+ * `totalWords` and `uniqueWords` describe the returned list only, i.e. the
+ * words that remain after the `minCount` and `topN` cut-offs. Each percentage
+ * is relative to that `totalWords` and rounded to two decimals.
+ *
+ * @param params query parameters
+ * @returns the statistics; an all-empty result when the database cannot be
+ *          opened or no message matches
+ */
+export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResult {
+  const {
+    sessionId,
+    locale,
+    timeFilter,
+    memberId,
+    topN = 100,
+    minWordLength,
+    minCount = 2,
+    posFilterMode = 'meaningful',
+    customPosTags,
+    enableStopwords = true,
+  } = params
+
+  const emptyResult = (): WordFrequencyResult => ({
+    words: [],
+    totalWords: 0,
+    totalMessages: 0,
+    uniqueWords: 0,
+  })
+
+  const db = openDatabase(sessionId)
+  if (!db) {
+    return emptyResult()
+  }
+
+  // Time and member filter
+  const filter: TimeFilter = { ...timeFilter, memberId }
+  const { clause, params: filterParams } = buildTimeFilter(filter, 'msg')
+
+  // Extra conditions: skip the system-message sender, keep msg.type = 0 rows with non-blank content
+  const contentCondition =
+    "COALESCE(m.account_name, '') != '系统消息' AND msg.type = 0 AND msg.content IS NOT NULL AND TRIM(msg.content) != ''"
+  const whereClause = clause.includes('WHERE') ? `${clause} AND ${contentCondition}` : ` WHERE ${contentCondition}`
+
+  // Load message contents
+  const rows = db
+    .prepare(
+      `
+      SELECT msg.content
+      FROM message msg
+      JOIN member m ON msg.sender_id = m.id
+      ${whereClause}
+      `
+    )
+    .all(...filterParams) as Array<{ content: string }>
+
+  if (rows.length === 0) {
+    return emptyResult()
+  }
+
+  const texts = rows.map((row) => row.content)
+
+  // Part-of-speech statistics (Chinese only)
+  let posTagStats: PosTagStat[] | undefined
+  if (locale === 'zh-CN') {
+    const tagCounts = collectPosTagStats(texts, minWordLength ?? 2, enableStopwords)
+    posTagStats = Array.from(tagCounts, ([tag, count]) => ({ tag, count }))
+  }
+
+  // Segment all texts and count word frequencies
+  const frequencies = batchSegmentWithFrequency(texts, locale, {
+    minLength: minWordLength,
+    minCount,
+    topN,
+    posFilterMode,
+    customPosTags,
+    enableStopwords,
+  })
+
+  // Sum of the returned counts; the denominator for the percentages
+  const totalWords = [...frequencies.values()].reduce((sum, count) => sum + count, 0)
+
+  const words = Array.from(frequencies, ([word, count]) => ({
+    word,
+    count,
+    percentage: totalWords > 0 ? Math.round((count / totalWords) * 10000) / 100 : 0,
+  }))
+
+  return {
+    words,
+    totalWords,
+    totalMessages: rows.length,
+    uniqueWords: frequencies.size,
+    posTagStats,
+  }
+}
+
+/**
+ * Segments a single text (for debugging or other ad-hoc uses).
+ *
+ * @param text text to segment
+ * @param locale language of the text
+ * @param minLength optional minimum word length passed on to `segment`
+ * @returns the segmented, filtered words
+ */
+export function segmentText(
+  text: string,
+  locale: SupportedLocale,
+  minLength?: number
+): string[] {
+  const options = { minLength }
+  return segment(text, locale, options)
+}
+
+/**
+ * Returns the part-of-speech tag definitions provided by the NLP module.
+ */
+export function getPosTags(): PosTagInfo[] {
+  const definitions = getPosTagDefinitions()
+  return definitions
+}
@@ -199,6 +199,15 @@ export function closeWorker(): void {
  }
 }

+// ==================== Generic query API ====================
+
+/**
+ * Generic query helper for newly added query types.
+ * Passes `type` and `payload` to `sendToWorker` unchanged and resolves with its result.
+ */
+export async function query<T = any>(type: string, payload: any): Promise<T> {
+  const pending = sendToWorker<T>(type, payload)
+  return pending
+}
+
 // ==================== 导出的异步 API ====================

 export async function getAvailableYears(sessionId: string): Promise<number[]> {
@@ -632,6 +632,61 @@ interface NetworkApi {
  testProxyConnection: (proxyUrl: string) => Promise<{ success: boolean; error?: string }>
 }

+// NLP API types - natural language processing features
+type SupportedLocale = 'zh-CN' | 'en-US'
+
+/** Part-of-speech filter mode */
+type PosFilterMode = 'all' | 'meaningful' | 'custom'
+
+interface WordFrequencyItem {
+  word: string // the word
+  count: number // number of occurrences
+  percentage: number // share as a percentage
+}
+
+interface PosTagStat {
+  tag: string // part-of-speech tag
+  count: number // number of words with this tag
+}
+
+interface WordFrequencyResult {
+  words: WordFrequencyItem[] // word-frequency list
+  totalWords: number // total word count
+  totalMessages: number // total message count
+  uniqueWords: number // unique word count
+  posTagStats?: PosTagStat[] // per-tag part-of-speech statistics, when provided
+}
+
+interface WordFrequencyParams {
+  sessionId: string // session ID
+  locale: SupportedLocale // language of the text
+  timeFilter?: { startTs?: number; endTs?: number } // optional time range
+  memberId?: number // restricts the query to one member
+  topN?: number // maximum number of words returned
+  minWordLength?: number // minimum word length
+  minCount?: number // minimum occurrence count
+  /** Part-of-speech filter mode: all = everything, meaningful = keep only meaningful words, custom = user-defined */
+  posFilterMode?: PosFilterMode
+  /** Custom part-of-speech tag list (used when posFilterMode = 'custom') */
+  customPosTags?: string[]
+  /** Whether stopword filtering is enabled; defaults to true */
+  enableStopwords?: boolean
+}
+
+/** Part-of-speech tag information */
+interface PosTagInfo {
+  tag: string // part-of-speech tag
+  name: string // tag name
+  description: string // tag description
+  meaningful: boolean // whether the tag counts as meaningful
+}
+
+interface NlpApi {
+  getWordFrequency: (params: WordFrequencyParams) => Promise<WordFrequencyResult> // word-frequency statistics
+  segmentText: (text: string, locale: SupportedLocale, minLength?: number) => Promise<string[]> // segments a single text
+  getPosTags: () => Promise<PosTagInfo[]> // part-of-speech tag definitions
+}
+
 // Session Index API 类型 - 会话索引功能
 interface SessionStats {
  sessionCount: number
@@ -716,6 +771,7 @@ declare global {
    cacheApi: CacheApi
    networkApi: NetworkApi
    sessionApi: SessionApi
+    nlpApi: NlpApi
  }
 }

@@ -731,6 +787,7 @@ export {
  AgentApi,
  CacheApi,
  NetworkApi,
+  NlpApi,
  ProxyConfig,
  SearchMessageResult,
  AIConversation,
@@ -754,4 +811,10 @@ export {
  EmbeddingConfig,
  VectorStoreConfig,
  RerankConfig,
+  WordFrequencyItem,
+  WordFrequencyResult,
+  WordFrequencyParams,
+  SupportedLocale,
+  PosFilterMode,
+  PosTagInfo,
 }
@@ -1180,6 +1180,68 @@ const agentApi = {
  },
 }

+// NLP API - natural language processing features
+interface WordFrequencyItem {
+  word: string // the word
+  count: number // number of occurrences
+  percentage: number // share as a percentage
+}
+
+/**
+ * Word-frequency statistics returned over the 'nlp:getWordFrequency' channel.
+ * `posTagStats` is optional; the other declarations of this result type in the
+ * project include it, so it is declared here as well.
+ */
+interface WordFrequencyResult {
+  words: WordFrequencyItem[]
+  totalWords: number
+  totalMessages: number
+  uniqueWords: number
+  /** Per-tag part-of-speech statistics, when provided */
+  posTagStats?: Array<{ tag: string; count: number }>
+}
+
+type PosFilterMode = 'all' | 'meaningful' | 'custom' // part-of-speech filter mode
+
+interface WordFrequencyParams {
+  sessionId: string // session ID
+  locale: 'zh-CN' | 'en-US' // language of the text
+  timeFilter?: { startTs?: number; endTs?: number } // optional time range
+  memberId?: number // restricts the query to one member
+  topN?: number // maximum number of words returned
+  minWordLength?: number // minimum word length
+  minCount?: number // minimum occurrence count
+  /** Part-of-speech filter mode: all = everything, meaningful = keep only meaningful words, custom = user-defined */
+  posFilterMode?: PosFilterMode
+  /** Custom part-of-speech tag list (used when posFilterMode = 'custom') */
+  customPosTags?: string[]
+  /** Whether stopword filtering is enabled; defaults to true */
+  enableStopwords?: boolean
+}
+
+interface PosTagInfo {
+  tag: string // part-of-speech tag
+  name: string // tag name
+  description: string // tag description
+  meaningful: boolean // whether the tag counts as meaningful
+}
+
+const nlpApi = {
+  /**
+   * Requests word-frequency statistics (used by the word cloud).
+   */
+  getWordFrequency: (params: WordFrequencyParams): Promise<WordFrequencyResult> =>
+    ipcRenderer.invoke('nlp:getWordFrequency', params),
+
+  /**
+   * Segments a single text.
+   */
+  segmentText: (text: string, locale: 'zh-CN' | 'en-US', minLength?: number): Promise<string[]> =>
+    ipcRenderer.invoke('nlp:segmentText', text, locale, minLength),
+
+  /**
+   * Requests the part-of-speech tag definitions.
+   */
+  getPosTags: (): Promise<PosTagInfo[]> => ipcRenderer.invoke('nlp:getPosTags'),
+}
+
 // Network API - 网络设置
 type ProxyMode = 'off' | 'system' | 'manual'

@@ -1567,6 +1629,7 @@ if (process.contextIsolated) {
    contextBridge.exposeInMainWorld('cacheApi', cacheApi)
    contextBridge.exposeInMainWorld('networkApi', networkApi)
    contextBridge.exposeInMainWorld('sessionApi', sessionApi)
+    contextBridge.exposeInMainWorld('nlpApi', nlpApi)
  } catch (error) {
    console.error(error)
  }
@@ -1593,4 +1656,6 @@ if (process.contextIsolated) {
  window.networkApi = networkApi
  // @ts-ignore (define in dts)
  window.sessionApi = sessionApi
+  // @ts-ignore (define in dts)
+  window.nlpApi = nlpApi
 }