ChatLab/electron/main/nlp/segmenter.ts

/**
 * 分词器模块
 * 中文使用 @node-rs/jieba，其他语言使用 Intl.Segmenter
 *
 * 支持多词库：默认内置简体中文词库，可通过 dictType 加载繁体中文等自定义词库。
 * 自定义词库文件存储在 nlpDir 目录下（由 Worker 初始化时传入）。
 */

import * as fs from 'fs'
import * as path from 'path'
import type { SupportedLocale, PosFilterMode, PosTagInfo, DictType } from './types'
import { isStopword } from './stopwords'

export type { DictType }

interface JiebaInstance {
  cut: (text: string, hmm?: boolean) => string[]
  tag: (text: string) => Array<{ tag: string; word: string }>
}

let _nlpDir: string | null = null

const jiebaInstances = new Map<DictType, JiebaInstance>()

/**
 * 由 Worker 初始化时调用，设置自定义词库目录路径
 */
export function initNlpDir(nlpDir: string): void {
  _nlpDir = nlpDir
}

/**
 * 尝试从 nlpDir 加载词库文件，返回 Buffer 或 null
 */
function tryLoadDictFromDisk(dictId: string): Buffer | null {
  if (!_nlpDir) return null
  const dictPath = path.join(_nlpDir, `${dictId}.dict`)
  if (!fs.existsSync(dictPath)) return null
  try {
    return fs.readFileSync(dictPath)
  } catch {
    return null
  }
}

/**
 * 获取 Jieba 实例（支持多词库）
 *
 * 所有词库均从 nlpDir 磁盘加载（由应用启动时自动下载）。
 * default 和 zh-CN 共用同一实例。
 */
export function getJieba(dictType: DictType = 'default'): JiebaInstance {
  const effectiveType = dictType === 'default' ? 'zh-CN' : dictType
  const cached = jiebaInstances.get(effectiveType)
  if (cached) return cached

  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const { Jieba } = require('@node-rs/jieba')

    const diskDict = tryLoadDictFromDisk(effectiveType)
    if (!diskDict) {
      throw new Error(`Dict file not found for: ${effectiveType}. Please ensure the dictionary has been downloaded.`)
    }

    const instance: JiebaInstance = Jieba.withDict(diskDict)
    console.log(`[NLP] jieba dict loaded: ${effectiveType} (${diskDict.length} bytes)`)

    jiebaInstances.set(effectiveType, instance)
    return instance
  } catch (error) {
    console.error(`[NLP] Failed to load jieba module (dict=${effectiveType}):`, error)
    throw new Error(`jieba 模块加载失败 (${effectiveType})`)
  }
}

/**
 * 清除指定词库的缓存实例（词库更新后调用）
 */
export function clearJiebaInstance(dictType: DictType): void {
  jiebaInstances.delete(dictType)
  console.log(`[NLP] jieba instance cleared: ${dictType}`)
}

/**
 * 词性标签定义
 */
export const POS_TAG_DEFINITIONS: PosTagInfo[] = [
  // 名词类
  { tag: 'n', name: '名词', description: '普通名词', meaningful: true },
  { tag: 'nr', name: '人名', description: '人名', meaningful: true },
  { tag: 'ns', name: '地名', description: '地名', meaningful: true },
  { tag: 'nt', name: '机构名', description: '机构团体名', meaningful: true },
  { tag: 'nz', name: '其他专名', description: '其他专有名词', meaningful: true },
  { tag: 'nw', name: '作品名', description: '作品名', meaningful: true },
  // 动词类（普通动词通常不太有意义，如"是""有""说"等）
  { tag: 'v', name: '动词', description: '普通动词', meaningful: false },
  { tag: 'vn', name: '动名词', description: '动名词', meaningful: true },
  { tag: 'vd', name: '副动词', description: '副动词', meaningful: false },
  { tag: 'vg', name: '动语素', description: '动词性语素', meaningful: false },
  // 形容词类
  { tag: 'a', name: '形容词', description: '普通形容词', meaningful: true },
  { tag: 'an', name: '名形词', description: '名形词', meaningful: true },
  { tag: 'ad', name: '副形词', description: '副形词', meaningful: true },
  { tag: 'ag', name: '形语素', description: '形容词性语素', meaningful: true },
  // 其他有意义
  { tag: 'i', name: '成语', description: '成语', meaningful: true },
  { tag: 'l', name: '习用语', description: '习用语', meaningful: true },
  { tag: 'j', name: '简称', description: '简称略语', meaningful: true },
  // 副词、介词等（通常不太有意义）
  { tag: 'd', name: '副词', description: '副词', meaningful: false },
  { tag: 'p', name: '介词', description: '介词', meaningful: false },
  { tag: 'c', name: '连词', description: '连词', meaningful: false },
  { tag: 'u', name: '助词', description: '助词', meaningful: false },
  { tag: 'r', name: '代词', description: '代词', meaningful: false },
  { tag: 'm', name: '数词', description: '数词', meaningful: false },
  { tag: 'q', name: '量词', description: '量词', meaningful: false },
  { tag: 'f', name: '方位词', description: '方位词', meaningful: false },
  { tag: 't', name: '时间词', description: '时间词', meaningful: false },
  { tag: 'e', name: '叹词', description: '叹词', meaningful: false },
  { tag: 'y', name: '语气词', description: '语气词', meaningful: false },
  { tag: 'o', name: '拟声词', description: '拟声词', meaningful: false },
  { tag: 'x', name: '非语素字', description: '非语素字', meaningful: false },
  { tag: 'w', name: '标点符号', description: '标点符号', meaningful: false },
]

/**
 * 有意义的词性标签集合
 */
export const MEANINGFUL_POS_TAGS = new Set(POS_TAG_DEFINITIONS.filter((t) => t.meaningful).map((t) => t.tag))

/**
 * 获取所有词性标签信息
 */
export function getPosTagDefinitions(): PosTagInfo[] {
  return POS_TAG_DEFINITIONS
}

// 用于过滤的正则表达式
const EMOJI_REGEX =
  /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]/gu
const PUNCTUATION_REGEX = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~，。！？、；：""''（）【】《》…—～·\s]/g
const URL_REGEX = /https?:\/\/[^\s]+/g
const MENTION_REGEX = /@[^\s@]+/g
const PURE_NUMBER_REGEX = /^\d+$/

/**
 * 清理文本
 * 移除表情、URL、@提及、标点等
 */
function cleanText(text: string): string {
  return text
    .replace(URL_REGEX, ' ')
    .replace(MENTION_REGEX, ' ')
    .replace(EMOJI_REGEX, ' ')
    .replace(PUNCTUATION_REGEX, ' ')
    .replace(/\s+/g, ' ')
    .trim()
}

/**
 * 判断是否为有效词语
 */
function isValidWord(
  word: string,
  locale: SupportedLocale,
  minLength: number,
  enableStopwords: boolean = true
): boolean {
  // 空字符串
  if (!word || word.trim().length === 0) return false

  // 纯数字
  if (PURE_NUMBER_REGEX.test(word)) return false

  // 长度不足
  if (word.length < minLength) return false

  // 停用词
  if (enableStopwords && isStopword(word, locale)) return false

  return true
}

/**
 * 中文分词选项
 */
interface ChineseSegmentOptions {
  /** 词性过滤模式 */
  posFilterMode?: PosFilterMode
  /** 自定义词性过滤列表 */
  customPosTags?: string[]
  /** 词库类型 */
  dictType?: DictType
}

/**
 * 收集文本的词性统计（用于显示每个词性有多少词）
 * 只统计中文，英文无词性标注
 */
export function collectPosTagStats(
  texts: string[],
  minWordLength: number = 2,
  enableStopwords: boolean = true,
  dictType: DictType = 'default'
): Map<string, number> {
  const posStats = new Map<string, number>()

  try {
    const jieba = getJieba(dictType)

    for (const text of texts) {
      const cleaned = cleanText(text)
      if (!cleaned) continue

      const tagged = jieba.tag(cleaned)

      for (const item of tagged) {
        // 这里固定按中文停用词规则校验，但参数顺序必须与 isValidWord 保持一致，
        // 否则会把最小词长误传成 locale，最终触发 locale.startsWith 的运行时错误。
        if (!isValidWord(item.word, 'zh-CN', minWordLength, enableStopwords)) {
          continue
        }
        posStats.set(item.tag, (posStats.get(item.tag) || 0) + 1)
      }
    }
  } catch (error) {
    console.error('[NLP] Failed to collect POS stats:', error)
  }

  return posStats
}

/**
 * 中文分词（使用 jieba 词性标注）
 * @param text 文本
 * @param options 分词选项
 */
function segmentChinese(text: string, options: ChineseSegmentOptions = {}): string[] {
  const { posFilterMode = 'meaningful', customPosTags, dictType = 'default' } = options
  const cleaned = cleanText(text)
  if (!cleaned) return []

  try {
    const jieba = getJieba(dictType)

    if (posFilterMode === 'all') {
      return jieba.cut(cleaned, false)
    }

    const tagged = jieba.tag(cleaned)

    let allowedTags: Set<string>
    if (posFilterMode === 'custom' && customPosTags) {
      allowedTags = new Set(customPosTags)
    } else {
      allowedTags = MEANINGFUL_POS_TAGS
    }

    return tagged.filter((item) => allowedTags.has(item.tag)).map((item) => item.word)
  } catch (error) {
    console.error('[NLP] Chinese segmentation failed:', error)
    try {
      const jieba = getJieba('default')
      return jieba.cut(cleaned, false)
    } catch {
      return cleaned.split('')
    }
  }
}

/**
 * 英文分词（使用 Intl.Segmenter）
 */
function segmentEnglish(text: string): string[] {
  const cleaned = cleanText(text)
  if (!cleaned) return []

  try {
    const segmenter = new Intl.Segmenter('en', { granularity: 'word' })
    const segments = segmenter.segment(cleaned)

    return [...segments].filter((segment) => segment.isWordLike).map((segment) => segment.segment.toLowerCase())
  } catch {
    return cleaned
      .toLowerCase()
      .split(/\s+/)
      .filter((word) => word.length > 0)
  }
}

/**
 * 日语分词（使用 Intl.Segmenter）
 */
function segmentJapanese(text: string): string[] {
  const cleaned = cleanText(text)
  if (!cleaned) return []

  try {
    const segmenter = new Intl.Segmenter('ja', { granularity: 'word' })
    const segments = segmenter.segment(cleaned)

    return [...segments].filter((segment) => segment.isWordLike).map((segment) => segment.segment)
  } catch {
    return cleaned.split('').filter((ch) => ch.trim().length > 0)
  }
}

/**
 * 分词选项
 */
export interface SegmentOptions {
  /** 最小词长（可选，默认中文2，英文3） */
  minLength?: number
  /** 词性过滤模式（仅中文有效） */
  posFilterMode?: PosFilterMode
  /** 自定义词性过滤列表 */
  customPosTags?: string[]
  /** 是否启用停用词过滤 */
  enableStopwords?: boolean
  /** 词库类型（仅中文有效） */
  dictType?: DictType
}

/**
 * 通用分词入口
 * @param text 待分词文本
 * @param locale 语言
 * @param options 分词选项
 * @returns 过滤后的分词结果
 */
export function segment(text: string, locale: SupportedLocale, options: SegmentOptions = {}): string[] {
  const {
    minLength,
    posFilterMode = 'meaningful',
    customPosTags,
    enableStopwords = true,
    dictType = 'default',
  } = options
  const isChinese = locale.startsWith('zh')
  const isJapanese = locale === 'ja-JP'
  const defaultMinLength = isChinese || isJapanese ? 2 : 3
  const effectiveMinLength = minLength ?? defaultMinLength

  let words: string[]

  if (isChinese) {
    words = segmentChinese(text, { posFilterMode, customPosTags, dictType })
  } else if (isJapanese) {
    words = segmentJapanese(text)
  } else {
    words = segmentEnglish(text)
  }

  return words.filter((word) => isValidWord(word, locale, effectiveMinLength, enableStopwords))
}

/**
 * 批量分词并统计词频选项
 */
export interface BatchSegmentOptions extends SegmentOptions {
  minCount?: number
  topN?: number
  /** 要排除的词语列表 */
  excludeWords?: string[]
}

export interface BatchSegmentResult {
  /** topN 裁剪后的词频 Map */
  words: Map<string, number>
  /** 裁剪前的去重词数（过滤低频词后） */
  uniqueWords: number
  /** 裁剪前的总词次（过滤低频词后所有词的出现次数之和） */
  totalWords: number
}

/**
 * 批量分词并统计词频
 * @param texts 文本数组
 * @param locale 语言
 * @param options 选项
 */
export function batchSegmentWithFrequency(
  texts: string[],
  locale: SupportedLocale,
  options: BatchSegmentOptions = {}
): BatchSegmentResult {
  const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords, dictType, excludeWords } = options
  const wordFrequency = new Map<string, number>()
  const excludeSet = excludeWords?.length ? new Set(excludeWords.map((w) => w.toLowerCase())) : null

  for (const text of texts) {
    const words = segment(text, locale, { minLength, posFilterMode, customPosTags, enableStopwords, dictType })
    for (const word of words) {
      if (excludeSet && excludeSet.has(word.toLowerCase())) continue
      wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1)
    }
  }

  // 过滤低频词
  const filtered = new Map<string, number>()
  let totalWords = 0
  for (const [word, count] of wordFrequency) {
    if (count >= minCount) {
      filtered.set(word, count)
      totalWords += count
    }
  }

  // 排序并取 topN
  const sorted = [...filtered.entries()].sort((a, b) => b[1] - a[1]).slice(0, topN)

  return {
    words: new Map(sorted),
    uniqueWords: filtered.size,
    totalWords,
  }
}