Files
ChatLab/electron/main/nlp/segmenter.ts
T

419 lines
13 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
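/**
 * Tokenizer module.
 * Chinese text is segmented with @node-rs/jieba; other languages use Intl.Segmenter.
 *
 * Multiple dictionaries are supported and selected via dictType (for example
 * Simplified or Traditional Chinese). Every dictionary, including the default
 * Simplified Chinese one, is loaded from a file in the nlpDir directory; that
 * path is passed in when the Worker initializes (see initNlpDir).
 */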
import * as fs from 'fs'
import * as path from 'path'
import type { SupportedLocale, PosFilterMode, PosTagInfo, DictType } from './types'
import { isStopword } from './stopwords'
export type { DictType }
/**
 * Minimal structural type for the jieba object used by this module
 * (the value produced by `Jieba.withDict(...)` in `getJieba`).
 */
interface JiebaInstance {
/** Splits text into words. Every caller in this file passes `false` for `hmm`. */
cut: (text: string, hmm?: boolean) => string[]
/** Returns the words of the text, each paired with a tag string (used here as a part-of-speech tag). */
tag: (text: string) => Array<{ tag: string; word: string }>
}
// Directory that holds the dictionary files; stays null until initNlpDir is called.
let _nlpDir: string | null = null
// Cache of created jieba instances, keyed by the dictionary key getJieba loaded them with.
const jiebaInstances = new Map<DictType, JiebaInstance>()
/**
 * Sets the directory from which dictionary files are read.
 * Meant to be called by the Worker during its initialization.
 *
 * @param nlpDir Directory path stored in the module-level `_nlpDir`;
 *   `tryLoadDictFromDisk` later looks for `<dictId>.dict` inside it.
 */
export function initNlpDir(nlpDir: string): void {
_nlpDir = nlpDir
}
/**
 * Reads `<dictId>.dict` from the configured nlpDir.
 *
 * @param dictId Dictionary identifier used as the file's base name.
 * @returns The file contents, or null when nlpDir is unset, the file does not
 *   exist, or reading it fails.
 */
function tryLoadDictFromDisk(dictId: string): Buffer | null {
  const baseDir = _nlpDir
  if (!baseDir) {
    return null
  }

  const dictFile = path.join(baseDir, `${dictId}.dict`)
  let contents: Buffer | null = null
  if (fs.existsSync(dictFile)) {
    try {
      contents = fs.readFileSync(dictFile)
    } catch {
      // Best effort: an unreadable file is reported the same way as a missing one.
      contents = null
    }
  }
  return contents
}
/**
 * Returns the jieba instance for a dictionary, creating and caching it on first use.
 *
 * 'default' is treated as an alias of 'zh-CN', so both share one cached instance.
 * The dictionary is always read from a `.dict` file under the configured nlpDir.
 *
 * @param dictType Dictionary to use; defaults to 'default'.
 * @returns The cached or newly created jieba instance.
 * @throws Error when the jieba module cannot be loaded or the dictionary file is
 *   unavailable. The underlying error is logged and a generic error is thrown.
 */
export function getJieba(dictType: DictType = 'default'): JiebaInstance {
  const dictKey = dictType === 'default' ? 'zh-CN' : dictType

  const existing = jiebaInstances.get(dictKey)
  if (existing) {
    return existing
  }

  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const { Jieba: JiebaCtor } = require('@node-rs/jieba')
    const dictBuffer = tryLoadDictFromDisk(dictKey)
    if (dictBuffer === null) {
      throw new Error(`Dict file not found for: ${dictKey}. Please ensure the dictionary has been downloaded.`)
    }

    const created: JiebaInstance = JiebaCtor.withDict(dictBuffer)
    console.log(`[NLP] jieba dict loaded: ${dictKey} (${dictBuffer.length} bytes)`)
    jiebaInstances.set(dictKey, created)
    return created
  } catch (loadError) {
    // Covers both a failed module load and the missing-dictionary error thrown above.
    console.error(`[NLP] Failed to load jieba module (dict=${dictKey}):`, loadError)
    throw new Error(`jieba 模块加载失败 (${dictKey})`)
  }
}
/**
 * Drops the cached jieba instance for a dictionary so that the next `getJieba`
 * call reloads it from disk. Call this after a dictionary file has been updated.
 *
 * `getJieba` stores the 'default' dictionary under the 'zh-CN' cache key, so the
 * same aliasing is applied here; deleting under the raw 'default' key would leave
 * the stale instance in the cache.
 *
 * @param dictType Dictionary whose cached instance should be removed.
 */
export function clearJiebaInstance(dictType: DictType): void {
  const effectiveType = dictType === 'default' ? 'zh-CN' : dictType
  jiebaInstances.delete(effectiveType)
  console.log(`[NLP] jieba instance cleared: ${dictType}`)
}
/**
 * Part-of-speech tag table, one row per tag:
 * [tag, display name, description, meaningful].
 */
const POS_TAG_ROWS: ReadonlyArray<readonly [string, string, string, boolean]> = [
  // Nouns
  ['n', '名词', '普通名词', true],
  ['nr', '人名', '人名', true],
  ['ns', '地名', '地名', true],
  ['nt', '机构名', '机构团体名', true],
  ['nz', '其他专名', '其他专有名词', true],
  ['nw', '作品名', '作品名', true],
  // Verbs (plain verbs such as "be", "have", "say" are usually not informative)
  ['v', '动词', '普通动词', false],
  ['vn', '动名词', '动名词', true],
  ['vd', '副动词', '副动词', false],
  ['vg', '动语素', '动词性语素', false],
  // Adjectives
  ['a', '形容词', '普通形容词', true],
  ['an', '名形词', '名形词', true],
  ['ad', '副形词', '副形词', true],
  ['ag', '形语素', '形容词性语素', true],
  // Other meaningful categories
  ['i', '成语', '成语', true],
  ['l', '习用语', '习用语', true],
  ['j', '简称', '简称略语', true],
  // Adverbs, prepositions and similar (usually not informative)
  ['d', '副词', '副词', false],
  ['p', '介词', '介词', false],
  ['c', '连词', '连词', false],
  ['u', '助词', '助词', false],
  ['r', '代词', '代词', false],
  ['m', '数词', '数词', false],
  ['q', '量词', '量词', false],
  ['f', '方位词', '方位词', false],
  ['t', '时间词', '时间词', false],
  ['e', '叹词', '叹词', false],
  ['y', '语气词', '语气词', false],
  ['o', '拟声词', '拟声词', false],
  ['x', '非语素字', '非语素字', false],
  ['w', '标点符号', '标点符号', false],
]

/**
 * Part-of-speech tag definitions, in table order.
 * `meaningful` marks the tags that end up in MEANINGFUL_POS_TAGS.
 */
export const POS_TAG_DEFINITIONS: PosTagInfo[] = POS_TAG_ROWS.map(([tag, name, description, meaningful]) => ({
  tag,
  name,
  description,
  meaningful,
}))
/**
 * Set of the tags in POS_TAG_DEFINITIONS whose `meaningful` flag is true.
 */
export const MEANINGFUL_POS_TAGS = new Set(
  POS_TAG_DEFINITIONS.flatMap((info) => (info.meaningful ? [info.tag] : []))
)
/**
 * Returns all part-of-speech tag definitions.
 *
 * @returns The POS_TAG_DEFINITIONS array itself (not a copy).
 */
export function getPosTagDefinitions(): PosTagInfo[] {
return POS_TAG_DEFINITIONS
}
// Regular expressions used to clean and filter text.

// Emoji / pictograph code point ranges that are stripped from text.
const EMOJI_REGEX =
  /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]/gu
// ASCII punctuation, CJK / full-width punctuation, and whitespace.
// The non-ASCII members are written as \u escapes so they cannot be confused
// with, or silently normalized to, their ASCII look-alikes:
//   U+FF0C U+3002 U+FF01 U+FF1F U+3001 U+FF1B U+FF1A  full-width , . ! ? and the enumeration comma, ; :
//   U+201C U+201D U+2018 U+2019                       curly double / single quotes
//   U+FF08 U+FF09 U+3010 U+3011 U+300A U+300B         full-width parens, lenticular and angle brackets
//   U+2026 U+2014 U+FF5E U+00B7                       ellipsis, em dash, full-width tilde, middle dot
const PUNCTUATION_REGEX =
  /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~\uFF0C\u3002\uFF01\uFF1F\u3001\uFF1B\uFF1A\u201C\u201D\u2018\u2019\uFF08\uFF09\u3010\u3011\u300A\u300B\u2026\u2014\uFF5E\u00B7\s]/g
// http(s) URLs up to the next whitespace character.
const URL_REGEX = /https?:\/\/[^\s]+/g
// "@name" mentions up to the next whitespace or "@".
const MENTION_REGEX = /@[^\s@]+/g
// Strings made of digits only.
const PURE_NUMBER_REGEX = /^\d+$/
/**
 * Normalizes raw text before segmentation.
 *
 * URLs, @mentions, emoji and punctuation are each replaced by a space (in that
 * order), runs of whitespace are collapsed to one space, and the result is trimmed.
 *
 * @param text Raw input text.
 * @returns The cleaned text; empty when nothing but removable content was present.
 */
function cleanText(text: string): string {
  // Order matters: URLs and mentions must go before their punctuation is stripped.
  const patterns = [URL_REGEX, MENTION_REGEX, EMOJI_REGEX, PUNCTUATION_REGEX, /\s+/g]
  let result = text
  for (const pattern of patterns) {
    result = result.replace(pattern, ' ')
  }
  return result.trim()
}
/**
 * Decides whether a token should be kept as a word.
 *
 * A token is rejected when it is empty or whitespace-only, consists of digits
 * only, is shorter than `minLength`, or (when stopword filtering is enabled) is
 * a stopword for `locale`.
 *
 * @param word Candidate token.
 * @param locale Locale passed to the stopword lookup.
 * @param minLength Minimum accepted `word.length`.
 * @param enableStopwords Whether to apply stopword filtering; defaults to true.
 * @returns true when the token passes every check.
 */
function isValidWord(
  word: string,
  locale: SupportedLocale,
  minLength: number,
  enableStopwords: boolean = true
): boolean {
  const isBlank = !word || word.trim().length === 0
  if (isBlank || PURE_NUMBER_REGEX.test(word) || word.length < minLength) {
    return false
  }
  // The stopword lookup runs last and only when filtering is enabled.
  return !(enableStopwords && isStopword(word, locale))
}
/**
 * Options for Chinese segmentation (consumed by segmentChinese).
 */
interface ChineseSegmentOptions {
/** Part-of-speech filter mode; segmentChinese defaults it to 'meaningful'. */
posFilterMode?: PosFilterMode
/** Tags to keep when posFilterMode is 'custom'. */
customPosTags?: string[]
/** Dictionary to segment with; segmentChinese defaults it to 'default'. */
dictType?: DictType
}
/**
 * Counts how many valid words fall under each part-of-speech tag.
 * Texts are always tagged with jieba, i.e. treated as Chinese.
 *
 * @param texts Texts to analyze.
 * @param minWordLength Minimum word length to count; defaults to 2.
 * @param enableStopwords Whether stopwords are skipped; defaults to true.
 * @param dictType Dictionary used for tagging; defaults to 'default'.
 * @returns Map from tag to word count. If an error occurs it is logged and the
 *   counts collected so far (possibly none) are returned.
 */
export function collectPosTagStats(
  texts: string[],
  minWordLength: number = 2,
  enableStopwords: boolean = true,
  dictType: DictType = 'default'
): Map<string, number> {
  const counts = new Map<string, number>()

  try {
    const tagger = getJieba(dictType)
    for (const raw of texts) {
      const cleaned = cleanText(raw)
      if (cleaned) {
        for (const { tag, word } of tagger.tag(cleaned)) {
          // Always validate against the Chinese stopword rules. The argument order
          // must match isValidWord (locale before minimum length).
          if (isValidWord(word, 'zh-CN', minWordLength, enableStopwords)) {
            counts.set(tag, (counts.get(tag) ?? 0) + 1)
          }
        }
      }
    }
  } catch (statsError) {
    console.error('[NLP] Failed to collect POS stats:', statsError)
  }

  return counts
}
/**
 * Segments Chinese text with jieba, optionally filtering by part-of-speech tag.
 *
 * Modes:
 * - 'all': return every token from `cut` without tag filtering.
 * - 'custom' with `customPosTags`: keep tokens whose tag is in that list.
 * - anything else: keep tokens whose tag is in MEANINGFUL_POS_TAGS.
 *
 * If segmentation fails, the error is logged and a plain `cut` with the default
 * dictionary is tried; if that fails too, the text is split into characters.
 *
 * @param text Raw text.
 * @param options Segmentation options.
 * @returns Tokens; empty when the cleaned text is empty.
 */
function segmentChinese(text: string, options: ChineseSegmentOptions = {}): string[] {
  const { posFilterMode = 'meaningful', customPosTags, dictType = 'default' } = options
  const cleaned = cleanText(text)
  if (!cleaned) return []

  try {
    const jieba = getJieba(dictType)
    if (posFilterMode === 'all') {
      return jieba.cut(cleaned, false)
    }

    const allowedTags: Set<string> =
      posFilterMode === 'custom' && customPosTags ? new Set(customPosTags) : MEANINGFUL_POS_TAGS
    return jieba
      .tag(cleaned)
      .filter((item) => allowedTags.has(item.tag))
      .map((item) => item.word)
  } catch (error) {
    console.error('[NLP] Chinese segmentation failed:', error)
    try {
      return getJieba('default').cut(cleaned, false)
    } catch {
      // Last resort: one token per character. Array.from iterates by code point,
      // so characters outside the BMP are not cut into lone surrogates (which
      // `split('')` would do). Whitespace left by cleanText is dropped, matching
      // the segmentJapanese fallback.
      return Array.from(cleaned).filter((ch) => ch.trim().length > 0)
    }
  }
}
/**
 * Segments text with `Intl.Segmenter` using the 'en' locale and lowercases the
 * resulting words. `segment` uses this for every locale that is neither Chinese
 * nor Japanese.
 *
 * @param text Raw text.
 * @returns Lowercased word-like segments. If Intl.Segmenter throws, the cleaned
 *   text is lowercased and split on whitespace instead.
 */
function segmentEnglish(text: string): string[] {
  const cleaned = cleanText(text)
  if (!cleaned) return []

  try {
    const wordSegmenter = new Intl.Segmenter('en', { granularity: 'word' })
    const tokens: string[] = []
    for (const part of wordSegmenter.segment(cleaned)) {
      if (part.isWordLike) {
        tokens.push(part.segment.toLowerCase())
      }
    }
    return tokens
  } catch {
    const pieces = cleaned.toLowerCase().split(/\s+/)
    return pieces.filter((piece) => piece.length > 0)
  }
}
/**
 * Segments Japanese text with `Intl.Segmenter` using the 'ja' locale.
 *
 * @param text Raw text.
 * @returns Word-like segments. If Intl.Segmenter throws, the cleaned text is
 *   split into individual non-whitespace characters instead.
 */
function segmentJapanese(text: string): string[] {
  const cleaned = cleanText(text)
  if (!cleaned) return []
  try {
    const segmenter = new Intl.Segmenter('ja', { granularity: 'word' })
    const segments = segmenter.segment(cleaned)
    return [...segments].filter((segment) => segment.isWordLike).map((segment) => segment.segment)
  } catch {
    // Array.from iterates by code point, so characters outside the BMP are kept
    // whole; `split('')` would cut them into lone surrogate halves.
    return Array.from(cleaned).filter((ch) => ch.trim().length > 0)
  }
}
/**
 * Options accepted by `segment`.
 */
export interface SegmentOptions {
/** Minimum word length. When omitted, `segment` uses 2 for Chinese and Japanese locales and 3 otherwise. */
minLength?: number
/** Part-of-speech filter mode (only used for Chinese); `segment` defaults it to 'meaningful'. */
posFilterMode?: PosFilterMode
/** Tags to keep when posFilterMode is 'custom'. */
customPosTags?: string[]
/** Whether stopwords are filtered out; `segment` defaults it to true. */
enableStopwords?: boolean
/** Dictionary to use (only used for Chinese); `segment` defaults it to 'default'. */
dictType?: DictType
}
/**
 * General segmentation entry point.
 *
 * Locales starting with 'zh' use jieba, 'ja-JP' uses the Japanese segmenter, and
 * every other locale uses the English segmenter. The resulting tokens are then
 * filtered with isValidWord.
 *
 * @param text Text to segment.
 * @param locale Language of the text.
 * @param options Segmentation options.
 * @returns The tokens that pass validation, in segmentation order.
 */
export function segment(text: string, locale: SupportedLocale, options: SegmentOptions = {}): string[] {
  const {
    minLength,
    posFilterMode = 'meaningful',
    customPosTags,
    enableStopwords = true,
    dictType = 'default',
  } = options

  const chinese = locale.startsWith('zh')
  const japanese = locale === 'ja-JP'
  // Chinese and Japanese words are shorter, so they get a lower default threshold.
  const requiredLength = minLength ?? (chinese || japanese ? 2 : 3)

  const tokens = chinese
    ? segmentChinese(text, { posFilterMode, customPosTags, dictType })
    : japanese
      ? segmentJapanese(text)
      : segmentEnglish(text)

  const accepted: string[] = []
  for (const token of tokens) {
    if (isValidWord(token, locale, requiredLength, enableStopwords)) {
      accepted.push(token)
    }
  }
  return accepted
}
/**
 * Options for batch segmentation with word-frequency counting
 * (see batchSegmentWithFrequency).
 */
export interface BatchSegmentOptions extends SegmentOptions {
/** Minimum occurrence count for a word to be kept; batchSegmentWithFrequency defaults it to 2. */
minCount?: number
/** Maximum number of words returned; batchSegmentWithFrequency defaults it to 100. */
topN?: number
/** Words to exclude; batchSegmentWithFrequency compares them case-insensitively. */
excludeWords?: string[]
}
/**
 * Result of batchSegmentWithFrequency.
 */
export interface BatchSegmentResult {
/** Word-frequency map after truncation to topN. */
words: Map<string, number>
/** Number of distinct words before the topN truncation (after low-frequency words were removed). */
uniqueWords: number
/** Total occurrences before the topN truncation (sum of the counts of all words left after removing low-frequency words). */
totalWords: number
}
/**
 * Segments every text and computes word frequencies.
 *
 * Words in `excludeWords` are skipped (case-insensitive match), words occurring
 * fewer than `minCount` times are dropped, and the remainder is sorted by count
 * in descending order and truncated to `topN`.
 *
 * @param texts Texts to segment.
 * @param locale Language of the texts.
 * @param options Segmentation and counting options (minCount defaults to 2, topN to 100).
 * @returns The top words with their counts, plus the distinct-word and total
 *   occurrence counts measured before the topN truncation.
 */
export function batchSegmentWithFrequency(
  texts: string[],
  locale: SupportedLocale,
  options: BatchSegmentOptions = {}
): BatchSegmentResult {
  const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords, dictType, excludeWords } = options
  const segmentOptions: SegmentOptions = { minLength, posFilterMode, customPosTags, enableStopwords, dictType }
  const excluded = excludeWords?.length ? new Set(excludeWords.map((entry) => entry.toLowerCase())) : null

  // Count every accepted word across all texts.
  const counts = new Map<string, number>()
  for (const text of texts) {
    for (const word of segment(text, locale, segmentOptions)) {
      if (excluded?.has(word.toLowerCase())) continue
      counts.set(word, (counts.get(word) ?? 0) + 1)
    }
  }

  // Drop low-frequency words; the totals below are taken before the topN cut.
  const frequent = [...counts].filter(([, count]) => count >= minCount)
  const totalWords = frequent.reduce((sum, [, count]) => sum + count, 0)

  // Sort a copy by descending count and keep the first topN entries.
  const top = [...frequent].sort((left, right) => right[1] - left[1]).slice(0, topN)

  return {
    words: new Map(top),
    uniqueWords: frequent.length,
    totalWords,
  }
}