mirror of
https://github.com/hellodigua/ChatLab.git
synced 2026-04-29 08:12:41 +08:00
328 lines
10 KiB
TypeScript
328 lines
10 KiB
TypeScript
/**
|
||
* 分词器模块
|
||
* 中文使用 @node-rs/jieba,其他语言使用 Intl.Segmenter
|
||
*/
|
||
|
||
import type { SupportedLocale, PosFilterMode, PosTagInfo } from './types'
|
||
import { isStopword } from './stopwords'
|
||
|
||
// Jieba 实例类型
|
||
interface JiebaInstance {
|
||
cut: (text: string, hmm?: boolean) => string[]
|
||
tag: (text: string) => Array<{ tag: string; word: string }>
|
||
}
|
||
|
||
// Jieba 实例(延迟初始化)
|
||
let jiebaInstance: JiebaInstance | null = null
|
||
|
||
/**
|
||
* 获取 Jieba 实例(延迟加载)
|
||
*/
|
||
function getJieba(): JiebaInstance {
|
||
if (!jiebaInstance) {
|
||
try {
|
||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||
const { Jieba } = require('@node-rs/jieba')
|
||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||
const { dict } = require('@node-rs/jieba/dict')
|
||
jiebaInstance = Jieba.withDict(dict)
|
||
console.log('[NLP] jieba 模块加载成功')
|
||
} catch (error) {
|
||
console.error('[NLP] jieba 模块加载失败:', error)
|
||
throw new Error('jieba 模块加载失败')
|
||
}
|
||
}
|
||
return jiebaInstance
|
||
}
|
||
|
||
/**
|
||
* 词性标签定义
|
||
*/
|
||
export const POS_TAG_DEFINITIONS: PosTagInfo[] = [
|
||
// 名词类
|
||
{ tag: 'n', name: '名词', description: '普通名词', meaningful: true },
|
||
{ tag: 'nr', name: '人名', description: '人名', meaningful: true },
|
||
{ tag: 'ns', name: '地名', description: '地名', meaningful: true },
|
||
{ tag: 'nt', name: '机构名', description: '机构团体名', meaningful: true },
|
||
{ tag: 'nz', name: '其他专名', description: '其他专有名词', meaningful: true },
|
||
{ tag: 'nw', name: '作品名', description: '作品名', meaningful: true },
|
||
// 动词类(普通动词通常不太有意义,如"是""有""说"等)
|
||
{ tag: 'v', name: '动词', description: '普通动词', meaningful: false },
|
||
{ tag: 'vn', name: '动名词', description: '动名词', meaningful: true },
|
||
{ tag: 'vd', name: '副动词', description: '副动词', meaningful: false },
|
||
{ tag: 'vg', name: '动语素', description: '动词性语素', meaningful: false },
|
||
// 形容词类
|
||
{ tag: 'a', name: '形容词', description: '普通形容词', meaningful: true },
|
||
{ tag: 'an', name: '名形词', description: '名形词', meaningful: true },
|
||
{ tag: 'ad', name: '副形词', description: '副形词', meaningful: true },
|
||
{ tag: 'ag', name: '形语素', description: '形容词性语素', meaningful: true },
|
||
// 其他有意义
|
||
{ tag: 'i', name: '成语', description: '成语', meaningful: true },
|
||
{ tag: 'l', name: '习用语', description: '习用语', meaningful: true },
|
||
{ tag: 'j', name: '简称', description: '简称略语', meaningful: true },
|
||
// 副词、介词等(通常不太有意义)
|
||
{ tag: 'd', name: '副词', description: '副词', meaningful: false },
|
||
{ tag: 'p', name: '介词', description: '介词', meaningful: false },
|
||
{ tag: 'c', name: '连词', description: '连词', meaningful: false },
|
||
{ tag: 'u', name: '助词', description: '助词', meaningful: false },
|
||
{ tag: 'r', name: '代词', description: '代词', meaningful: false },
|
||
{ tag: 'm', name: '数词', description: '数词', meaningful: false },
|
||
{ tag: 'q', name: '量词', description: '量词', meaningful: false },
|
||
{ tag: 'f', name: '方位词', description: '方位词', meaningful: false },
|
||
{ tag: 't', name: '时间词', description: '时间词', meaningful: false },
|
||
{ tag: 'e', name: '叹词', description: '叹词', meaningful: false },
|
||
{ tag: 'y', name: '语气词', description: '语气词', meaningful: false },
|
||
{ tag: 'o', name: '拟声词', description: '拟声词', meaningful: false },
|
||
{ tag: 'x', name: '非语素字', description: '非语素字', meaningful: false },
|
||
{ tag: 'w', name: '标点符号', description: '标点符号', meaningful: false },
|
||
]
|
||
|
||
/**
|
||
* 有意义的词性标签集合
|
||
*/
|
||
export const MEANINGFUL_POS_TAGS = new Set(POS_TAG_DEFINITIONS.filter((t) => t.meaningful).map((t) => t.tag))
|
||
|
||
/**
|
||
* 获取所有词性标签信息
|
||
*/
|
||
export function getPosTagDefinitions(): PosTagInfo[] {
|
||
return POS_TAG_DEFINITIONS
|
||
}
|
||
|
||
// 用于过滤的正则表达式
|
||
const EMOJI_REGEX =
|
||
/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]/gu
|
||
const PUNCTUATION_REGEX = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~,。!?、;:""''()【】《》…—~·\s]/g
|
||
const URL_REGEX = /https?:\/\/[^\s]+/g
|
||
const MENTION_REGEX = /@[^\s@]+/g
|
||
const PURE_NUMBER_REGEX = /^\d+$/
|
||
|
||
/**
|
||
* 清理文本
|
||
* 移除表情、URL、@提及、标点等
|
||
*/
|
||
function cleanText(text: string): string {
|
||
return text
|
||
.replace(URL_REGEX, ' ')
|
||
.replace(MENTION_REGEX, ' ')
|
||
.replace(EMOJI_REGEX, ' ')
|
||
.replace(PUNCTUATION_REGEX, ' ')
|
||
.replace(/\s+/g, ' ')
|
||
.trim()
|
||
}
|
||
|
||
/**
|
||
* 判断是否为有效词语
|
||
*/
|
||
function isValidWord(
|
||
word: string,
|
||
locale: SupportedLocale,
|
||
minLength: number,
|
||
enableStopwords: boolean = true
|
||
): boolean {
|
||
// 空字符串
|
||
if (!word || word.trim().length === 0) return false
|
||
|
||
// 纯数字
|
||
if (PURE_NUMBER_REGEX.test(word)) return false
|
||
|
||
// 长度不足
|
||
if (word.length < minLength) return false
|
||
|
||
// 停用词
|
||
if (enableStopwords && isStopword(word, locale)) return false
|
||
|
||
return true
|
||
}
|
||
|
||
/**
|
||
* 中文分词选项
|
||
*/
|
||
interface ChineseSegmentOptions {
|
||
/** 词性过滤模式 */
|
||
posFilterMode?: PosFilterMode
|
||
/** 自定义词性过滤列表 */
|
||
customPosTags?: string[]
|
||
}
|
||
|
||
/**
|
||
* 收集文本的词性统计(用于显示每个词性有多少词)
|
||
* 只统计中文,英文无词性标注
|
||
*/
|
||
export function collectPosTagStats(
|
||
texts: string[],
|
||
minWordLength: number = 2,
|
||
enableStopwords: boolean = true
|
||
): Map<string, number> {
|
||
const posStats = new Map<string, number>()
|
||
|
||
try {
|
||
const jieba = getJieba()
|
||
|
||
for (const text of texts) {
|
||
const cleaned = cleanText(text)
|
||
if (!cleaned) continue
|
||
|
||
const tagged = jieba.tag(cleaned)
|
||
|
||
for (const item of tagged) {
|
||
// 检查词是否有效(长度和停用词过滤)
|
||
if (!isValidWord(item.word, minWordLength, 'zh-CN', enableStopwords)) {
|
||
continue
|
||
}
|
||
posStats.set(item.tag, (posStats.get(item.tag) || 0) + 1)
|
||
}
|
||
}
|
||
} catch (error) {
|
||
console.error('[NLP] 收集词性统计失败:', error)
|
||
}
|
||
|
||
return posStats
|
||
}
|
||
|
||
/**
|
||
* 中文分词(使用 jieba 词性标注)
|
||
* @param text 文本
|
||
* @param options 分词选项
|
||
*/
|
||
function segmentChinese(text: string, options: ChineseSegmentOptions = {}): string[] {
|
||
const { posFilterMode = 'meaningful', customPosTags } = options
|
||
const cleaned = cleanText(text)
|
||
if (!cleaned) return []
|
||
|
||
try {
|
||
const jieba = getJieba()
|
||
|
||
// 全部模式:直接分词,不做词性过滤
|
||
if (posFilterMode === 'all') {
|
||
return jieba.cut(cleaned, false)
|
||
}
|
||
|
||
// 使用词性标注
|
||
const tagged = jieba.tag(cleaned)
|
||
|
||
// 根据模式过滤
|
||
let allowedTags: Set<string>
|
||
if (posFilterMode === 'custom' && customPosTags) {
|
||
allowedTags = new Set(customPosTags)
|
||
} else {
|
||
// meaningful 模式
|
||
allowedTags = MEANINGFUL_POS_TAGS
|
||
}
|
||
|
||
return tagged.filter((item) => allowedTags.has(item.tag)).map((item) => item.word)
|
||
} catch (error) {
|
||
console.error('[NLP] 中文分词失败:', error)
|
||
// 降级:使用简单分词
|
||
try {
|
||
const jieba = getJieba()
|
||
return jieba.cut(cleaned, false)
|
||
} catch {
|
||
return cleaned.split('')
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 英文分词(使用 Intl.Segmenter)
|
||
*/
|
||
function segmentEnglish(text: string): string[] {
|
||
const cleaned = cleanText(text)
|
||
if (!cleaned) return []
|
||
|
||
try {
|
||
const segmenter = new Intl.Segmenter('en', { granularity: 'word' })
|
||
const segments = segmenter.segment(cleaned)
|
||
|
||
return [...segments].filter((segment) => segment.isWordLike).map((segment) => segment.segment.toLowerCase())
|
||
} catch {
|
||
// 降级:简单按空格分词
|
||
return cleaned
|
||
.toLowerCase()
|
||
.split(/\s+/)
|
||
.filter((word) => word.length > 0)
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 分词选项
|
||
*/
|
||
export interface SegmentOptions {
|
||
/** 最小词长(可选,默认中文2,英文3) */
|
||
minLength?: number
|
||
/** 词性过滤模式(仅中文有效) */
|
||
posFilterMode?: PosFilterMode
|
||
/** 自定义词性过滤列表 */
|
||
customPosTags?: string[]
|
||
/** 是否启用停用词过滤 */
|
||
enableStopwords?: boolean
|
||
}
|
||
|
||
/**
|
||
* 通用分词入口
|
||
* @param text 待分词文本
|
||
* @param locale 语言
|
||
* @param options 分词选项
|
||
* @returns 过滤后的分词结果
|
||
*/
|
||
export function segment(text: string, locale: SupportedLocale, options: SegmentOptions = {}): string[] {
|
||
const { minLength, posFilterMode = 'meaningful', customPosTags, enableStopwords = true } = options
|
||
const defaultMinLength = locale === 'zh-CN' ? 2 : 3
|
||
const effectiveMinLength = minLength ?? defaultMinLength
|
||
|
||
let words: string[]
|
||
|
||
if (locale === 'zh-CN') {
|
||
words = segmentChinese(text, { posFilterMode, customPosTags })
|
||
} else {
|
||
words = segmentEnglish(text)
|
||
}
|
||
|
||
// 过滤无效词
|
||
return words.filter((word) => isValidWord(word, locale, effectiveMinLength, enableStopwords))
|
||
}
|
||
|
||
/**
|
||
* 批量分词并统计词频选项
|
||
*/
|
||
export interface BatchSegmentOptions extends SegmentOptions {
|
||
minCount?: number
|
||
topN?: number
|
||
}
|
||
|
||
/**
|
||
* 批量分词并统计词频
|
||
* @param texts 文本数组
|
||
* @param locale 语言
|
||
* @param options 选项
|
||
* @returns 词频 Map
|
||
*/
|
||
export function batchSegmentWithFrequency(
|
||
texts: string[],
|
||
locale: SupportedLocale,
|
||
options: BatchSegmentOptions = {}
|
||
): Map<string, number> {
|
||
const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords } = options
|
||
const wordFrequency = new Map<string, number>()
|
||
|
||
for (const text of texts) {
|
||
const words = segment(text, locale, { minLength, posFilterMode, customPosTags, enableStopwords })
|
||
for (const word of words) {
|
||
wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1)
|
||
}
|
||
}
|
||
|
||
// 过滤低频词
|
||
const filtered = new Map<string, number>()
|
||
for (const [word, count] of wordFrequency) {
|
||
if (count >= minCount) {
|
||
filtered.set(word, count)
|
||
}
|
||
}
|
||
|
||
// 排序并取 topN
|
||
const sorted = [...filtered.entries()].sort((a, b) => b[1] - a[1]).slice(0, topN)
|
||
|
||
return new Map(sorted)
|
||
}
|