mirror of
https://github.com/hellodigua/ChatLab.git
synced 2026-05-24 07:30:52 +08:00
feat: 重构话题,话题新增话题卡片
This commit is contained in:
@@ -365,18 +365,26 @@ export interface BatchSegmentOptions extends SegmentOptions {
|
||||
excludeWords?: string[]
|
||||
}
|
||||
|
||||
/** Result object returned by batchSegmentWithFrequency. */
export interface BatchSegmentResult {
|
||||
/** Word-frequency Map after topN trimming */
|
||||
words: Map<string, number>
|
||||
/** Number of distinct words before topN trimming (after low-frequency words are filtered out) */
|
||||
uniqueWords: number
|
||||
/** Total word occurrences before topN trimming (sum of counts of all words that survive the low-frequency filter) */
|
||||
totalWords: number
|
||||
}
|
||||
|
||||
/**
|
||||
* 批量分词并统计词频
|
||||
* @param texts 文本数组
|
||||
* @param locale 语言
|
||||
* @param options 选项
|
||||
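* @returns BatchSegmentResult: the topN-trimmed word-frequency Map plus the pre-trim unique-word count and total word count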
|
||||
*/
|
||||
export function batchSegmentWithFrequency(
|
||||
texts: string[],
|
||||
locale: SupportedLocale,
|
||||
options: BatchSegmentOptions = {}
|
||||
): Map<string, number> {
|
||||
): BatchSegmentResult {
|
||||
const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords, dictType, excludeWords } = options
|
||||
const wordFrequency = new Map<string, number>()
|
||||
const excludeSet = excludeWords?.length ? new Set(excludeWords.map((w) => w.toLowerCase())) : null
|
||||
@@ -391,14 +399,20 @@ export function batchSegmentWithFrequency(
|
||||
|
||||
// 过滤低频词
|
||||
const filtered = new Map<string, number>()
|
||||
let totalWords = 0
|
||||
for (const [word, count] of wordFrequency) {
|
||||
if (count >= minCount) {
|
||||
filtered.set(word, count)
|
||||
totalWords += count
|
||||
}
|
||||
}
|
||||
|
||||
// 排序并取 topN
|
||||
const sorted = [...filtered.entries()].sort((a, b) => b[1] - a[1]).slice(0, topN)
|
||||
|
||||
return new Map(sorted)
|
||||
return {
|
||||
words: new Map(sorted),
|
||||
uniqueWords: filtered.size,
|
||||
totalWords,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,14 @@
|
||||
|
||||
import { openDatabase, buildTimeFilter, type TimeFilter } from '../core'
|
||||
import { segment, batchSegmentWithFrequency, getPosTagDefinitions, collectPosTagStats } from '../../nlp'
|
||||
import type { SupportedLocale, WordFrequencyResult, WordFrequencyParams, PosTagInfo, PosTagStat, DictType } from '../../nlp'
|
||||
import type {
|
||||
SupportedLocale,
|
||||
WordFrequencyResult,
|
||||
WordFrequencyParams,
|
||||
PosTagInfo,
|
||||
PosTagStat,
|
||||
DictType,
|
||||
} from '../../nlp'
|
||||
|
||||
/**
|
||||
* 获取词频统计
|
||||
@@ -80,7 +87,7 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu
|
||||
posTagStats = [...posStatsMap.entries()].map(([tag, count]) => ({ tag, count }))
|
||||
}
|
||||
|
||||
const wordFrequency = batchSegmentWithFrequency(texts, locale as SupportedLocale, {
|
||||
const result = batchSegmentWithFrequency(texts, locale as SupportedLocale, {
|
||||
minLength: minWordLength,
|
||||
minCount,
|
||||
topN,
|
||||
@@ -91,22 +98,22 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu
|
||||
excludeWords,
|
||||
})
|
||||
|
||||
let totalWords = 0
|
||||
for (const count of wordFrequency.values()) {
|
||||
totalWords += count
|
||||
let topNTotalWords = 0
|
||||
for (const count of result.words.values()) {
|
||||
topNTotalWords += count
|
||||
}
|
||||
|
||||
const words = [...wordFrequency.entries()].map(([word, count]) => ({
|
||||
const words = [...result.words.entries()].map(([word, count]) => ({
|
||||
word,
|
||||
count,
|
||||
percentage: totalWords > 0 ? Math.round((count / totalWords) * 10000) / 100 : 0,
|
||||
percentage: topNTotalWords > 0 ? Math.round((count / topNTotalWords) * 10000) / 100 : 0,
|
||||
}))
|
||||
|
||||
return {
|
||||
words,
|
||||
totalWords,
|
||||
totalWords: result.totalWords,
|
||||
totalMessages: messages.length,
|
||||
uniqueWords: wordFrequency.size,
|
||||
uniqueWords: result.uniqueWords,
|
||||
posTagStats,
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user