feat: 重构话题,话题新增话题卡片

This commit is contained in:
digua
2026-04-13 00:21:59 +08:00
committed by digua
parent 6254a234ec
commit e628390111
15 changed files with 809 additions and 271 deletions
+17 -3
View File
@@ -365,18 +365,26 @@ export interface BatchSegmentOptions extends SegmentOptions {
excludeWords?: string[]
}
/**
 * Result of a batch segmentation + word-frequency pass.
 * Carries the topN-trimmed frequency map together with totals that were
 * measured before the topN trim, so callers can report overall statistics
 * even though only the top entries are returned.
 */
export interface BatchSegmentResult {
/** Word-frequency Map after trimming to topN */
words: Map<string, number>
/** Number of distinct words before the topN trim (after low-frequency words are filtered out) */
uniqueWords: number
/** Total word occurrences before the topN trim (sum of the counts of every word that survived the low-frequency filter) */
totalWords: number
}
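/**
* Segment a batch of texts and count word frequencies.
* @param texts Array of texts to segment
* @param locale Language / locale used for segmentation
* @param options Segmentation and filtering options
* @returns A BatchSegmentResult: the topN-trimmed word-frequency Map (`words`) plus the
*          distinct-word count (`uniqueWords`) and total occurrence count (`totalWords`),
*          both measured after the low-frequency filter but before the topN trim
*/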
export function batchSegmentWithFrequency(
texts: string[],
locale: SupportedLocale,
options: BatchSegmentOptions = {}
): Map<string, number> {
): BatchSegmentResult {
const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords, dictType, excludeWords } = options
const wordFrequency = new Map<string, number>()
const excludeSet = excludeWords?.length ? new Set(excludeWords.map((w) => w.toLowerCase())) : null
@@ -391,14 +399,20 @@ export function batchSegmentWithFrequency(
// 过滤低频词
const filtered = new Map<string, number>()
let totalWords = 0
for (const [word, count] of wordFrequency) {
if (count >= minCount) {
filtered.set(word, count)
totalWords += count
}
}
// 排序并取 topN
const sorted = [...filtered.entries()].sort((a, b) => b[1] - a[1]).slice(0, topN)
return new Map(sorted)
return {
words: new Map(sorted),
uniqueWords: filtered.size,
totalWords,
}
}
+16 -9
View File
@@ -5,7 +5,14 @@
import { openDatabase, buildTimeFilter, type TimeFilter } from '../core'
import { segment, batchSegmentWithFrequency, getPosTagDefinitions, collectPosTagStats } from '../../nlp'
import type { SupportedLocale, WordFrequencyResult, WordFrequencyParams, PosTagInfo, PosTagStat, DictType } from '../../nlp'
import type {
SupportedLocale,
WordFrequencyResult,
WordFrequencyParams,
PosTagInfo,
PosTagStat,
DictType,
} from '../../nlp'
/**
* 获取词频统计
@@ -80,7 +87,7 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu
posTagStats = [...posStatsMap.entries()].map(([tag, count]) => ({ tag, count }))
}
const wordFrequency = batchSegmentWithFrequency(texts, locale as SupportedLocale, {
const result = batchSegmentWithFrequency(texts, locale as SupportedLocale, {
minLength: minWordLength,
minCount,
topN,
@@ -91,22 +98,22 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu
excludeWords,
})
let totalWords = 0
for (const count of wordFrequency.values()) {
totalWords += count
let topNTotalWords = 0
for (const count of result.words.values()) {
topNTotalWords += count
}
const words = [...wordFrequency.entries()].map(([word, count]) => ({
const words = [...result.words.entries()].map(([word, count]) => ({
word,
count,
percentage: totalWords > 0 ? Math.round((count / totalWords) * 10000) / 100 : 0,
percentage: topNTotalWords > 0 ? Math.round((count / topNTotalWords) * 10000) / 100 : 0,
}))
return {
words,
totalWords,
totalWords: result.totalWords,
totalMessages: messages.length,
uniqueWords: wordFrequency.size,
uniqueWords: result.uniqueWords,
posTagStats,
}
}