feat: 重构话题，话题新增话题卡片

2026-05-24 07:30:52 +08:00 · 2026-04-13 00:21:59 +08:00
parent 6254a234ec
commit e628390111
15 changed files with 809 additions and 271 deletions
@@ -365,18 +365,26 @@ export interface BatchSegmentOptions extends SegmentOptions {
  excludeWords?: string[]
 }

+export interface BatchSegmentResult {
+  /** Word-frequency map after trimming to the topN most frequent words */
+  words: Map<string, number>
+  /** Number of distinct words before topN trimming (after low-frequency words are filtered out) */
+  uniqueWords: number
+  /** Total occurrences before topN trimming (sum of counts of every word that passes the low-frequency filter) */
+  totalWords: number
+}
+
 /**
 * 批量分词并统计词频
 * @param texts 文本数组
 * @param locale 语言
 * @param options 选项
- * @returns 词频 Map
 */
 export function batchSegmentWithFrequency(
  texts: string[],
  locale: SupportedLocale,
  options: BatchSegmentOptions = {}
-): Map<string, number> {
+): BatchSegmentResult {
  const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords, dictType, excludeWords } = options
  const wordFrequency = new Map<string, number>()
  const excludeSet = excludeWords?.length ? new Set(excludeWords.map((w) => w.toLowerCase())) : null
@@ -391,14 +399,20 @@ export function batchSegmentWithFrequency(

  // 过滤低频词
  const filtered = new Map<string, number>()
+  let totalWords = 0
  for (const [word, count] of wordFrequency) {
    if (count >= minCount) {
      filtered.set(word, count)
+      totalWords += count
    }
  }

  // 排序并取 topN
  const sorted = [...filtered.entries()].sort((a, b) => b[1] - a[1]).slice(0, topN)

-  return new Map(sorted)
+  return {
+    words: new Map(sorted),
+    uniqueWords: filtered.size,
+    totalWords,
+  }
 }
@@ -5,7 +5,14 @@

 import { openDatabase, buildTimeFilter, type TimeFilter } from '../core'
 import { segment, batchSegmentWithFrequency, getPosTagDefinitions, collectPosTagStats } from '../../nlp'
-import type { SupportedLocale, WordFrequencyResult, WordFrequencyParams, PosTagInfo, PosTagStat, DictType } from '../../nlp'
+import type {
+  SupportedLocale,
+  WordFrequencyResult,
+  WordFrequencyParams,
+  PosTagInfo,
+  PosTagStat,
+  DictType,
+} from '../../nlp'

 /**
 * 获取词频统计
@@ -80,7 +87,7 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu
    posTagStats = [...posStatsMap.entries()].map(([tag, count]) => ({ tag, count }))
  }

-  const wordFrequency = batchSegmentWithFrequency(texts, locale as SupportedLocale, {
+  const result = batchSegmentWithFrequency(texts, locale as SupportedLocale, {
    minLength: minWordLength,
    minCount,
    topN,
@@ -91,22 +98,22 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu
    excludeWords,
  })

-  let totalWords = 0
-  for (const count of wordFrequency.values()) {
-    totalWords += count
+  let topNTotalWords = 0
+  for (const count of result.words.values()) {
+    topNTotalWords += count
  }

-  const words = [...wordFrequency.entries()].map(([word, count]) => ({
+  const words = [...result.words.entries()].map(([word, count]) => ({
    word,
    count,
-    percentage: totalWords > 0 ? Math.round((count / totalWords) * 10000) / 100 : 0,
+    percentage: topNTotalWords > 0 ? Math.round((count / topNTotalWords) * 10000) / 100 : 0,
  }))

  return {
    words,
-    totalWords,
+    totalWords: result.totalWords,
    totalMessages: messages.length,
-    uniqueWords: wordFrequency.size,
+    uniqueWords: result.uniqueWords,
    posTagStats,
  }
 }