feat: 新增语言偏好Tab

2026-05-22 06:10:37 +08:00 · 2026-04-17 20:56:45 +08:00
parent a1a587f791
commit 0aa99af048
29 changed files with 2235 additions and 61 deletions
@@ -480,6 +480,21 @@ export function registerChatHandlers(ctx: IpcContext): void {
    }
  )

+  /**
+   * Get language-preference analysis data (private chats only).
+   * Forwards sessionId, locale, the optional time filter and dictType to the worker.
+   * If the worker call throws, the error is logged and an empty result
+   * ({ members: [], sharedWords: [], similarityScore: 0 }) is returned instead.
+   */
+  ipcMain.handle(
+    'chat:getLanguagePreferenceAnalysis',
+    async (_, sessionId: string, locale: string, filter?: { startTs?: number; endTs?: number }, dictType?: string) => {
+      try {
+        return await worker.getLanguagePreferenceAnalysis({ sessionId, locale, timeFilter: filter, dictType })
+      } catch (error) {
+        console.error('Failed to get language preference analysis:', error)
+        return { members: [], sharedWords: [], similarityScore: 0 }
+      }
+    }
+  )
+
  /**
   * 获取 @ 互动分析数据
   */
@@ -29,6 +29,7 @@ import {
  getSession,
  getChatOverview,
  getCatchphraseAnalysis,
+  getLanguagePreferenceAnalysis,
  getMentionAnalysis,
  getMentionGraph,
  getLaughAnalysis,
@@ -104,6 +105,7 @@ const CACHEABLE_QUERIES = new Set([
  'getMessageTypeDistribution',
  'getTimeRange',
  'getCatchphraseAnalysis',
+  'getLanguagePreferenceAnalysis',
  'getMentionAnalysis',
  'getMentionGraph',
  'getLaughAnalysis',
@@ -184,6 +186,7 @@ const syncHandlers: Record<string, (payload: any) => any> = {

  // 高级分析
  getCatchphraseAnalysis: (p) => getCatchphraseAnalysis(p.sessionId, p.filter),
+  getLanguagePreferenceAnalysis: (p) => getLanguagePreferenceAnalysis(p),
  getMentionAnalysis: (p) => getMentionAnalysis(p.sessionId, p.filter),
  getMentionGraph: (p) => getMentionGraph(p.sessionId, p.filter),
  getLaughAnalysis: (p) => getLaughAnalysis(p.sessionId, p.filter, p.keywords),
@@ -18,6 +18,9 @@ export type {
  ClusterGraphOptions,
 } from './social'

+// 语言偏好分析（私聊专用）
+export { getLanguagePreferenceAnalysis } from './languagePreference'
+
 // 关系分析（私聊主动性）
 export { getRelationshipStats } from './relationship'
 export type {
@@ -0,0 +1,293 @@
+/**
+ * 语言偏好分析模块（私聊专用）
+ *
+ * 一次性对两个成员的全部文字消息做 NLP 分析：
+ * - 词频 + 词性分布
+ * - 语气词画像
+ * - 标点性格
+ * - 口头禅（整句匹配）
+ * - 跨成员：共同高频词、语言同频度
+ */
+
+import { openDatabase, buildTimeFilter, type TimeFilter } from '../../core'
+import { getJieba, isStopword, MEANINGFUL_POS_TAGS } from '../../../nlp'
+import type { SupportedLocale, DictType } from '../../../nlp'
+
+// ---------- Punctuation regexes (applied to the raw message text) ----------
+
+// Ellipsis: two or more ASCII dots, one or more '…', or two or more '。'.
+const RE_ELLIPSIS = /\.{2,}|…+|。{2,}/g
+// Each run of '!' / '！' counts once.
+const RE_EXCLAMATION = /[!！]+/g
+// Each run of '?' / '？' counts once.
+const RE_QUESTION = /[?？]+/g
+// Each run of '~' / '～' counts once.
+const RE_TILDE = /[~～]+/g
+// A single isolated period. The lookbehind is required in addition to the
+// lookahead: without it the last dot of "..." / "。。。" is not followed by
+// another dot and would be counted as a period on top of the ellipsis.
+const RE_PERIOD = /(?<![.。])[.。](?![.。])/g
+// True when the text ends with a punctuation or closing-bracket/quote character.
+const RE_ENDS_WITH_PUNCT = /[.。!！?？~～…,，;；:：、)\]）】》"'」』\-—]$/
+
+// ---------- POS grouping: tagger tags bucketed into coarse categories ----------
+
+const NOUN_TAGS = new Set(['n', 'nr', 'ns', 'nt', 'nz', 'nw']) // tallied as posCount.noun
+const VERB_TAGS = new Set(['v', 'vn', 'vd', 'vg']) // tallied as posCount.verb
+const ADJ_TAGS = new Set(['a', 'an', 'ad', 'ag']) // tallied as posCount.adjective
+const ADV_TAGS = new Set(['d']) // tallied as posCount.adverb
+const MODAL_TAGS = new Set(['y', 'e']) // 'y' -> modalParticle, 'e' -> interjection; exempt from the minimum word length
+
+// ---------- URL / emoji / mention cleanup ----------
+
+const RE_URL = /https?:\/\/[^\s]+/g // http(s) URL up to the next whitespace
+const RE_MENTION = /@[^\s@]+/g // '@' followed by characters that are neither whitespace nor '@'
+const RE_EMOJI =
+  /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]/gu // fixed list of emoji / symbol code-point ranges
+const RE_PUNCTUATION = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~，。！？、；：""''（）【】《》…—～·\s]/g // ASCII and CJK punctuation plus whitespace
+const RE_PURE_NUMBER = /^\d+$/ // token consisting only of digits
+
+/**
+ * Prepare raw message text for tokenization.
+ *
+ * URLs, @-mentions, emoji and punctuation are each replaced by a single space
+ * (in that order); whitespace runs are then collapsed and the result trimmed.
+ *
+ * @param text raw message content
+ * @returns the cleaned text, possibly empty
+ */
+function cleanTextForNlp(text: string): string {
+  const noiseRemoved = [RE_URL, RE_MENTION, RE_EMOJI, RE_PUNCTUATION].reduce(
+    (current, pattern) => current.replace(pattern, ' '),
+    text
+  )
+  return noiseRemoved.replace(/\s+/g, ' ').trim()
+}
+
+// ---------- 工具函数 ----------
+
+/**
+ * Count the matches of `regex` in `text`.
+ *
+ * `lastIndex` is reset first so a shared module-level regex always starts
+ * from the beginning of the string.
+ *
+ * @param text string to search
+ * @param regex pattern to count (callers pass global regexes)
+ * @returns length of the match array, or 0 when nothing matches
+ */
+function countMatches(text: string, regex: RegExp): number {
+  regex.lastIndex = 0
+  return text.match(regex)?.length ?? 0
+}
+
+/**
+ * Cosine similarity of two numeric vectors, iterating over the indices of `a`.
+ *
+ * @param a first vector
+ * @param b second vector (read at the same indices as `a`)
+ * @returns dot(a, b) / (|a| * |b|), or 0 when either magnitude is 0
+ */
+function cosineSimilarity(a: number[], b: number[]): number {
+  let dotProduct = 0
+  let squaredNormA = 0
+  let squaredNormB = 0
+  for (const [index, valueA] of a.entries()) {
+    const valueB = b[index]
+    dotProduct += valueA * valueB
+    squaredNormA += valueA * valueA
+    squaredNormB += valueB * valueB
+  }
+  const magnitudeProduct = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB)
+  if (magnitudeProduct === 0) return 0
+  return dotProduct / magnitudeProduct
+}
+
+// ---------- 主入口 ----------
+
+/**
+ * Input for getLanguagePreferenceAnalysis.
+ */
+interface LanguagePreferenceParams {
+  /** Session whose database is opened with `openDatabase`. */
+  sessionId: string
+  /** Locale; values starting with `zh` use jieba POS tagging, others use `Intl.Segmenter`. Empty falls back to `zh-CN`. */
+  locale: string
+  /** Optional time range handed to `buildTimeFilter`. */
+  timeFilter?: TimeFilter
+  /** jieba dictionary variant; defaults to `'default'`. */
+  dictType?: string
+}
+
+/**
+ * Language-preference analysis (private chats only).
+ *
+ * Loads every text message of the session in one query, groups the rows by
+ * sender and builds one profile per member: word frequencies, POS
+ * distribution, modal particles, punctuation counters and repeated whole
+ * messages ("catchphrases"). For the two members with the most messages it
+ * also returns the words present in both top-word lists and a 0-100
+ * similarity score (cosine similarity of the two POS-count vectors).
+ *
+ * @param params session id, locale, optional time filter and dictionary type
+ * @returns `{ members, sharedWords, similarityScore }`; all empty / 0 when the
+ *          database cannot be opened or no message matches
+ */
+export function getLanguagePreferenceAnalysis(params: LanguagePreferenceParams): any {
+  const { sessionId, locale, timeFilter, dictType = 'default' } = params
+  const db = openDatabase(sessionId)
+  if (!db) return { members: [], sharedWords: [], similarityScore: 0 }
+
+  const { clause, params: filterParams } = buildTimeFilter(timeFilter)
+
+  // Append the text-message filter to whatever clause the time filter produced.
+  let whereClause = clause
+  const textFilter =
+    " COALESCE(m.account_name, '') != '系统消息' AND msg.type = 0 AND msg.content IS NOT NULL AND LENGTH(TRIM(msg.content)) >= 2"
+  if (whereClause.includes('WHERE')) {
+    whereClause += ' AND ' + textFilter
+  } else {
+    whereClause = ' WHERE ' + textFilter
+  }
+
+  // One query for all text messages; grouping by member happens in memory.
+  const rows = db
+    .prepare(
+      `
+      SELECT
+        m.id as memberId,
+        COALESCE(m.group_nickname, m.account_name, m.platform_id) as name,
+        msg.content as content
+      FROM message msg
+      JOIN member m ON msg.sender_id = m.id
+      ${whereClause}
+      ORDER BY m.id
+      `
+    )
+    .all(...filterParams) as Array<{ memberId: number; name: string; content: string }>
+
+  if (rows.length === 0) {
+    return { members: [], sharedWords: [], similarityScore: 0 }
+  }
+
+  // Group message contents by member.
+  const memberMessages = new Map<number, { name: string; messages: string[] }>()
+  for (const row of rows) {
+    let entry = memberMessages.get(row.memberId)
+    if (!entry) {
+      entry = { name: row.name, messages: [] }
+      memberMessages.set(row.memberId, entry)
+    }
+    entry.messages.push(row.content)
+  }
+
+  // Resolve the locale BEFORE anything reads it: a missing locale must not
+  // throw on `.startsWith`, and the tokenizer choice has to agree with the
+  // locale used for stop-word filtering.
+  const resolvedLocale: string = locale || 'zh-CN'
+  const effectiveLocale = resolvedLocale as SupportedLocale
+  const isChinese = resolvedLocale.startsWith('zh')
+  const minWordLength = isChinese ? 2 : 3
+
+  // Build the tokenizer once instead of once per message. If it cannot be
+  // created, word statistics are skipped (punctuation and catchphrase stats
+  // are still produced) and the failure is reported a single time.
+  let jieba: ReturnType<typeof getJieba> | null = null
+  let segmenter: Intl.Segmenter | null = null
+  try {
+    if (isChinese) {
+      jieba = getJieba(dictType as DictType)
+    } else {
+      segmenter = new Intl.Segmenter(resolvedLocale, { granularity: 'word' })
+    }
+  } catch (error) {
+    console.warn('Language preference analysis: tokenizer unavailable, word statistics skipped:', error)
+  }
+
+  const memberProfiles: any[] = []
+
+  for (const [memberId, { name, messages }] of memberMessages) {
+    // Word frequency + POS counters
+    const wordFreq = new Map<string, number>()
+    const posCount = { noun: 0, verb: 0, adjective: 0, adverb: 0, modalParticle: 0, interjection: 0, other: 0 }
+    const modalFreq = new Map<string, number>()
+    let totalWordCount = 0
+
+    // Punctuation counters
+    const punct = { ellipsis: 0, exclamation: 0, question: 0, tilde: 0, period: 0, noPunct: 0, total: 0 }
+
+    // Catchphrases (frequency of whole trimmed messages)
+    const phraseFreq = new Map<string, number>()
+
+    for (const content of messages) {
+      // Punctuation analysis runs on the raw text.
+      punct.ellipsis += countMatches(content, RE_ELLIPSIS)
+      punct.exclamation += countMatches(content, RE_EXCLAMATION)
+      punct.question += countMatches(content, RE_QUESTION)
+      punct.tilde += countMatches(content, RE_TILDE)
+      punct.period += countMatches(content, RE_PERIOD)
+      const trimmed = content.trim()
+      if (trimmed.length > 0 && !RE_ENDS_WITH_PUNCT.test(trimmed)) {
+        punct.noPunct++
+      }
+      punct.total++
+
+      // Catchphrases
+      if (trimmed.length >= 2) {
+        phraseFreq.set(trimmed, (phraseFreq.get(trimmed) || 0) + 1)
+      }
+
+      // NLP
+      const cleaned = cleanTextForNlp(content)
+      if (!cleaned) continue
+
+      if (jieba) {
+        try {
+          for (const { word, tag } of jieba.tag(cleaned)) {
+            if (!word || word.trim().length === 0) continue
+            if (RE_PURE_NUMBER.test(word)) continue
+            // Short words are dropped unless they are modal particles / interjections.
+            if (word.length < minWordLength && !MODAL_TAGS.has(tag)) continue
+
+            // POS bucket
+            if (NOUN_TAGS.has(tag)) posCount.noun++
+            else if (VERB_TAGS.has(tag)) posCount.verb++
+            else if (ADJ_TAGS.has(tag)) posCount.adjective++
+            else if (ADV_TAGS.has(tag)) posCount.adverb++
+            else if (tag === 'y') posCount.modalParticle++
+            else if (tag === 'e') posCount.interjection++
+            else posCount.other++
+
+            // Modal particles / interjections
+            if (MODAL_TAGS.has(tag)) {
+              modalFreq.set(word, (modalFreq.get(word) || 0) + 1)
+            }
+
+            // Meaningful words feed the word-frequency table.
+            if (MEANINGFUL_POS_TAGS.has(tag) || MODAL_TAGS.has(tag)) {
+              if (!isStopword(word, effectiveLocale)) {
+                wordFreq.set(word, (wordFreq.get(word) || 0) + 1)
+                totalWordCount++
+              }
+            }
+          }
+        } catch {
+          // Best effort: a message jieba cannot tag is skipped.
+        }
+      } else if (segmenter) {
+        // Non-Chinese: plain Intl.Segmenter word segmentation, no POS info.
+        try {
+          for (const seg of segmenter.segment(cleaned)) {
+            if (!seg.isWordLike) continue
+            const w = seg.segment.toLowerCase()
+            if (w.length < minWordLength) continue
+            if (RE_PURE_NUMBER.test(w)) continue
+            if (isStopword(w, effectiveLocale)) continue
+            wordFreq.set(w, (wordFreq.get(w) || 0) + 1)
+            totalWordCount++
+            posCount.other++
+          }
+        } catch {
+          // Best effort: a message that cannot be segmented is skipped.
+        }
+      }
+    }
+
+    // Drop words seen only once, then sort by count descending.
+    const filteredWords = [...wordFreq.entries()].filter(([, c]) => c >= 2).sort((a, b) => b[1] - a[1])
+    const uniqueWords = filteredWords.length
+    const topWords = filteredWords.slice(0, 100).map(([word, count]) => ({ word, count }))
+
+    // Percentage with two decimals; note uniqueWords only counts words seen at least twice.
+    const lexicalDiversity = totalWordCount > 0 ? Math.round((uniqueWords / totalWordCount) * 10000) / 100 : 0
+
+    // Modal particles: top 20
+    const modalParticles = [...modalFreq.entries()]
+      .sort((a, b) => b[1] - a[1])
+      .slice(0, 20)
+      .map(([word, count]) => ({ word, count }))
+
+    // Catchphrases: top 50 among messages repeated at least twice
+    const catchphrases = [...phraseFreq.entries()]
+      .filter(([, c]) => c >= 2)
+      .sort((a, b) => b[1] - a[1])
+      .slice(0, 50)
+      .map(([content, count]) => ({ content, count }))
+
+    memberProfiles.push({
+      memberId,
+      name,
+      totalMessages: messages.length,
+      totalWords: totalWordCount,
+      uniqueWords,
+      lexicalDiversity,
+      topWords,
+      posDistribution: posCount,
+      modalParticles,
+      punctuation: punct,
+      catchphrases,
+    })
+  }
+
+  // Most talkative member first.
+  memberProfiles.sort((a, b) => b.totalMessages - a.totalMessages)
+
+  // Cross-member metrics: shared frequent words and similarity score.
+  let sharedWords: any[] = []
+  let similarityScore = 0
+
+  if (memberProfiles.length >= 2) {
+    const a = memberProfiles[0]
+    const b = memberProfiles[1]
+
+    // Words that appear in both members' top-word lists.
+    const wordsA = new Map(a.topWords.map((w: any) => [w.word, w.count]))
+    const wordsB = new Map(b.topWords.map((w: any) => [w.word, w.count]))
+    const shared: Array<{ word: string; countA: number; countB: number }> = []
+    for (const [word, countA] of wordsA) {
+      const countB = wordsB.get(word)
+      if (countB) {
+        shared.push({ word, countA, countB })
+      }
+    }
+    // Ascending sort followed by reverse yields combined count descending.
+    shared.sort((x, y) => x.countA + x.countB - (y.countA + y.countB))
+    shared.reverse()
+    sharedWords = shared.slice(0, 30)
+
+    // Cosine similarity over the POS-count vectors, scaled to 0-100.
+    const posKeys = ['noun', 'verb', 'adjective', 'adverb', 'modalParticle', 'interjection', 'other'] as const
+    const vecA = posKeys.map((k) => a.posDistribution[k] as number)
+    const vecB = posKeys.map((k) => b.posDistribution[k] as number)
+    similarityScore = Math.round(cosineSimilarity(vecA, vecB) * 100)
+  }
+
+  return { members: memberProfiles, sharedWords, similarityScore }
+}
@@ -33,6 +33,7 @@ export type { MembersPaginationParams, MembersPaginatedResult } from './basic'
 // 高级分析
 export {
  getCatchphraseAnalysis,
+  getLanguagePreferenceAnalysis,
  getMentionAnalysis,
  getMentionGraph,
  getLaughAnalysis,
@@ -375,6 +375,15 @@ export async function getCatchphraseAnalysis(sessionId: string, filter?: any): P
  return sendToWorker('getCatchphraseAnalysis', { sessionId, filter })
 }

+/**
+ * Run the language-preference analysis in the worker.
+ *
+ * @param params forwarded unchanged as the payload of the
+ *               'getLanguagePreferenceAnalysis' worker request
+ * @returns whatever the worker resolves with for that request
+ */
+export async function getLanguagePreferenceAnalysis(params: {
+  sessionId: string
+  locale: string
+  timeFilter?: any
+  dictType?: string
+}): Promise<any> {
+  const analysis = await sendToWorker('getLanguagePreferenceAnalysis', params)
+  return analysis
+}
+
 export async function getMentionAnalysis(sessionId: string, filter?: any): Promise<any> {
  return sendToWorker('getMentionAnalysis', { sessionId, filter })
 }