feat: 新增语言偏好Tab

This commit is contained in:
digua
2026-04-17 20:56:45 +08:00
committed by digua
parent a1a587f791
commit 0aa99af048
29 changed files with 2235 additions and 61 deletions
+15
View File
@@ -480,6 +480,21 @@ export function registerChatHandlers(ctx: IpcContext): void {
}
)
/**
 * Language preference analysis (intended for private chats).
 *
 * Forwards the request to the worker. If the worker call fails, the error is
 * logged and an empty analysis result is returned instead of rejecting.
 */
ipcMain.handle(
  'chat:getLanguagePreferenceAnalysis',
  async (_, sessionId: string, locale: string, filter?: { startTs?: number; endTs?: number }, dictType?: string) => {
    // The worker expects the time range under the `timeFilter` key.
    const request = { sessionId, locale, timeFilter: filter, dictType }
    try {
      const analysis = await worker.getLanguagePreferenceAnalysis(request)
      return analysis
    } catch (error) {
      console.error('Failed to get language preference analysis:', error)
      return { members: [], sharedWords: [], similarityScore: 0 }
    }
  }
)
/**
* 获取 @ 互动分析数据
*/
+3
View File
@@ -29,6 +29,7 @@ import {
getSession,
getChatOverview,
getCatchphraseAnalysis,
getLanguagePreferenceAnalysis,
getMentionAnalysis,
getMentionGraph,
getLaughAnalysis,
@@ -104,6 +105,7 @@ const CACHEABLE_QUERIES = new Set([
'getMessageTypeDistribution',
'getTimeRange',
'getCatchphraseAnalysis',
'getLanguagePreferenceAnalysis',
'getMentionAnalysis',
'getMentionGraph',
'getLaughAnalysis',
@@ -184,6 +186,7 @@ const syncHandlers: Record<string, (payload: any) => any> = {
// 高级分析
getCatchphraseAnalysis: (p) => getCatchphraseAnalysis(p.sessionId, p.filter),
getLanguagePreferenceAnalysis: (p) => getLanguagePreferenceAnalysis(p),
getMentionAnalysis: (p) => getMentionAnalysis(p.sessionId, p.filter),
getMentionGraph: (p) => getMentionGraph(p.sessionId, p.filter),
getLaughAnalysis: (p) => getLaughAnalysis(p.sessionId, p.filter, p.keywords),
@@ -18,6 +18,9 @@ export type {
ClusterGraphOptions,
} from './social'
// 语言偏好分析(私聊专用)
export { getLanguagePreferenceAnalysis } from './languagePreference'
// 关系分析(私聊主动性)
export { getRelationshipStats } from './relationship'
export type {
@@ -0,0 +1,293 @@
/**
* 语言偏好分析模块(私聊专用)
*
* 一次性对两个成员的全部文字消息做 NLP 分析:
* - 词频 + 词性分布
* - 语气词画像
* - 标点性格
* - 口头禅(整句匹配)
* - 跨成员:共同高频词、语言同频度
*/
import { openDatabase, buildTimeFilter, type TimeFilter } from '../../core'
import { getJieba, isStopword, MEANINGFUL_POS_TAGS } from '../../../nlp'
import type { SupportedLocale, DictType } from '../../../nlp'
// ---------- Punctuation regexes ----------
// Punctuation statistics are computed on the raw message text. Each mark is matched
// in both its ASCII and its full-width (CJK) form, because Chinese input methods
// emit the full-width variant (e.g. U+FF01 instead of "!").

/** An ellipsis: two or more ASCII dots, one or more "…", or two or more "。". */
const RE_ELLIPSIS = /\.{2,}|…+|。{2,}/g
/** A run of exclamation marks ("!" or U+FF01); the whole run is a single match. */
const RE_EXCLAMATION = /[!\uFF01]+/g
/** A run of question marks ("?" or U+FF1F); the whole run is a single match. */
const RE_QUESTION = /[?\uFF1F]+/g
/** A run of tildes ("~" or U+FF5E); the whole run is a single match. */
const RE_TILDE = /[~\uFF5E]+/g
/**
 * A lone full stop ("." or "。").
 * The lookbehind and lookahead together exclude every dot of a dot run, so the
 * last dot of "..." is no longer counted as a period on top of the ellipsis.
 */
const RE_PERIOD = /(?<![.。])[.。](?![.。])/g
/**
 * True when the text ends with a punctuation mark or closing bracket/quote.
 * The escapes add the full-width forms of ! ? ~ , ; : ) and the closing curly quotes.
 */
const RE_ENDS_WITH_PUNCT =
  /[.。!?~…,;:、)\])】》"'」』\-—\uFF01\uFF1F\uFF5E\uFF0C\uFF1B\uFF1A\uFF09\u201D\u2019]$/

// ---------- POS tag groups ----------
/** Tags counted as nouns. */
const NOUN_TAGS = new Set(['n', 'nr', 'ns', 'nt', 'nz', 'nw'])
/** Tags counted as verbs. */
const VERB_TAGS = new Set(['v', 'vn', 'vd', 'vg'])
/** Tags counted as adjectives. */
const ADJ_TAGS = new Set(['a', 'an', 'ad', 'ag'])
/** Tags counted as adverbs. */
const ADV_TAGS = new Set(['d'])
/** Modal particle ('y') and interjection ('e') tags; these words bypass the minimum-length filter. */
const MODAL_TAGS = new Set(['y', 'e'])

// ---------- URL / emoji / mention cleanup ----------
/** http(s) URLs up to the next whitespace. */
const RE_URL = /https?:\/\/[^\s]+/g
/** "@name" mentions up to the next whitespace or "@". */
const RE_MENTION = /@[^\s@]+/g
/** Emoji from a fixed set of Unicode ranges (emoticons, pictographs, transport, flags, misc symbols, dingbats). */
const RE_EMOJI =
  /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]/gu
/**
 * Any single punctuation or whitespace character: the ASCII punctuation set plus
 * CJK punctuation, including the full-width forms of , ! ? ; : ( ) ~ and curly quotes.
 */
const RE_PUNCTUATION =
  /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~\uFF0C。\uFF01\uFF1F、\uFF1B\uFF1A\u201C\u201D\u2018\u2019\uFF08\uFF09【】《》…—\uFF5E·\s]/g
/** A token made of digits only. */
const RE_PURE_NUMBER = /^\d+$/
/**
 * Prepares raw message text for tokenization.
 *
 * URLs, mentions, emoji and punctuation are each replaced by a space (in that
 * order), then whitespace runs are collapsed and the result is trimmed.
 */
function cleanTextForNlp(text: string): string {
  const noisePatterns = [RE_URL, RE_MENTION, RE_EMOJI, RE_PUNCTUATION]
  let stripped = text
  for (const pattern of noisePatterns) {
    stripped = stripped.replace(pattern, ' ')
  }
  const collapsed = stripped.replace(/\s+/g, ' ')
  return collapsed.trim()
}
// ---------- 工具函数 ----------
/**
 * Counts how many times `regex` matches in `text`.
 *
 * `lastIndex` is reset first so a shared regex never carries state over from a
 * previous use. Returns 0 when there is no match.
 */
function countMatches(text: string, regex: RegExp): number {
  regex.lastIndex = 0
  return text.match(regex)?.length ?? 0
}
/**
 * Cosine similarity of two numeric vectors.
 *
 * Vectors of different lengths are compared as if the shorter one were padded
 * with zeros, so a length mismatch can neither produce NaN nor silently ignore
 * the extra components of the longer vector.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns `dot(a, b) / (|a| * |b|)`, or 0 when either vector has zero magnitude
 *          (including when either is empty).
 */
function cosineSimilarity(a: number[], b: number[]): number {
  const length = Math.max(a.length, b.length)
  let dot = 0
  let magA = 0
  let magB = 0
  for (let i = 0; i < length; i++) {
    // Missing components count as 0.
    const x = a[i] ?? 0
    const y = b[i] ?? 0
    dot += x * y
    magA += x * x
    magB += y * y
  }
  const denom = Math.sqrt(magA) * Math.sqrt(magB)
  return denom === 0 ? 0 : dot / denom
}
// ---------- 主入口 ----------
/**
 * Input for `getLanguagePreferenceAnalysis`.
 */
interface LanguagePreferenceParams {
/** Session identifier; passed to `openDatabase` to open the chat database. */
sessionId: string
/** Locale tag; a value starting with `zh` selects the jieba-based analysis, anything else uses `Intl.Segmenter`. */
locale: string
/** Optional time filter; handed to `buildTimeFilter` to build the SQL WHERE clause. */
timeFilter?: TimeFilter
/** Optional jieba dictionary type; `'default'` is used when omitted. */
dictType?: string
}
/** A word together with the number of times it occurred. */
interface WordCount {
  word: string
  count: number
}

/** Token counters per part-of-speech category. */
interface PosDistribution {
  noun: number
  verb: number
  adjective: number
  adverb: number
  modalParticle: number
  interjection: number
  other: number
}

/** Punctuation counters; `noPunct` and `total` count messages, the rest count matches. */
interface PunctuationStats {
  ellipsis: number
  exclamation: number
  question: number
  tilde: number
  period: number
  noPunct: number
  total: number
}

/** Language profile of a single member. */
interface MemberLanguageProfile {
  memberId: number
  name: string
  totalMessages: number
  totalWords: number
  uniqueWords: number
  lexicalDiversity: number
  topWords: WordCount[]
  posDistribution: PosDistribution
  modalParticles: WordCount[]
  punctuation: PunctuationStats
  catchphrases: Array<{ content: string; count: number }>
}

/** A word that appears in the top-word lists of both compared members. */
interface SharedWord {
  word: string
  countA: number
  countB: number
}

/** Tokenizer state that is identical for every message and therefore built once. */
interface TokenizerContext {
  isChinese: boolean
  effectiveLocale: SupportedLocale
  minWordLength: number
  jieba: ReturnType<typeof loadJieba>
  segmenter: ReturnType<typeof createWordSegmenter>
}

/** Loads the jieba instance for `dictType`; logs and returns null when it cannot be loaded. */
function loadJieba(dictType: string) {
  try {
    return getJieba(dictType as DictType)
  } catch (error) {
    console.warn('Language preference analysis: failed to load jieba, word statistics will be empty:', error)
    return null
  }
}

/** Creates a word-granularity `Intl.Segmenter`; logs and returns null when the locale is rejected. */
function createWordSegmenter(localeTag: string) {
  try {
    return new Intl.Segmenter(localeTag, { granularity: 'word' })
  } catch (error) {
    console.warn(
      `Language preference analysis: cannot create a word segmenter for locale "${localeTag}", word statistics will be empty:`,
      error
    )
    return null
  }
}

/** Adds one to the counter stored under `key`. */
function incrementCount(freq: Map<string, number>, key: string): void {
  freq.set(key, (freq.get(key) || 0) + 1)
}

/** Returns the entries with at least `minCount` occurrences, most frequent first. */
function rankByCount(freq: Map<string, number>, minCount: number): Array<[string, number]> {
  return [...freq.entries()].filter(([, count]) => count >= minCount).sort((x, y) => y[1] - x[1])
}

/** Computes word, part-of-speech, modal-particle, punctuation and catchphrase statistics for one member. */
function buildMemberProfile(
  memberId: number,
  name: string,
  messages: string[],
  ctx: TokenizerContext
): MemberLanguageProfile {
  const wordFreq = new Map<string, number>()
  const modalFreq = new Map<string, number>()
  const phraseFreq = new Map<string, number>()
  const posCount: PosDistribution = {
    noun: 0,
    verb: 0,
    adjective: 0,
    adverb: 0,
    modalParticle: 0,
    interjection: 0,
    other: 0,
  }
  const punct: PunctuationStats = {
    ellipsis: 0,
    exclamation: 0,
    question: 0,
    tilde: 0,
    period: 0,
    noPunct: 0,
    total: 0,
  }
  let totalWordCount = 0

  for (const content of messages) {
    // Punctuation statistics use the raw text.
    punct.ellipsis += countMatches(content, RE_ELLIPSIS)
    punct.exclamation += countMatches(content, RE_EXCLAMATION)
    punct.question += countMatches(content, RE_QUESTION)
    punct.tilde += countMatches(content, RE_TILDE)
    punct.period += countMatches(content, RE_PERIOD)
    const trimmed = content.trim()
    if (trimmed.length > 0 && !RE_ENDS_WITH_PUNCT.test(trimmed)) {
      punct.noPunct++
    }
    punct.total++

    // Catchphrases are whole messages that repeat verbatim.
    if (trimmed.length >= 2) {
      incrementCount(phraseFreq, trimmed)
    }

    const cleaned = cleanTextForNlp(content)
    if (!cleaned) continue

    if (ctx.isChinese) {
      if (!ctx.jieba) continue
      try {
        for (const { word, tag } of ctx.jieba.tag(cleaned)) {
          if (!word || word.trim().length === 0) continue
          if (RE_PURE_NUMBER.test(word)) continue
          const isModal = MODAL_TAGS.has(tag)
          // Modal particles and interjections are usually one character, so they skip the length filter.
          if (word.length < ctx.minWordLength && !isModal) continue

          if (NOUN_TAGS.has(tag)) posCount.noun++
          else if (VERB_TAGS.has(tag)) posCount.verb++
          else if (ADJ_TAGS.has(tag)) posCount.adjective++
          else if (ADV_TAGS.has(tag)) posCount.adverb++
          else if (tag === 'y') posCount.modalParticle++
          else if (tag === 'e') posCount.interjection++
          else posCount.other++

          if (isModal) {
            incrementCount(modalFreq, word)
          }

          // Only meaningful, non-stopword tokens enter the word-frequency table.
          if ((MEANINGFUL_POS_TAGS.has(tag) || isModal) && !isStopword(word, ctx.effectiveLocale)) {
            incrementCount(wordFreq, word)
            totalWordCount++
          }
        }
      } catch {
        // Best effort: a message jieba cannot process is left out of the word statistics.
      }
    } else {
      if (!ctx.segmenter) continue
      try {
        for (const seg of ctx.segmenter.segment(cleaned)) {
          if (!seg.isWordLike) continue
          const word = seg.segment.toLowerCase()
          if (word.length < ctx.minWordLength) continue
          if (RE_PURE_NUMBER.test(word)) continue
          if (isStopword(word, ctx.effectiveLocale)) continue
          incrementCount(wordFreq, word)
          totalWordCount++
          // No part-of-speech information is available on this path.
          posCount.other++
        }
      } catch {
        // Best effort: a message that cannot be segmented is left out of the word statistics.
      }
    }
  }

  // Words seen only once are dropped before ranking.
  const rankedWords = rankByCount(wordFreq, 2)
  // NOTE(review): uniqueWords counts only words seen at least twice, while totalWordCount
  // counts every accepted token, so lexicalDiversity is not a plain type/token ratio.
  const uniqueWords = rankedWords.length
  const lexicalDiversity = totalWordCount > 0 ? Math.round((uniqueWords / totalWordCount) * 10000) / 100 : 0

  return {
    memberId,
    name,
    totalMessages: messages.length,
    totalWords: totalWordCount,
    uniqueWords,
    lexicalDiversity,
    // Top 100 words.
    topWords: rankedWords.slice(0, 100).map(([word, count]) => ({ word, count })),
    posDistribution: posCount,
    // Top 20 modal particles / interjections.
    modalParticles: rankByCount(modalFreq, 1)
      .slice(0, 20)
      .map(([word, count]) => ({ word, count })),
    punctuation: punct,
    // Top 50 catchphrases with count >= 2.
    catchphrases: rankByCount(phraseFreq, 2)
      .slice(0, 50)
      .map(([content, count]) => ({ content, count })),
  }
}

/** Compares two profiles: words shared by both top-word lists and the similarity of their POS distributions. */
function compareProfiles(
  a: MemberLanguageProfile,
  b: MemberLanguageProfile
): { sharedWords: SharedWord[]; similarityScore: number } {
  const countsB = new Map(b.topWords.map((entry): [string, number] => [entry.word, entry.count]))
  const shared: SharedWord[] = []
  for (const { word, count: countA } of a.topWords) {
    const countB = countsB.get(word)
    if (countB) {
      shared.push({ word, countA, countB })
    }
  }
  // Ascending sort followed by reverse is kept so the ordering of ties is unchanged.
  shared.sort((x, y) => x.countA + x.countB - (y.countA + y.countB))
  shared.reverse()

  // Cosine similarity of the part-of-speech count vectors, expressed as an integer percentage.
  const posKeys = ['noun', 'verb', 'adjective', 'adverb', 'modalParticle', 'interjection', 'other'] as const
  const vecA = posKeys.map((key) => a.posDistribution[key])
  const vecB = posKeys.map((key) => b.posDistribution[key])

  return {
    sharedWords: shared.slice(0, 30),
    similarityScore: Math.round(cosineSimilarity(vecA, vecB) * 100),
  }
}

/**
 * Language preference analysis (intended for private chats).
 *
 * Loads every text message of the session in one query, groups the messages by
 * sender and builds one profile per member: word frequency, part-of-speech
 * distribution, modal particles, punctuation habits and verbatim catchphrases.
 * The two members with the most messages are then compared.
 *
 * @param params Session id, locale, optional time filter and optional jieba
 *               dictionary type. A missing or empty locale is treated as `zh-CN`.
 * @returns `{ members, sharedWords, similarityScore }`. `members` is sorted by
 *          message count, descending. `sharedWords` holds at most 30 words and
 *          `similarityScore` is an integer percentage; both stay empty/0 when
 *          fewer than two members have messages. An empty result is returned
 *          when the database cannot be opened or no message matches.
 */
export function getLanguagePreferenceAnalysis(params: LanguagePreferenceParams): any {
  const { sessionId, locale, timeFilter, dictType = 'default' } = params
  const db = openDatabase(sessionId)
  if (!db) return { members: [], sharedWords: [], similarityScore: 0 }

  const { clause, params: filterParams } = buildTimeFilter(timeFilter)
  let whereClause = clause
  const textFilter =
    " COALESCE(m.account_name, '') != '系统消息' AND msg.type = 0 AND msg.content IS NOT NULL AND LENGTH(TRIM(msg.content)) >= 2"
  if (whereClause.includes('WHERE')) {
    whereClause += ' AND ' + textFilter
  } else {
    whereClause = ' WHERE ' + textFilter
  }

  // One query for all text messages; grouping by member happens in memory.
  const rows = db
    .prepare(
      `
      SELECT
        m.id as memberId,
        COALESCE(m.group_nickname, m.account_name, m.platform_id) as name,
        msg.content as content
      FROM message msg
      JOIN member m ON msg.sender_id = m.id
      ${whereClause}
      ORDER BY m.id
      `
    )
    .all(...filterParams) as Array<{ memberId: number; name: string; content: string }>
  if (rows.length === 0) {
    return { members: [], sharedWords: [], similarityScore: 0 }
  }

  const memberMessages = new Map<number, { name: string; messages: string[] }>()
  for (const row of rows) {
    let entry = memberMessages.get(row.memberId)
    if (!entry) {
      entry = { name: row.name, messages: [] }
      memberMessages.set(row.memberId, entry)
    }
    entry.messages.push(row.content)
  }

  // Resolve the locale before inspecting it, so a missing locale cannot throw and
  // the tokenizer choice always agrees with the stopword locale.
  const localeTag = locale || 'zh-CN'
  const isChinese = localeTag.startsWith('zh')
  // Only the tokenizer that will actually be used is created, and only once.
  const ctx: TokenizerContext = {
    isChinese,
    effectiveLocale: localeTag as SupportedLocale,
    minWordLength: isChinese ? 2 : 3,
    jieba: isChinese ? loadJieba(dictType) : null,
    segmenter: isChinese ? null : createWordSegmenter(localeTag),
  }

  const memberProfiles = [...memberMessages].map(([memberId, { name, messages }]) =>
    buildMemberProfile(memberId, name, messages, ctx)
  )
  // Most active member first.
  memberProfiles.sort((x, y) => y.totalMessages - x.totalMessages)

  const [first, second] = memberProfiles
  if (!first || !second) {
    return { members: memberProfiles, sharedWords: [], similarityScore: 0 }
  }
  const { sharedWords, similarityScore } = compareProfiles(first, second)
  return { members: memberProfiles, sharedWords, similarityScore }
}
+1
View File
@@ -33,6 +33,7 @@ export type { MembersPaginationParams, MembersPaginatedResult } from './basic'
// 高级分析
export {
getCatchphraseAnalysis,
getLanguagePreferenceAnalysis,
getMentionAnalysis,
getMentionGraph,
getLaughAnalysis,
+9
View File
@@ -375,6 +375,15 @@ export async function getCatchphraseAnalysis(sessionId: string, filter?: any): P
return sendToWorker('getCatchphraseAnalysis', { sessionId, filter })
}
/**
 * Requests the language preference analysis from the worker.
 *
 * The parameter object is forwarded unchanged as the payload of the
 * `getLanguagePreferenceAnalysis` worker message.
 */
export async function getLanguagePreferenceAnalysis(params: {
  sessionId: string
  locale: string
  timeFilter?: any
  dictType?: string
}): Promise<any> {
  const analysis = await sendToWorker('getLanguagePreferenceAnalysis', params)
  return analysis
}
/**
 * Requests the mention analysis from the worker, sending the session id and
 * the optional filter as the payload of the `getMentionAnalysis` message.
 */
export async function getMentionAnalysis(sessionId: string, filter?: any): Promise<any> {
  const payload = { sessionId, filter }
  const analysis = await sendToWorker('getMentionAnalysis', payload)
  return analysis
}