Files
ChatLab/electron/main/worker/query/advanced/languagePreference.ts
T
2026-04-17 23:44:23 +08:00

294 lines
9.5 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 语言偏好分析模块(私聊专用)
*
* 一次性对两个成员的全部文字消息做 NLP 分析:
* - 词频 + 词性分布
* - 语气词画像
* - 标点性格
* - 口头禅(整句匹配)
* - 跨成员:共同高频词、语言同频度
*/
import { openDatabase, buildTimeFilter, type TimeFilter } from '../../core'
import { getJieba, isStopword, MEANINGFUL_POS_TAGS } from '../../../nlp'
import type { SupportedLocale, DictType } from '../../../nlp'
// ---------- 标点 regex ----------
const RE_ELLIPSIS = /\.{2,}|…+|。{2,}/g
const RE_EXCLAMATION = /[!]+/g
const RE_QUESTION = /[?]+/g
const RE_TILDE = /[~]+/g
const RE_PERIOD = /[.。](?![.。])/g
const RE_ENDS_WITH_PUNCT = /[.。!?~~…,;::、)\])】》"'」』\-—]$/
// ---------- POS 归类 ----------
const NOUN_TAGS = new Set(['n', 'nr', 'ns', 'nt', 'nz', 'nw'])
const VERB_TAGS = new Set(['v', 'vn', 'vd', 'vg'])
const ADJ_TAGS = new Set(['a', 'an', 'ad', 'ag'])
const ADV_TAGS = new Set(['d'])
const MODAL_TAGS = new Set(['y', 'e'])
// ---------- URL / Emoji / mention 清理 ----------
const RE_URL = /https?:\/\/[^\s]+/g
const RE_MENTION = /@[^\s@]+/g
const RE_EMOJI =
/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]/gu
const RE_PUNCTUATION = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~,。!?、;:""''()【】《》…—~·\s]/g
const RE_PURE_NUMBER = /^\d+$/
function cleanTextForNlp(text: string): string {
return text
.replace(RE_URL, ' ')
.replace(RE_MENTION, ' ')
.replace(RE_EMOJI, ' ')
.replace(RE_PUNCTUATION, ' ')
.replace(/\s+/g, ' ')
.trim()
}
// ---------- 工具函数 ----------
function countMatches(text: string, regex: RegExp): number {
regex.lastIndex = 0
const m = text.match(regex)
return m ? m.length : 0
}
function cosineSimilarity(a: number[], b: number[]): number {
let dot = 0
let magA = 0
let magB = 0
for (let i = 0; i < a.length; i++) {
dot += a[i] * b[i]
magA += a[i] * a[i]
magB += b[i] * b[i]
}
const denom = Math.sqrt(magA) * Math.sqrt(magB)
return denom === 0 ? 0 : dot / denom
}
// ---------- 主入口 ----------
interface LanguagePreferenceParams {
sessionId: string
locale: string
timeFilter?: TimeFilter
dictType?: string
}
export function getLanguagePreferenceAnalysis(params: LanguagePreferenceParams): any {
const { sessionId, locale, timeFilter, dictType = 'default' } = params
const db = openDatabase(sessionId)
if (!db) return { members: [], sharedWords: [], similarityScore: 0 }
const { clause, params: filterParams } = buildTimeFilter(timeFilter)
let whereClause = clause
const textFilter =
" COALESCE(m.account_name, '') != '系统消息' AND msg.type = 0 AND msg.content IS NOT NULL AND LENGTH(TRIM(msg.content)) >= 2"
if (whereClause.includes('WHERE')) {
whereClause += ' AND ' + textFilter
} else {
whereClause = ' WHERE ' + textFilter
}
// 一次查询取全部文字消息,按 member 分组处理
const rows = db
.prepare(
`
SELECT
m.id as memberId,
COALESCE(m.group_nickname, m.account_name, m.platform_id) as name,
msg.content as content
FROM message msg
JOIN member m ON msg.sender_id = m.id
${whereClause}
ORDER BY m.id
`
)
.all(...filterParams) as Array<{ memberId: number; name: string; content: string }>
if (rows.length === 0) {
return { members: [], sharedWords: [], similarityScore: 0 }
}
// 按成员分组
const memberMessages = new Map<number, { name: string; messages: string[] }>()
for (const row of rows) {
let entry = memberMessages.get(row.memberId)
if (!entry) {
entry = { name: row.name, messages: [] }
memberMessages.set(row.memberId, entry)
}
entry.messages.push(row.content)
}
const isChinese = locale.startsWith('zh')
const effectiveLocale = (locale || 'zh-CN') as SupportedLocale
const minWordLength = isChinese ? 2 : 3
const memberProfiles: any[] = []
for (const [memberId, { name, messages }] of memberMessages) {
// 词频 + POS
const wordFreq = new Map<string, number>()
const posCount = { noun: 0, verb: 0, adjective: 0, adverb: 0, modalParticle: 0, interjection: 0, other: 0 }
const modalFreq = new Map<string, number>()
let totalWordCount = 0
// 标点
const punct = { ellipsis: 0, exclamation: 0, question: 0, tilde: 0, period: 0, noPunct: 0, total: 0 }
// 口头禅(整句频率)
const phraseFreq = new Map<string, number>()
for (const content of messages) {
// 标点分析(原始文本)
punct.ellipsis += countMatches(content, RE_ELLIPSIS)
punct.exclamation += countMatches(content, RE_EXCLAMATION)
punct.question += countMatches(content, RE_QUESTION)
punct.tilde += countMatches(content, RE_TILDE)
punct.period += countMatches(content, RE_PERIOD)
const trimmed = content.trim()
if (trimmed.length > 0 && !RE_ENDS_WITH_PUNCT.test(trimmed)) {
punct.noPunct++
}
punct.total++
// 口头禅
const normalised = trimmed
if (normalised.length >= 2) {
phraseFreq.set(normalised, (phraseFreq.get(normalised) || 0) + 1)
}
// NLP
const cleaned = cleanTextForNlp(content)
if (!cleaned) continue
if (isChinese) {
try {
const jieba = getJieba(dictType as DictType)
const tagged = jieba.tag(cleaned)
for (const { word, tag } of tagged) {
if (!word || word.trim().length === 0) continue
if (RE_PURE_NUMBER.test(word)) continue
if (word.length < minWordLength && !MODAL_TAGS.has(tag)) continue
// POS 归类
if (NOUN_TAGS.has(tag)) posCount.noun++
else if (VERB_TAGS.has(tag)) posCount.verb++
else if (ADJ_TAGS.has(tag)) posCount.adjective++
else if (ADV_TAGS.has(tag)) posCount.adverb++
else if (tag === 'y') posCount.modalParticle++
else if (tag === 'e') posCount.interjection++
else posCount.other++
// 语气词 / 叹词
if (MODAL_TAGS.has(tag)) {
modalFreq.set(word, (modalFreq.get(word) || 0) + 1)
}
// 有意义的词 → 词频
if (MEANINGFUL_POS_TAGS.has(tag) || MODAL_TAGS.has(tag)) {
if (!isStopword(word, effectiveLocale)) {
wordFreq.set(word, (wordFreq.get(word) || 0) + 1)
totalWordCount++
}
}
}
} catch {
// jieba 失败时跳过
}
} else {
// 非中文:简单 Intl.Segmenter 分词
try {
const segmenter = new Intl.Segmenter(locale, { granularity: 'word' })
for (const seg of segmenter.segment(cleaned)) {
if (!seg.isWordLike) continue
const w = seg.segment.toLowerCase()
if (w.length < minWordLength) continue
if (RE_PURE_NUMBER.test(w)) continue
if (isStopword(w, effectiveLocale)) continue
wordFreq.set(w, (wordFreq.get(w) || 0) + 1)
totalWordCount++
posCount.other++
}
} catch {
// fallback
}
}
}
// 过滤低频词 & 排序
const filteredWords = [...wordFreq.entries()].filter(([, c]) => c >= 2).sort((a, b) => b[1] - a[1])
const uniqueWords = filteredWords.length
const topWords = filteredWords.slice(0, 100).map(([word, count]) => ({ word, count }))
const lexicalDiversity = totalWordCount > 0 ? Math.round((uniqueWords / totalWordCount) * 10000) / 100 : 0
// 语气词 Top 20
const modalParticles = [...modalFreq.entries()]
.sort((a, b) => b[1] - a[1])
.slice(0, 20)
.map(([word, count]) => ({ word, count }))
// 口头禅 Top 50count >= 2
const catchphrases = [...phraseFreq.entries()]
.filter(([, c]) => c >= 2)
.sort((a, b) => b[1] - a[1])
.slice(0, 50)
.map(([content, count]) => ({ content, count }))
memberProfiles.push({
memberId,
name,
totalMessages: messages.length,
totalWords: totalWordCount,
uniqueWords,
lexicalDiversity,
topWords,
posDistribution: posCount,
modalParticles,
punctuation: punct,
catchphrases,
})
}
// 按消息总数降序
memberProfiles.sort((a, b) => b.totalMessages - a.totalMessages)
// 跨成员:共同高频词 & 语言同频度
let sharedWords: any[] = []
let similarityScore = 0
if (memberProfiles.length >= 2) {
const a = memberProfiles[0]
const b = memberProfiles[1]
// 共同高频词
const wordsA = new Map(a.topWords.map((w: any) => [w.word, w.count]))
const wordsB = new Map(b.topWords.map((w: any) => [w.word, w.count]))
const shared: Array<{ word: string; countA: number; countB: number }> = []
for (const [word, countA] of wordsA) {
const countB = wordsB.get(word)
if (countB) {
shared.push({ word, countA, countB })
}
}
shared.sort((x, y) => x.countA + x.countB - (y.countA + y.countB))
shared.reverse()
sharedWords = shared.slice(0, 30)
// 余弦相似度:基于 POS 分布向量
const posKeys = ['noun', 'verb', 'adjective', 'adverb', 'modalParticle', 'interjection', 'other'] as const
const vecA = posKeys.map((k) => a.posDistribution[k] as number)
const vecB = posKeys.map((k) => b.posDistribution[k] as number)
similarityScore = Math.round(cosineSimilarity(vecA, vecB) * 100)
}
return { members: memberProfiles, sharedWords, similarityScore }
}