diff --git a/electron-builder.yml b/electron-builder.yml index de4a59f5..a17c36de 100644 --- a/electron-builder.yml +++ b/electron-builder.yml @@ -19,6 +19,11 @@ files: - '!{.eslintignore,.eslintrc.cjs,.prettierignore,.prettierrc.yaml,CHANGELOG.md,README.md}' - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}' - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}' + # jieba 词库文件已改为启动时远程下载,无需打包内置 + - '!node_modules/@node-rs/jieba/dict.txt' + - '!node_modules/@node-rs/jieba/idf.txt' + - '!node_modules/@node-rs/jieba/dict.js' + - '!node_modules/@node-rs/jieba/dict.d.ts' # 哪些文件将不会被压缩,而是解压到构建目录 asarUnpack: - resources/** diff --git a/electron/main/ipc/nlp.ts b/electron/main/ipc/nlp.ts index 40df10dc..81769f26 100644 --- a/electron/main/ipc/nlp.ts +++ b/electron/main/ipc/nlp.ts @@ -1,12 +1,13 @@ /** * NLP 功能 IPC 处理器 - * 提供词频统计、分词等 NLP 功能 + * 提供词频统计、分词等 NLP 功能,以及词库管理 */ import { ipcMain } from 'electron' import * as worker from '../worker/workerManager' import type { IpcContext } from './types' import type { WordFrequencyParams, WordFrequencyResult, SupportedLocale, PosTagInfo } from '../nlp' +import { getDictList, downloadDict, deleteDict, isDictDownloaded, type DictInfo } from '../nlp/dictManager' /** * 注册 NLP 相关 IPC 处理器 @@ -60,4 +61,25 @@ export function registerNlpHandlers(_ctx: IpcContext): void { return [] } }) + + // ==================== 词库管理 ==================== + + ipcMain.handle('nlp:getDictList', async (): Promise => { + return getDictList() + }) + + ipcMain.handle('nlp:isDictDownloaded', async (_event, dictId: string): Promise => { + return isDictDownloaded(dictId) + }) + + ipcMain.handle( + 'nlp:downloadDict', + async (_event, dictId: string): Promise<{ success: boolean; error?: string }> => { + return downloadDict(dictId) + } + ) + + ipcMain.handle('nlp:deleteDict', async (_event, dictId: string): Promise<{ success: boolean; error?: string }> => { + return deleteDict(dictId) + }) } diff --git a/electron/main/nlp/dictManager.ts b/electron/main/nlp/dictManager.ts new file mode 100644 index 00000000..b286bfe4 --- /dev/null +++ b/electron/main/nlp/dictManager.ts @@ -0,0 +1,172 @@ +/** + * NLP 词库管理器 + * 负责自定义词库的下载、查询、删除 + * 词库存储在 userData/nlp/ 目录下 + */ + +import * as fs from 'fs' +import * as path from 'path' +import { app } from 'electron' +import axios from 'axios' + +const NLP_DIR_NAME = 'nlp' +const DICT_DOWNLOAD_URL_BASE = 'https://chatlab.fun/assets/nlp' + +export interface DictInfo { + id: string + label: string + locale: string + downloaded: boolean + fileSize?: number +} + +const AVAILABLE_DICTS: Omit[] = [ + { id: 'zh-CN', label: '简体中文', locale: 'zh-CN' }, + { id: 'zh-TW', label: '繁體中文', locale: 'zh-TW' }, +] + +export function getNlpDir(): string { + const userDataPath = app.getPath('userData') + return path.join(userDataPath, 'data', NLP_DIR_NAME) +} + +function ensureNlpDir(): void { + const dir = getNlpDir() + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }) + } +} + +function getDictFilePath(dictId: string): string { + return path.join(getNlpDir(), `${dictId}.dict`) +} + +function getDictDownloadUrl(dictId: string): string { + return `${DICT_DOWNLOAD_URL_BASE}/${dictId}.dict` +} + +export function isDictDownloaded(dictId: string): boolean { + return fs.existsSync(getDictFilePath(dictId)) +} + +export function getDictList(): DictInfo[] { + return AVAILABLE_DICTS.map((d) => { + const filePath = getDictFilePath(d.id) + const downloaded = fs.existsSync(filePath) + let fileSize: number | undefined + if (downloaded) { + try { + fileSize = fs.statSync(filePath).size + } catch { + /* ignore */ + } + } + return { ...d, downloaded, fileSize } + }) +} + +export function loadDictBuffer(dictId: string): Buffer | null { + const filePath = getDictFilePath(dictId) + if (!fs.existsSync(filePath)) return null + try { + return fs.readFileSync(filePath) + } catch (error) { + console.error(`[NLP DictManager] Failed to read dict file: ${filePath}`, error) + return null + } +} + +export async function downloadDict( + dictId: string, + onProgress?: (percent: number) => void +): Promise<{ success: boolean; error?: string }> { + const dictDef = AVAILABLE_DICTS.find((d) => d.id === dictId) + if (!dictDef) { + return { success: false, error: `Unknown dict: ${dictId}` } + } + + ensureNlpDir() + const url = getDictDownloadUrl(dictId) + const filePath = getDictFilePath(dictId) + const tmpPath = filePath + '.tmp' + + try { + const response = await axios.get(url, { + responseType: 'arraybuffer', + timeout: 120_000, + onDownloadProgress: (progressEvent) => { + if (progressEvent.total && onProgress) { + onProgress(Math.round((progressEvent.loaded / progressEvent.total) * 100)) + } + }, + }) + + const buffer = Buffer.from(response.data) + + // 词库文件至少应 > 1MB,且不应以 HTML 标签开头 + const MIN_DICT_SIZE = 1_000_000 + if (buffer.length < MIN_DICT_SIZE) { + const preview = buffer.subarray(0, 200).toString('utf-8') + console.error(`[NLP DictManager] Downloaded file too small (${buffer.length} bytes), preview: ${preview}`) + return { success: false, error: `Downloaded file is invalid (${buffer.length} bytes). The dictionary URL may not be available yet.` } + } + + const head = buffer.subarray(0, 50).toString('utf-8').trim() + if (head.startsWith(' { + if (isDictDownloaded('zh-CN')) return + + console.log('[NLP DictManager] zh-CN dict not found, starting background download...') + const result = await downloadDict('zh-CN') + if (result.success) { + console.log('[NLP DictManager] zh-CN dict auto-downloaded successfully') + } else { + console.warn('[NLP DictManager] zh-CN dict auto-download failed:', result.error) + } +} + +export function deleteDict(dictId: string): { success: boolean; error?: string } { + const filePath = getDictFilePath(dictId) + if (!fs.existsSync(filePath)) { + return { success: true } + } + try { + fs.unlinkSync(filePath) + console.log(`[NLP DictManager] Dict deleted: ${dictId}`) + return { success: true } + } catch (error) { + const msg = error instanceof Error ? error.message : String(error) + console.error(`[NLP DictManager] Delete failed for ${dictId}:`, msg) + return { success: false, error: msg } + } +} diff --git a/electron/main/nlp/ftsTokenizer.ts b/electron/main/nlp/ftsTokenizer.ts index 159fe497..756b2d7f 100644 --- a/electron/main/nlp/ftsTokenizer.ts +++ b/electron/main/nlp/ftsTokenizer.ts @@ -6,29 +6,11 @@ * * 使用 jieba 处理中文(天然兼容中英混合文本), * Intl.Segmenter 处理纯英文/日文。 + * + * 复用 segmenter 模块的 jieba 实例池,默认使用 zh-CN 词库。 */ -interface JiebaInstance { - cut: (text: string, hmm?: boolean) => string[] -} - -let jiebaInstance: JiebaInstance | null = null - -function getJieba(): JiebaInstance { - if (!jiebaInstance) { - try { - // eslint-disable-next-line @typescript-eslint/no-require-imports - const { Jieba } = require('@node-rs/jieba') - // eslint-disable-next-line @typescript-eslint/no-require-imports - const { dict } = require('@node-rs/jieba/dict') - jiebaInstance = Jieba.withDict(dict) - } catch (error) { - console.error('[FTS] Failed to load jieba module:', error) - throw new Error('jieba 模块加载失败') - } - } - return jiebaInstance! -} +import { getJieba } from './segmenter' /** * 对文本进行 FTS 分词,返回空格分隔的 token 字符串。 diff --git a/electron/main/nlp/index.ts b/electron/main/nlp/index.ts index 2e9c129d..f8fa2c2e 100644 --- a/electron/main/nlp/index.ts +++ b/electron/main/nlp/index.ts @@ -5,3 +5,5 @@ export * from './types' export * from './stopwords' export * from './segmenter' +// dictManager 需要 electron app 模块,只能在主进程中直接导入 +// import { ... } from './dictManager' diff --git a/electron/main/nlp/segmenter.ts b/electron/main/nlp/segmenter.ts index 3a4c32a1..e6d7b12a 100644 --- a/electron/main/nlp/segmenter.ts +++ b/electron/main/nlp/segmenter.ts @@ -1,42 +1,85 @@ /** * 分词器模块 * 中文使用 @node-rs/jieba,其他语言使用 Intl.Segmenter + * + * 支持多词库:默认内置简体中文词库,可通过 dictType 加载繁体中文等自定义词库。 + * 自定义词库文件存储在 nlpDir 目录下(由 Worker 初始化时传入)。 */ -import type { SupportedLocale, PosFilterMode, PosTagInfo } from './types' +import * as fs from 'fs' +import * as path from 'path' +import type { SupportedLocale, PosFilterMode, PosTagInfo, DictType } from './types' import { isStopword } from './stopwords' -// Jieba 实例类型 +export type { DictType } + interface JiebaInstance { cut: (text: string, hmm?: boolean) => string[] tag: (text: string) => Array<{ tag: string; word: string }> } -// Jieba 实例(延迟初始化) -let jiebaInstance: JiebaInstance | null = null +let _nlpDir: string | null = null + +const jiebaInstances = new Map() /** - * 获取 Jieba 实例(延迟加载) + * 由 Worker 初始化时调用,设置自定义词库目录路径 */ -function getJieba(): JiebaInstance { - if (!jiebaInstance) { - try { - // eslint-disable-next-line @typescript-eslint/no-require-imports - const { Jieba } = require('@node-rs/jieba') - // eslint-disable-next-line @typescript-eslint/no-require-imports - const { dict } = require('@node-rs/jieba/dict') - jiebaInstance = Jieba.withDict(dict) - console.log('[NLP] jieba module loaded') - } catch (error) { - console.error('[NLP] Failed to load jieba module:', error) - throw new Error('jieba 模块加载失败') +export function initNlpDir(nlpDir: string): void { + _nlpDir = nlpDir +} + +/** + * 尝试从 nlpDir 加载词库文件,返回 Buffer 或 null + */ +function tryLoadDictFromDisk(dictId: string): Buffer | null { + if (!_nlpDir) return null + const dictPath = path.join(_nlpDir, `${dictId}.dict`) + if (!fs.existsSync(dictPath)) return null + try { + return fs.readFileSync(dictPath) + } catch { + return null + } +} + +/** + * 获取 Jieba 实例(支持多词库) + * + * 所有词库均从 nlpDir 磁盘加载(由应用启动时自动下载)。 + * default 和 zh-CN 共用同一实例。 + */ +export function getJieba(dictType: DictType = 'default'): JiebaInstance { + const effectiveType = dictType === 'default' ? 'zh-CN' : dictType + const cached = jiebaInstances.get(effectiveType) + if (cached) return cached + + try { + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { Jieba } = require('@node-rs/jieba') + + const diskDict = tryLoadDictFromDisk(effectiveType) + if (!diskDict) { + throw new Error(`Dict file not found for: ${effectiveType}. Please ensure the dictionary has been downloaded.`) } + + const instance: JiebaInstance = Jieba.withDict(diskDict) + console.log(`[NLP] jieba dict loaded: ${effectiveType} (${diskDict.length} bytes)`) + + jiebaInstances.set(effectiveType, instance) + return instance + } catch (error) { + console.error(`[NLP] Failed to load jieba module (dict=${effectiveType}):`, error) + throw new Error(`jieba 模块加载失败 (${effectiveType})`) } - const instance = jiebaInstance - if (!instance) { - throw new Error('jieba 模块未初始化') - } - return instance +} + +/** + * 清除指定词库的缓存实例(词库更新后调用) + */ +export function clearJiebaInstance(dictType: DictType): void { + jiebaInstances.delete(dictType) + console.log(`[NLP] jieba instance cleared: ${dictType}`) } /** @@ -147,6 +190,8 @@ interface ChineseSegmentOptions { posFilterMode?: PosFilterMode /** 自定义词性过滤列表 */ customPosTags?: string[] + /** 词库类型 */ + dictType?: DictType } /** @@ -156,12 +201,13 @@ interface ChineseSegmentOptions { export function collectPosTagStats( texts: string[], minWordLength: number = 2, - enableStopwords: boolean = true + enableStopwords: boolean = true, + dictType: DictType = 'default' ): Map { const posStats = new Map() try { - const jieba = getJieba() + const jieba = getJieba(dictType) for (const text of texts) { const cleaned = cleanText(text) @@ -191,36 +237,31 @@ export function collectPosTagStats( * @param options 分词选项 */ function segmentChinese(text: string, options: ChineseSegmentOptions = {}): string[] { - const { posFilterMode = 'meaningful', customPosTags } = options + const { posFilterMode = 'meaningful', customPosTags, dictType = 'default' } = options const cleaned = cleanText(text) if (!cleaned) return [] try { - const jieba = getJieba() + const jieba = getJieba(dictType) - // 全部模式:直接分词,不做词性过滤 if (posFilterMode === 'all') { return jieba.cut(cleaned, false) } - // 使用词性标注 const tagged = jieba.tag(cleaned) - // 根据模式过滤 let allowedTags: Set if (posFilterMode === 'custom' && customPosTags) { allowedTags = new Set(customPosTags) } else { - // meaningful 模式 allowedTags = MEANINGFUL_POS_TAGS } return tagged.filter((item) => allowedTags.has(item.tag)).map((item) => item.word) } catch (error) { console.error('[NLP] Chinese segmentation failed:', error) - // 降级:使用简单分词 try { - const jieba = getJieba() + const jieba = getJieba('default') return jieba.cut(cleaned, false) } catch { return cleaned.split('') @@ -277,6 +318,8 @@ export interface SegmentOptions { customPosTags?: string[] /** 是否启用停用词过滤 */ enableStopwords?: boolean + /** 词库类型(仅中文有效) */ + dictType?: DictType } /** @@ -287,7 +330,13 @@ export interface SegmentOptions { * @returns 过滤后的分词结果 */ export function segment(text: string, locale: SupportedLocale, options: SegmentOptions = {}): string[] { - const { minLength, posFilterMode = 'meaningful', customPosTags, enableStopwords = true } = options + const { + minLength, + posFilterMode = 'meaningful', + customPosTags, + enableStopwords = true, + dictType = 'default', + } = options const isChinese = locale.startsWith('zh') const isJapanese = locale === 'ja-JP' const defaultMinLength = isChinese || isJapanese ? 2 : 3 @@ -296,7 +345,7 @@ export function segment(text: string, locale: SupportedLocale, options: SegmentO let words: string[] if (isChinese) { - words = segmentChinese(text, { posFilterMode, customPosTags }) + words = segmentChinese(text, { posFilterMode, customPosTags, dictType }) } else if (isJapanese) { words = segmentJapanese(text) } else { @@ -326,11 +375,11 @@ export function batchSegmentWithFrequency( locale: SupportedLocale, options: BatchSegmentOptions = {} ): Map { - const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords } = options + const { minLength, minCount = 2, topN = 100, posFilterMode, customPosTags, enableStopwords, dictType } = options const wordFrequency = new Map() for (const text of texts) { - const words = segment(text, locale, { minLength, posFilterMode, customPosTags, enableStopwords }) + const words = segment(text, locale, { minLength, posFilterMode, customPosTags, enableStopwords, dictType }) for (const word of words) { wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1) } diff --git a/electron/main/nlp/types.ts b/electron/main/nlp/types.ts index 9f786dee..be099c6c 100644 --- a/electron/main/nlp/types.ts +++ b/electron/main/nlp/types.ts @@ -48,6 +48,9 @@ export interface WordFrequencyResult { /** 词性过滤模式 */ export type PosFilterMode = 'all' | 'meaningful' | 'custom' +/** 词库类型 */ +export type DictType = 'default' | 'zh-CN' | 'zh-TW' + /** 词频统计参数 */ export interface WordFrequencyParams { /** 会话 ID */ @@ -73,6 +76,8 @@ export interface WordFrequencyParams { customPosTags?: string[] /** 是否启用停用词过滤,默认 true */ enableStopwords?: boolean + /** 词库类型:default=内置简体中文, zh-TW=繁体中文 */ + dictType?: DictType } /** 词性标签信息 */ diff --git a/electron/main/worker/dbWorker.ts b/electron/main/worker/dbWorker.ts index 28bf09e5..381b26a4 100644 --- a/electron/main/worker/dbWorker.ts +++ b/electron/main/worker/dbWorker.ts @@ -72,10 +72,16 @@ import { getPosTags, } from './query' import { streamImport, streamParseFileInfo, analyzeIncrementalImport, incrementalImport } from './import' +import { initNlpDir } from '../nlp/segmenter' // 初始化数据库目录 initDbDir(workerData.dbDir, workerData.cacheDir) +// 初始化 NLP 词库目录 +if (workerData.nlpDir) { + initNlpDir(workerData.nlpDir) +} + // ==================== 分析结果缓存 ==================== const ANALYSIS_CACHE_PREFIX = 'analysis:' @@ -124,6 +130,7 @@ function buildAnalysisCacheKey(type: string, payload: any): string { if (payload.topN) parts.push(`n${payload.topN}`) if (payload.minLength) parts.push(`ml${payload.minLength}`) if (payload.posTags) parts.push(`pt${JSON.stringify(payload.posTags)}`) + if (payload.dictType && payload.dictType !== 'default') parts.push(`dt${payload.dictType}`) return parts.join(':') } diff --git a/electron/main/worker/query/nlp.ts b/electron/main/worker/query/nlp.ts index 2c016a86..a900278e 100644 --- a/electron/main/worker/query/nlp.ts +++ b/electron/main/worker/query/nlp.ts @@ -5,7 +5,7 @@ import { openDatabase, buildTimeFilter, type TimeFilter } from '../core' import { segment, batchSegmentWithFrequency, getPosTagDefinitions, collectPosTagStats } from '../../nlp' -import type { SupportedLocale, WordFrequencyResult, WordFrequencyParams, PosTagInfo, PosTagStat } from '../../nlp' +import type { SupportedLocale, WordFrequencyResult, WordFrequencyParams, PosTagInfo, PosTagStat, DictType } from '../../nlp' /** * 获取词频统计 @@ -23,6 +23,7 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu posFilterMode = 'meaningful', customPosTags, enableStopwords = true, + dictType = 'default', } = params const db = openDatabase(sessionId) @@ -35,14 +36,12 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu } } - // 构建时间和成员过滤 const filter: TimeFilter = { ...timeFilter, memberId, } const { clause, params: filterParams } = buildTimeFilter(filter, 'msg') - // 构建 WHERE 子句,排除系统消息 let whereClause = clause if (whereClause.includes('WHERE')) { whereClause += @@ -52,7 +51,6 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu " WHERE COALESCE(m.account_name, '') != '系统消息' AND msg.type = 0 AND msg.content IS NOT NULL AND TRIM(msg.content) != ''" } - // 查询消息内容 const messages = db .prepare( ` @@ -64,7 +62,6 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu ) .all(...filterParams) as Array<{ content: string }> - // 如果没有消息,返回空结果 if (messages.length === 0) { return { words: [], @@ -74,18 +71,14 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu } } - // 提取文本内容 const texts = messages.map((m) => m.content) - // 收集词性统计(用于显示每个词性有多少词,仅中文有效) let posTagStats: PosTagStat[] | undefined - // 词性统计只对中文生效,这里先做类型兜底,避免异常 locale 直接触发 startsWith 报错。 if (typeof locale === 'string' && locale.startsWith('zh')) { - const posStatsMap = collectPosTagStats(texts, minWordLength ?? 2, enableStopwords) + const posStatsMap = collectPosTagStats(texts, minWordLength ?? 2, enableStopwords, dictType as DictType) posTagStats = [...posStatsMap.entries()].map(([tag, count]) => ({ tag, count })) } - // 批量分词并统计词频 const wordFrequency = batchSegmentWithFrequency(texts, locale as SupportedLocale, { minLength: minWordLength, minCount, @@ -93,15 +86,14 @@ export function getWordFrequency(params: WordFrequencyParams): WordFrequencyResu posFilterMode, customPosTags, enableStopwords, + dictType: dictType as DictType, }) - // 计算总词数(用于百分比) let totalWords = 0 for (const count of wordFrequency.values()) { totalWords += count } - // 构建结果 const words = [...wordFrequency.entries()].map(([word, count]) => ({ word, count, diff --git a/electron/main/worker/workerManager.ts b/electron/main/worker/workerManager.ts index fff85eeb..181e0840 100644 --- a/electron/main/worker/workerManager.ts +++ b/electron/main/worker/workerManager.ts @@ -10,6 +10,7 @@ import type { ParseProgress } from '../parser' import type { StreamImportResult } from './import' import { openDatabase } from '../database/core' import { getDatabaseDir, getCacheDir, ensureDir } from '../paths' +import { getNlpDir } from '../nlp/dictManager' // Worker 实例 let worker: Worker | null = null @@ -70,6 +71,7 @@ export function initWorker(): void { workerData: { dbDir: getDbDir(), cacheDir: getCacheDir(), + nlpDir: getNlpDir(), }, }) diff --git a/electron/preload/apis/utils.ts b/electron/preload/apis/utils.ts index edef559a..18894988 100644 --- a/electron/preload/apis/utils.ts +++ b/electron/preload/apis/utils.ts @@ -94,26 +94,33 @@ export interface ChatSessionItem { // ==================== NLP API ==================== export const nlpApi = { - /** - * 获取词频统计(用于词云) - */ getWordFrequency: (params: WordFrequencyParams): Promise => { return ipcRenderer.invoke('nlp:getWordFrequency', params) }, - /** - * 单文本分词 - */ segmentText: (text: string, locale: 'zh-CN' | 'en-US', minLength?: number): Promise => { return ipcRenderer.invoke('nlp:segmentText', text, locale, minLength) }, - /** - * 获取词性标签定义 - */ getPosTags: (): Promise => { return ipcRenderer.invoke('nlp:getPosTags') }, + + getDictList: (): Promise> => { + return ipcRenderer.invoke('nlp:getDictList') + }, + + isDictDownloaded: (dictId: string): Promise => { + return ipcRenderer.invoke('nlp:isDictDownloaded', dictId) + }, + + downloadDict: (dictId: string): Promise<{ success: boolean; error?: string }> => { + return ipcRenderer.invoke('nlp:downloadDict', dictId) + }, + + deleteDict: (dictId: string): Promise<{ success: boolean; error?: string }> => { + return ipcRenderer.invoke('nlp:deleteDict', dictId) + }, } // ==================== Network API ==================== diff --git a/electron/preload/index.d.ts b/electron/preload/index.d.ts index dd55fa46..0dae2627 100644 --- a/electron/preload/index.d.ts +++ b/electron/preload/index.d.ts @@ -923,6 +923,16 @@ interface WordFrequencyResult { posTagStats?: PosTagStat[] } +type DictType = 'default' | 'zh-CN' | 'zh-TW' + +interface DictInfo { + id: string + label: string + locale: string + downloaded: boolean + fileSize?: number +} + interface WordFrequencyParams { sessionId: string locale: SupportedLocale @@ -937,6 +947,8 @@ interface WordFrequencyParams { customPosTags?: string[] /** 是否启用停用词过滤,默认 true */ enableStopwords?: boolean + /** 词库类型:default=内置简体中文, zh-TW=繁体中文 */ + dictType?: DictType } /** 词性标签信息 */ @@ -951,6 +963,10 @@ interface NlpApi { getWordFrequency: (params: WordFrequencyParams) => Promise segmentText: (text: string, locale: SupportedLocale, minLength?: number) => Promise getPosTags: () => Promise + getDictList: () => Promise + isDictDownloaded: (dictId: string) => Promise + downloadDict: (dictId: string) => Promise<{ success: boolean; error?: string }> + deleteDict: (dictId: string) => Promise<{ success: boolean; error?: string }> } // ChatLab API 服务类型 diff --git a/src/components/analysis/quotes/WordcloudTab.vue b/src/components/analysis/quotes/WordcloudTab.vue index 6b3c1549..10dee2b1 100644 --- a/src/components/analysis/quotes/WordcloudTab.vue +++ b/src/components/analysis/quotes/WordcloudTab.vue @@ -7,10 +7,12 @@ import { LoadingState, EmptyState, UITabs } from '@/components/UI' import UserSelect from '@/components/common/UserSelect.vue' import { useSettingsStore } from '@/stores/settings' import { useLayoutStore } from '@/stores/layout' +import { useToast } from '@/composables/useToast' const { t } = useI18n() const settingsStore = useSettingsStore() const layoutStore = useLayoutStore() +const toast = useToast() interface TimeFilter { startTs?: number @@ -25,6 +27,7 @@ interface PosTagInfo { } type PosFilterMode = 'all' | 'meaningful' | 'custom' +type DictType = 'default' | 'zh-CN' | 'zh-TW' const props = defineProps<{ sessionId: string @@ -32,7 +35,6 @@ const props = defineProps<{ memberId?: number | null }>() -// 状态 const isLoading = ref(false) const wordcloudData = ref({ words: [] }) const stats = ref({ @@ -69,7 +71,90 @@ const posTagStats = ref>(new Map()) const selectedMemberId = ref(null) // 获取当前语言设置 +// ==================== 词库切换 ==================== +const selectedDictType = ref('default') +const dictList = ref>([]) +const isDictDownloading = ref(false) +const downloadingDictId = ref(null) +const showDictPromptModal = ref(false) +const DICT_PROMPT_DISMISSED_KEY = 'chatlab_zhTW_dict_prompt_dismissed' + const locale = computed(() => settingsStore.locale as 'zh-CN' | 'en-US') +const isTraditionalChinese = computed(() => settingsStore.locale === 'zh-TW') + +const dictOptions = computed(() => { + return dictList.value + .filter((d) => d.downloaded) + .map((d) => ({ + label: t(`quotes.wordcloud.dict.${d.id === 'zh-CN' ? 'zhCN' : d.id === 'zh-TW' ? 'zhTW' : d.id}`), + value: d.id as DictType, + })) +}) + +const hasAnyDict = computed(() => { + return dictList.value.some((d) => d.downloaded) +}) + +const undownloadedDicts = computed(() => { + return dictList.value.filter((d) => !d.downloaded) +}) + +async function refreshDictList() { + try { + dictList.value = await window.nlpApi.getDictList() + // 繁体中文用户自动切换到 zh-TW(如已下载) + if (isTraditionalChinese.value && selectedDictType.value === 'default') { + const zhTW = dictList.value.find((d) => d.id === 'zh-TW') + if (zhTW?.downloaded) { + selectedDictType.value = 'zh-TW' + } + } + // 如果 zh-CN 已下载,default 应该用 zh-CN + const zhCN = dictList.value.find((d) => d.id === 'zh-CN') + if (zhCN?.downloaded && selectedDictType.value === 'default') { + selectedDictType.value = 'zh-CN' + } + } catch (error) { + console.error('Failed to get dict list:', error) + } +} + +async function handleDownloadDict(dictId: string) { + isDictDownloading.value = true + downloadingDictId.value = dictId + try { + const result = await window.nlpApi.downloadDict(dictId) + if (result.success) { + await refreshDictList() + selectedDictType.value = dictId as DictType + toast.success(t('quotes.wordcloud.dict.downloadSuccess')) + } else { + toast.fail(t('quotes.wordcloud.dict.downloadFailed'), { description: result.error }) + } + } catch (error) { + toast.fail(t('quotes.wordcloud.dict.downloadFailed')) + } finally { + isDictDownloading.value = false + downloadingDictId.value = null + showDictPromptModal.value = false + } +} + +function dismissDictPrompt() { + showDictPromptModal.value = false + localStorage.setItem(DICT_PROMPT_DISMISSED_KEY, 'true') +} + +function maybeShowDictPrompt() { + const zhTW = dictList.value.find((d) => d.id === 'zh-TW') + if ( + isTraditionalChinese.value && + zhTW && !zhTW.downloaded && + !localStorage.getItem(DICT_PROMPT_DISMISSED_KEY) + ) { + showDictPromptModal.value = true + } +} // 词性过滤模式选项 const posFilterModeOptions = computed(() => [ @@ -120,7 +205,7 @@ async function loadPosTagDefinitions() { // 加载词频数据 async function loadWordFrequency() { - if (!props.sessionId) return + if (!props.sessionId || !hasAnyDict.value) return isLoading.value = true try { @@ -134,6 +219,7 @@ async function loadWordFrequency() { posFilterMode: posFilterMode.value, customPosTags: posFilterMode.value === 'custom' ? [...customPosTags.value] : undefined, enableStopwords: enableStopwords.value, + dictType: selectedDictType.value, }) wordcloudData.value = { @@ -175,6 +261,7 @@ watch( maxWords.value, posFilterMode.value, enableStopwords.value, + selectedDictType.value, ], () => { loadWordFrequency() @@ -205,9 +292,10 @@ function handleWordClick(word: string) { }) } -// 初始化 -onMounted(() => { +onMounted(async () => { loadPosTagDefinitions() + await refreshDictList() + maybeShowDictPrompt() }) @@ -218,9 +306,34 @@ onMounted(() => {
+ +
+ +

+ {{ t('quotes.wordcloud.dict.needDownload') }} +

+
+ + {{ t(`quotes.wordcloud.dict.download_${dict.id}`, dict.label) }} + +
+
+ @@ -312,6 +425,32 @@ onMounted(() => {
+ +
+

+ {{ t('quotes.wordcloud.config.dict') }} +

+
+ +
+ + {{ t(`quotes.wordcloud.dict.download_${dict.id}`, t('quotes.wordcloud.dict.download')) }} + + + {{ t('quotes.wordcloud.dict.downloadHint') }} + +
+
+
+

@@ -331,7 +470,6 @@ onMounted(() => {

{{ t('quotes.wordcloud.posFilter.customHint') }}

-
{
-
{
+ + + + + diff --git a/src/i18n/locales/en-US/quotes.json b/src/i18n/locales/en-US/quotes.json index 80fbaf49..fae35c53 100644 --- a/src/i18n/locales/en-US/quotes.json +++ b/src/i18n/locales/en-US/quotes.json @@ -16,7 +16,8 @@ "sizeScale": "Font size", "userFilter": "User filter", "posFilter": "POS filter", - "enableStopwords": "Filter stopwords" + "enableStopwords": "Filter stopwords", + "dict": "Dictionary" }, "size": { "small": "S", @@ -36,6 +37,21 @@ "empty": { "title": "No word cloud data", "description": "Not enough text messages to generate a word cloud with current filters" + }, + "dict": { + "zhCN": "Simplified Chinese", + "zhTW": "Traditional Chinese", + "download": "Download dictionary", + "download_zh-CN": "Download Simplified Chinese dict", + "download_zh-TW": "Download Traditional Chinese dict", + "needDownload": "Please download a dictionary to use word cloud analysis", + "downloadHint": "~8MB", + "downloadSuccess": "Dictionary downloaded successfully", + "downloadFailed": "Dictionary download failed", + "promptTitle": "Traditional Chinese Dictionary", + "promptDescription": "We detected you're using Traditional Chinese. For best word cloud results, a Traditional Chinese dictionary is recommended. Download now? (~8MB)", + "promptDownload": "Download now", + "promptLater": "Maybe later" } }, "keywords": { diff --git a/src/i18n/locales/ja-JP/quotes.json b/src/i18n/locales/ja-JP/quotes.json index 13ee2c00..d8536a2d 100644 --- a/src/i18n/locales/ja-JP/quotes.json +++ b/src/i18n/locales/ja-JP/quotes.json @@ -16,7 +16,8 @@ "sizeScale": "フォントサイズ", "userFilter": "ユーザーフィルター", "posFilter": "品詞フィルター", - "enableStopwords": "ストップワードを除外" + "enableStopwords": "ストップワードを除外", + "dict": "辞書" }, "size": { "small": "小", @@ -36,6 +37,21 @@ "empty": { "title": "ワードクラウドデータがありません", "description": "現在の絞り込み条件では、ワードクラウドを作れるほどテキストメッセージがありません" + }, + "dict": { + "zhCN": "簡体字中国語", + "zhTW": "繁体字中国語", + "download": "辞書をダウンロード", + "download_zh-CN": "簡体字辞書をダウンロード", + "download_zh-TW": "繁体字辞書をダウンロード", + "needDownload": "ワードクラウドを使用するには辞書のダウンロードが必要です", + "downloadHint": "~8MB", + "downloadSuccess": "辞書のダウンロードが完了しました", + "downloadFailed": "辞書のダウンロードに失敗しました", + "promptTitle": "繁体字中国語辞書", + "promptDescription": "繁体字中国語をご使用中です。ワードクラウドの精度を上げるため、繁体字辞書のダウンロードをお勧めします。今すぐダウンロードしますか?(約8MB)", + "promptDownload": "今すぐダウンロード", + "promptLater": "後で" } }, "keywords": { diff --git a/src/i18n/locales/zh-CN/quotes.json b/src/i18n/locales/zh-CN/quotes.json index 3fde23ea..d6546061 100644 --- a/src/i18n/locales/zh-CN/quotes.json +++ b/src/i18n/locales/zh-CN/quotes.json @@ -16,7 +16,8 @@ "sizeScale": "字体大小", "userFilter": "用户筛选", "posFilter": "词性过滤", - "enableStopwords": "过滤停用词" + "enableStopwords": "过滤停用词", + "dict": "分词词库" }, "size": { "small": "小", @@ -36,6 +37,21 @@ "empty": { "title": "暂无词云数据", "description": "当前筛选条件下没有足够的文本消息用于生成词云" + }, + "dict": { + "zhCN": "简体中文", + "zhTW": "繁体中文", + "download": "下载词库", + "download_zh-CN": "下载简体词库", + "download_zh-TW": "下载繁体词库", + "needDownload": "使用词云分析需要先下载分词词库", + "downloadHint": "~8MB", + "downloadSuccess": "词库下载成功", + "downloadFailed": "词库下载失败", + "promptTitle": "繁体中文词库", + "promptDescription": "检测到您使用繁体中文,词云分析需要繁体词库才能获得最佳效果。是否立即下载?(约 8MB)", + "promptDownload": "立即下载", + "promptLater": "稍后再说" } }, "keywords": { diff --git a/src/i18n/locales/zh-TW/quotes.json b/src/i18n/locales/zh-TW/quotes.json index ede96b6b..949dd27e 100644 --- a/src/i18n/locales/zh-TW/quotes.json +++ b/src/i18n/locales/zh-TW/quotes.json @@ -16,7 +16,8 @@ "sizeScale": "字體大小", "userFilter": "使用者篩選", "posFilter": "詞性過濾", - "enableStopwords": "過濾停用詞" + "enableStopwords": "過濾停用詞", + "dict": "分詞詞庫" }, "size": { "small": "小", @@ -36,6 +37,21 @@ "empty": { "title": "暫無詞雲資料", "description": "目前篩選條件下的文字訊息不足,無法產生詞雲" + }, + "dict": { + "zhCN": "簡體中文", + "zhTW": "繁體中文", + "download": "下載詞庫", + "download_zh-CN": "下載簡體詞庫", + "download_zh-TW": "下載繁體詞庫", + "needDownload": "使用詞雲分析需要先下載分詞詞庫", + "downloadHint": "~8MB", + "downloadSuccess": "詞庫下載成功", + "downloadFailed": "詞庫下載失敗", + "promptTitle": "繁體中文詞庫", + "promptDescription": "偵測到您使用繁體中文,詞雲分析需要繁體詞庫才能取得最佳效果。是否立即下載?(約 8MB)", + "promptDownload": "立即下載", + "promptLater": "稍後再說" } }, "keywords": {