feat: 尝试增加一下聊天里面的语音转文字功能

2026-03-22 07:32:29 +08:00 · 2026-01-17 05:14:14 +08:00
parent 095c8f0db6
commit 72e2d82158
18 changed files with 999 additions and 66 deletions
--- a/electron/main.ts
+++ b/electron/main.ts
@@ -15,6 +15,7 @@ import { groupAnalyticsService } from './services/groupAnalyticsService'
 import { annualReportService } from './services/annualReportService'
 import { exportService, ExportOptions } from './services/exportService'
 import { KeyService } from './services/keyService'
+import { voiceTranscribeService } from './services/voiceTranscribeService'


 // 配置自动更新
@@ -442,6 +443,10 @@ function registerIpcHandlers() {
    return chatService.getVoiceData(sessionId, msgId)
  })

+  ipcMain.handle('chat:getVoiceTranscript', async (_, sessionId: string, msgId: string) => {
+    return chatService.getVoiceTranscript(sessionId, msgId)
+  })
+
  ipcMain.handle('chat:getMessageById', async (_, sessionId: string, localId: number) => {
    return chatService.getMessageById(sessionId, localId)
  })
@@ -516,6 +521,16 @@ function registerIpcHandlers() {
    return { success: true }
  })

+  // Downloads a Whisper model and streams progress events back to the requesting renderer.
+  ipcMain.handle('whisper:downloadModel', async (event, payload: { modelName: string; downloadDir?: string; source?: string }) => {
+    return voiceTranscribeService.downloadModel(payload, (progress) => {
+      // The requesting window can be closed while the download is still running;
+      // sending to a destroyed webContents throws, so drop the update instead.
+      if (event.sender.isDestroyed()) return
+      event.sender.send('whisper:downloadProgress', progress)
+    })
+  })
+
+  ipcMain.handle('whisper:getModelStatus', async (_, payload: { modelName: string; downloadDir?: string }) => {
+    return voiceTranscribeService.getModelStatus(payload)
+  })
+
  // 群聊分析相关
  ipcMain.handle('groupAnalytics:getGroupChats', async () => {
    return groupAnalyticsService.getGroupChats()
--- a/electron/preload.ts
+++ b/electron/preload.ts
@@ -106,7 +106,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
    close: () => ipcRenderer.invoke('chat:close'),
    getSessionDetail: (sessionId: string) => ipcRenderer.invoke('chat:getSessionDetail', sessionId),
    getImageData: (sessionId: string, msgId: string) => ipcRenderer.invoke('chat:getImageData', sessionId, msgId),
-    getVoiceData: (sessionId: string, msgId: string) => ipcRenderer.invoke('chat:getVoiceData', sessionId, msgId)
+    getVoiceData: (sessionId: string, msgId: string) => ipcRenderer.invoke('chat:getVoiceData', sessionId, msgId),
+    getVoiceTranscript: (sessionId: string, msgId: string) => ipcRenderer.invoke('chat:getVoiceTranscript', sessionId, msgId)
  },


@@ -174,5 +175,16 @@ contextBridge.exposeInMainWorld('electronAPI', {
      ipcRenderer.invoke('export:exportSessions', sessionIds, outputDir, options),
    exportSession: (sessionId: string, outputPath: string, options: any) =>
      ipcRenderer.invoke('export:exportSession', sessionId, outputPath, options)
+  },
+
+  whisper: {
+    // Forwards to the 'whisper:downloadModel' IPC handler.
+    downloadModel: (payload: { modelName: string; downloadDir?: string; source?: string }) =>
+      ipcRenderer.invoke('whisper:downloadModel', payload),
+    // Forwards to the 'whisper:getModelStatus' IPC handler.
+    getModelStatus: (payload: { modelName: string; downloadDir?: string }) =>
+      ipcRenderer.invoke('whisper:getModelStatus', payload),
+    // Subscribes to 'whisper:downloadProgress' events. The returned function removes only
+    // this subscription, so other subscribers on the same channel keep receiving updates.
+    onDownloadProgress: (callback: (payload: { modelName: string; downloadedBytes: number; totalBytes?: number; percent?: number }) => void) => {
+      const listener = (
+        _event: unknown,
+        payload: { modelName: string; downloadedBytes: number; totalBytes?: number; percent?: number }
+      ) => callback(payload)
+      ipcRenderer.on('whisper:downloadProgress', listener)
+      return () => {
+        ipcRenderer.removeListener('whisper:downloadProgress', listener)
+      }
+    }
  }
 })
--- a/electron/services/chatService.ts
+++ b/electron/services/chatService.ts
@@ -16,6 +16,7 @@ import { ConfigService } from './config'
 import { wcdbService } from './wcdbService'
 import { MessageCacheService } from './messageCacheService'
 import { ContactCacheService, ContactCacheEntry } from './contactCacheService'
+import { voiceTranscribeService } from './voiceTranscribeService'

 type HardlinkState = {
  db: Database.Database
@@ -83,6 +84,10 @@ class ChatService {
  private hardlinkCache = new Map<string, HardlinkState>()
  private readonly contactCacheService: ContactCacheService
  private readonly messageCacheService: MessageCacheService
+  private voiceWavCache = new Map<string, Buffer>()
+  private voiceTranscriptCache = new Map<string, string>()
+  private voiceTranscriptPending = new Map<string, Promise<{ success: boolean; transcript?: string; error?: string }>>()
+  private readonly voiceCacheMaxEntries = 50

  constructor() {
    this.configService = new ConfigService()
@@ -1738,6 +1743,9 @@ class ChatService {

    if (includeMessages) {
      this.messageCacheService.clear()
+      this.voiceWavCache.clear()
+      this.voiceTranscriptCache.clear()
+      this.voiceTranscriptPending.clear()
    }

    for (const state of this.hardlinkCache.values()) {
@@ -2263,6 +2271,8 @@ class ChatService {
        const pcmData = readFileSync(pcmFile)
        const wavHeader = this.createWavHeader(pcmData.length, 24000, 1) // 微信语音通常 24kHz
        const wavData = Buffer.concat([wavHeader, pcmData])
+        const cacheKey = this.getVoiceCacheKey(sessionId, msgId)
+        this.cacheVoiceWav(cacheKey, wavData)

        return { success: true, data: wavData.toString('base64') }
      } finally {
@@ -2276,6 +2286,45 @@ class ChatService {
    }
  }

+  /**
+   * Returns the transcript of one voice message, transcribing it on first request.
+   *
+   * Lookup order: the in-memory transcript cache, then a transcription already in
+   * flight for the same message (its promise is shared), then a new transcription.
+   * The WAV buffer is taken from `voiceWavCache` when present, otherwise it is decoded
+   * through `getVoiceData`. Only successful, non-empty transcripts are cached.
+   *
+   * @param sessionId Session the voice message belongs to.
+   * @param msgId Identifier of the voice message.
+   * @returns `{ success: true, transcript }` or `{ success: false, error }`.
+   */
+  async getVoiceTranscript(sessionId: string, msgId: string): Promise<{ success: boolean; transcript?: string; error?: string }> {
+    const key = this.getVoiceCacheKey(sessionId, msgId)
+
+    const knownTranscript = this.voiceTranscriptCache.get(key)
+    if (knownTranscript) return { success: true, transcript: knownTranscript }
+
+    const inFlight = this.voiceTranscriptPending.get(key)
+    if (inFlight) return inFlight
+
+    const transcribe = async (): Promise<{ success: boolean; transcript?: string; error?: string }> => {
+      try {
+        let wav = this.voiceWavCache.get(key)
+        if (!wav) {
+          // Not decoded yet in this process: decode now and reuse the base64 payload.
+          const decoded = await this.getVoiceData(sessionId, msgId)
+          if (!decoded.success || !decoded.data) {
+            return { success: false, error: decoded.error || '语音解码失败' }
+          }
+          wav = Buffer.from(decoded.data, 'base64')
+        }
+
+        const outcome = await voiceTranscribeService.transcribeWavBuffer(wav)
+        if (outcome.success && outcome.transcript) {
+          this.cacheVoiceTranscript(key, outcome.transcript)
+        }
+        return outcome
+      } catch (error) {
+        return { success: false, error: String(error) }
+      } finally {
+        // Whatever the outcome, later requests must not join a finished task.
+        this.voiceTranscriptPending.delete(key)
+      }
+    }
+
+    const job = transcribe()
+    this.voiceTranscriptPending.set(key, job)
+    return job
+  }
+
  private createWavHeader(pcmLength: number, sampleRate: number = 24000, channels: number = 1): Buffer {
    const header = Buffer.alloc(44)
    header.write('RIFF', 0)
@@ -2294,6 +2343,26 @@ class ChatService {
    return header
  }

+  /** Builds the key shared by the voice WAV, transcript and pending-task maps: `sessionId:msgId`. */
+  private getVoiceCacheKey(sessionId: string, msgId: string): string {
+    return [sessionId, msgId].join(':')
+  }
+
+  /**
+   * Stores a decoded WAV buffer, keeping at most `voiceCacheMaxEntries` entries.
+   *
+   * @param cacheKey Key produced by `getVoiceCacheKey`.
+   * @param wavData Complete WAV file contents (header plus PCM data).
+   */
+  private cacheVoiceWav(cacheKey: string, wavData: Buffer): void {
+    // A Map iterates in first-insertion order and `set` on an existing key keeps the old
+    // position, so remove the key first: a buffer that was just decoded again then counts
+    // as the newest entry instead of being the next one evicted.
+    this.voiceWavCache.delete(cacheKey)
+    this.voiceWavCache.set(cacheKey, wavData)
+    if (this.voiceWavCache.size > this.voiceCacheMaxEntries) {
+      const oldestKey = this.voiceWavCache.keys().next().value
+      if (oldestKey !== undefined) this.voiceWavCache.delete(oldestKey)
+    }
+  }
+
+  /**
+   * Remembers a transcript and, once the cache holds more than `voiceCacheMaxEntries`
+   * entries, drops the entry that was inserted first.
+   */
+  private cacheVoiceTranscript(cacheKey: string, transcript: string): void {
+    const transcripts = this.voiceTranscriptCache
+    transcripts.set(cacheKey, transcript)
+    if (transcripts.size <= this.voiceCacheMaxEntries) return
+
+    const firstInserted = transcripts.keys().next().value
+    if (firstInserted) transcripts.delete(firstInserted)
+  }
+
  async getMessageById(sessionId: string, localId: number): Promise<{ success: boolean; message?: Message; error?: string }> {
    try {
      console.info('[ChatService] getMessageById (SQL)', { sessionId, localId })
--- a/electron/services/config.ts
+++ b/electron/services/config.ts
@@ -20,6 +20,9 @@ interface ConfigSchema {
  language: string
  logEnabled: boolean
  llmModelPath: string
+  whisperModelName: string
+  whisperModelDir: string
+  whisperDownloadSource: string
 }

 export class ConfigService {
@@ -42,7 +45,10 @@ export class ConfigService {
        themeId: 'cloud-dancer',
        language: 'zh-CN',
        logEnabled: false,
-        llmModelPath: ''
+        llmModelPath: '',
+        whisperModelName: 'base',
+        whisperModelDir: '',
+        whisperDownloadSource: 'tsinghua'
      }
    })
  }
--- a/electron/services/voiceTranscribeService.ts
+++ b/electron/services/voiceTranscribeService.ts
@@ -0,0 +1,281 @@
+import { app } from 'electron'
+import { createWriteStream, existsSync, mkdirSync, readFileSync, renameSync, statSync, unlinkSync, writeFileSync } from 'fs'
+import { join, dirname } from 'path'
+import { promisify } from 'util'
+import { execFile, spawnSync } from 'child_process'
+import * as https from 'https'
+import * as http from 'http'
+import { ConfigService } from './config'
+
+const execFileAsync = promisify(execFile)
+
+/** Catalogue entry for one downloadable ggml Whisper model. */
+interface WhisperModelInfo {
+  name: string
+  /** File name used both in the download URL and on disk. */
+  fileName: string
+  /** Size as shown to the user, e.g. "142 MB". */
+  sizeLabel: string
+  /** The same size expressed in bytes (a rounded figure, not an exact file size). */
+  sizeBytes?: number
+}
+
+/** Progress snapshot handed to download listeners. */
+interface DownloadProgress {
+  modelName: string
+  downloadedBytes: number
+  totalBytes?: number
+  percent?: number
+}
+
+/** Models that can be selected and downloaded, keyed by model name. */
+const WHISPER_MODELS: Record<string, WhisperModelInfo> = {
+  tiny: {
+    name: 'tiny',
+    fileName: 'ggml-tiny.bin',
+    sizeLabel: '75 MB',
+    sizeBytes: 75_000_000
+  },
+  base: {
+    name: 'base',
+    fileName: 'ggml-base.bin',
+    sizeLabel: '142 MB',
+    sizeBytes: 142_000_000
+  },
+  small: {
+    name: 'small',
+    fileName: 'ggml-small.bin',
+    sizeLabel: '466 MB',
+    sizeBytes: 466_000_000
+  },
+  medium: {
+    name: 'medium',
+    fileName: 'ggml-medium.bin',
+    sizeLabel: '1.5 GB',
+    sizeBytes: 1_500_000_000
+  },
+  'large-v3': {
+    name: 'large-v3',
+    fileName: 'ggml-large-v3.bin',
+    sizeLabel: '2.9 GB',
+    sizeBytes: 2_900_000_000
+  }
+}
+
+/**
+ * Base URLs the model files are fetched from, keyed by source id.
+ * Note that the `tsinghua` id resolves to hf-mirror.com.
+ */
+const WHISPER_SOURCES: Record<string, string> = {
+  official: 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main',
+  tsinghua: 'https://hf-mirror.com/ggerganov/whisper.cpp/resolve/main'
+}
+
+/**
+ * Locates the ffmpeg binary shipped through the `ffmpeg-static` package.
+ *
+ * Candidates, in order: the path exported by `ffmpeg-static`, the package folder under
+ * the current working directory, and (packaged builds only) the `app.asar.unpacked`
+ * copy under `process.resourcesPath`.
+ *
+ * @returns The first candidate that exists on disk, or null when none does.
+ */
+function getStaticFfmpegPath(): string | null {
+  const candidates: string[] = []
+
+  try {
+    // eslint-disable-next-line @typescript-eslint/no-var-requires
+    const ffmpegStatic = require('ffmpeg-static')
+    if (typeof ffmpegStatic === 'string') candidates.push(ffmpegStatic)
+  } catch {
+    // The module could not be loaded. Keep going: the hard-coded locations below exist
+    // precisely for that situation and must not be skipped because of it.
+  }
+
+  try {
+    candidates.push(join(process.cwd(), 'node_modules', 'ffmpeg-static', 'ffmpeg.exe'))
+    if (app.isPackaged) {
+      candidates.push(
+        join(process.resourcesPath, 'app.asar.unpacked', 'node_modules', 'ffmpeg-static', 'ffmpeg.exe')
+      )
+    }
+    return candidates.find((candidate) => existsSync(candidate)) ?? null
+  } catch {
+    return null
+  }
+}
+
+/**
+ * Local speech-to-text support for voice messages.
+ *
+ * Resolves and downloads ggml Whisper model files, and transcribes WAV buffers by
+ * resampling them with ffmpeg and running the bundled `whisper-cli.exe`.
+ * Public methods report expected failures as `{ success: false, error }` results.
+ */
+export class VoiceTranscribeService {
+  private configService = new ConfigService()
+  /** Downloads in progress, keyed by `modelName:modelPath`, so repeated requests share one transfer. */
+  private downloadTasks = new Map<string, Promise<{ success: boolean; path?: string; error?: string }>>()
+
+  /** Returns the catalogue entry for `modelName`, or null when the name is not a known model. */
+  private resolveModelInfo(modelName: string): WhisperModelInfo | null {
+    // Own-property check: the name arrives over IPC, and a plain index lookup would
+    // return inherited members for names such as "constructor" or "toString".
+    if (!Object.prototype.hasOwnProperty.call(WHISPER_MODELS, modelName)) return null
+    return WHISPER_MODELS[modelName] || null
+  }
+
+  /** Model directory: the explicit override, else the `whisperModelDir` setting, else `<userData>/models/whisper`. */
+  private resolveModelDir(overrideDir?: string): string {
+    const configured = overrideDir || this.configService.get('whisperModelDir')
+    if (configured) return configured
+    return join(app.getPath('userData'), 'models', 'whisper')
+  }
+
+  /** Full path of the model file for `modelName`, or null for an unknown model. */
+  private resolveModelPath(modelName: string, overrideDir?: string): string | null {
+    const info = this.resolveModelInfo(modelName)
+    if (!info) return null
+    return join(this.resolveModelDir(overrideDir), info.fileName)
+  }
+
+  /** Base URL of the requested (else configured) download source; unknown ids fall back to `official`. */
+  private resolveSourceUrl(overrideSource?: string): string {
+    const configured = overrideSource || this.configService.get('whisperDownloadSource')
+    const known = configured && Object.prototype.hasOwnProperty.call(WHISPER_SOURCES, configured)
+      ? WHISPER_SOURCES[configured]
+      : undefined
+    return known || WHISPER_SOURCES.official
+  }
+
+  /**
+   * Reports whether the file of a model is present on disk.
+   *
+   * @param payload `modelName` selects the model; `downloadDir` overrides the model directory.
+   * @returns `exists` and the resolved `path`, plus `sizeBytes` when the file exists;
+   *          `success: false` for an unknown model or when the file cannot be inspected.
+   */
+  async getModelStatus(payload: { modelName: string; downloadDir?: string }): Promise<{
+    success: boolean
+    exists?: boolean
+    path?: string
+    sizeBytes?: number
+    error?: string
+  }> {
+    const modelPath = this.resolveModelPath(payload.modelName, payload.downloadDir)
+    if (!modelPath) {
+      return { success: false, error: '未知模型名称' }
+    }
+    if (!existsSync(modelPath)) {
+      return { success: true, exists: false, path: modelPath }
+    }
+    try {
+      const sizeBytes = statSync(modelPath).size
+      return { success: true, exists: true, path: modelPath, sizeBytes }
+    } catch (error) {
+      // The file can vanish or be unreadable between the two checks.
+      return { success: false, error: String(error) }
+    }
+  }
+
+  /**
+   * Downloads a model file unless it is already present.
+   *
+   * A request for a model that is already being downloaded to the same path joins the
+   * running download; in that case the `onProgress` of the later request is not used.
+   *
+   * @param payload `modelName` selects the model; `downloadDir` and `source` override the
+   *                configured directory and download source.
+   * @param onProgress Called for every received chunk of the download.
+   * @returns `{ success: true, path }` or `{ success: false, error }`.
+   */
+  async downloadModel(
+    payload: { modelName: string; downloadDir?: string; source?: string },
+    onProgress?: (progress: DownloadProgress) => void
+  ): Promise<{ success: boolean; path?: string; error?: string }> {
+    const info = this.resolveModelInfo(payload.modelName)
+    if (!info) {
+      return { success: false, error: '未知模型名称' }
+    }
+
+    const modelPath = this.resolveModelPath(payload.modelName, payload.downloadDir)
+    if (!modelPath) {
+      return { success: false, error: '模型路径生成失败' }
+    }
+
+    if (existsSync(modelPath)) {
+      return { success: true, path: modelPath }
+    }
+
+    const cacheKey = `${payload.modelName}:${modelPath}`
+    const pending = this.downloadTasks.get(cacheKey)
+    if (pending) return pending
+
+    // Data is written to a side file and renamed only after a complete transfer, so an
+    // interrupted download (crash, power loss) can never leave a truncated file at
+    // `modelPath` that the existence checks would treat as a usable model.
+    const partialPath = `${modelPath}.download`
+
+    const task = (async () => {
+      try {
+        const targetDir = this.resolveModelDir(payload.downloadDir)
+        if (!existsSync(targetDir)) {
+          mkdirSync(targetDir, { recursive: true })
+        }
+
+        const baseUrl = this.resolveSourceUrl(payload.source)
+        const url = `${baseUrl}/${info.fileName}`
+        await this.downloadToFile(url, partialPath, payload.modelName, onProgress)
+        renameSync(partialPath, modelPath)
+        return { success: true, path: modelPath }
+      } catch (error) {
+        this.removeFileQuietly(partialPath)
+        return { success: false, error: String(error) }
+      } finally {
+        this.downloadTasks.delete(cacheKey)
+      }
+    })()
+
+    this.downloadTasks.set(cacheKey, task)
+    return task
+  }
+
+  /**
+   * Transcribes a WAV buffer with the configured Whisper model.
+   *
+   * The buffer is written to the temp directory, resampled to 16 kHz mono with ffmpeg
+   * and passed to `whisper-cli.exe` with language `zh`. All temporary files are removed
+   * afterwards.
+   *
+   * @param wavData Complete WAV file contents.
+   * @returns `{ success: true, transcript }` (the transcript may be empty) or
+   *          `{ success: false, error }`.
+   */
+  async transcribeWavBuffer(wavData: Buffer): Promise<{ success: boolean; transcript?: string; error?: string }> {
+    const modelName = this.configService.get('whisperModelName') || 'base'
+    const modelPath = this.resolveModelPath(modelName)
+    console.info('[VoiceTranscribe] check model', { modelName, modelPath, exists: modelPath ? existsSync(modelPath) : false })
+    if (!modelPath || !existsSync(modelPath)) {
+      return { success: false, error: '未下载语音模型，请在设置中下载' }
+    }
+
+    // Use the bundled, prebuilt whisper-cli.exe.
+    const resourcesPath = app.isPackaged
+      ? join(process.resourcesPath, 'resources')
+      : join(app.getAppPath(), 'resources')
+    const whisperExe = join(resourcesPath, 'whisper-cli.exe')
+
+    if (!existsSync(whisperExe)) {
+      return { success: false, error: '找不到语音转写程序，请重新安装应用' }
+    }
+
+    const ffmpegPath = getStaticFfmpegPath() || 'ffmpeg'
+    console.info('[VoiceTranscribe] ffmpeg path', ffmpegPath)
+
+    const tempDir = app.getPath('temp')
+    const fileToken = `${Date.now()}_${Math.random().toString(16).slice(2)}`
+    const inputPath = join(tempDir, `weflow_voice_${fileToken}.wav`)
+    const outputBase = join(tempDir, `weflow_voice_${fileToken}_16k`)
+    const outputPath = `${outputBase}.wav`
+    // `-of` below pins the text output to `<outputBase>.txt`. The second candidate covers
+    // a build that names the file after the full input name instead (assumption: not
+    // verified against the bundled binary); checking it also guarantees its cleanup.
+    const txtCandidates = [`${outputBase}.txt`, `${outputPath}.txt`]
+
+    try {
+      writeFileSync(inputPath, wavData)
+      console.info('[VoiceTranscribe] converting to 16kHz', { inputPath, outputPath })
+      await execFileAsync(ffmpegPath, ['-y', '-i', inputPath, '-ar', '16000', '-ac', '1', outputPath])
+
+      console.info('[VoiceTranscribe] transcribing with whisper', { whisperExe, modelPath })
+      const { stdout } = await execFileAsync(whisperExe, [
+        '-m', modelPath,
+        '-f', outputPath,
+        '-l', 'zh',
+        '-otxt',
+        '-of', outputBase
+      ], {
+        maxBuffer: 10 * 1024 * 1024,
+        cwd: tempDir
+      })
+
+      // Prefer the generated text file; fall back to parsing the console output.
+      const writtenTxt = txtCandidates.find((candidate) => existsSync(candidate))
+      const transcript = writtenTxt
+        ? readFileSync(writtenTxt, 'utf-8').trim()
+        : this.extractTranscriptFromStdout(stdout)
+
+      // Log the length only: the transcript is private chat content.
+      console.info('[VoiceTranscribe] success', { length: transcript.length })
+      return { success: true, transcript }
+    } catch (error) {
+      console.error('[VoiceTranscribe] failed', error)
+      return { success: false, error: String(error) }
+    } finally {
+      for (const tempFile of [inputPath, outputPath, ...txtCandidates]) {
+        this.removeFileQuietly(tempFile)
+      }
+    }
+  }
+
+  /**
+   * Extracts the spoken text from whisper-cli console output.
+   * A leading `[start --> end]` timestamp is stripped from each line (the text follows
+   * it on the same line); remaining bracketed lines and `whisper_` log lines are dropped.
+   */
+  private extractTranscriptFromStdout(stdout: string): string {
+    return stdout
+      .split('\n')
+      .map((line) => line.trim().replace(/^\[[^\]]*-->[^\]]*\]\s*/, ''))
+      .filter((line) => line && !line.startsWith('[') && !line.startsWith('whisper_'))
+      .join(' ')
+      .trim()
+  }
+
+  /** Best-effort removal of a temporary file; a failure here must not mask the caller's result. */
+  private removeFileQuietly(filePath: string): void {
+    try { if (existsSync(filePath)) unlinkSync(filePath) } catch { }
+  }
+
+  /**
+   * Streams `url` into `targetPath`, following up to `remainingRedirects` redirects.
+   *
+   * Resolves once the file has been completely written and its handle closed; rejects
+   * on a non-200 status, too many redirects, a stream error or a connection that ends
+   * before the whole response was received. The target file is not removed on failure.
+   *
+   * @param url Absolute http(s) URL to fetch.
+   * @param targetPath File to create or overwrite.
+   * @param modelName Echoed in every progress event.
+   * @param onProgress Called for every received chunk; errors it throws are ignored.
+   * @param remainingRedirects Redirect budget left for this request chain.
+   */
+  private downloadToFile(
+    url: string,
+    targetPath: string,
+    modelName: string,
+    onProgress?: (progress: DownloadProgress) => void,
+    remainingRedirects = 3
+  ): Promise<void> {
+    return new Promise((resolve, reject) => {
+      const protocol = url.startsWith('https') ? https : http
+      const request = protocol.get(url, (response) => {
+        const location = response.headers.location
+        if ([301, 302, 303, 307, 308].includes(response.statusCode || 0) && location) {
+          // Discard the redirect body so the connection is released.
+          response.resume()
+          if (remainingRedirects <= 0) {
+            reject(new Error('下载重定向次数过多'))
+            return
+          }
+          let nextUrl: string
+          try {
+            // Location may be relative; resolve it against the URL that was requested.
+            nextUrl = new URL(location, url).toString()
+          } catch (error) {
+            reject(error)
+            return
+          }
+          this.downloadToFile(nextUrl, targetPath, modelName, onProgress, remainingRedirects - 1)
+            .then(resolve, reject)
+          return
+        }
+
+        if (response.statusCode !== 200) {
+          response.resume()
+          reject(new Error(`下载失败: ${response.statusCode}`))
+          return
+        }
+
+        const totalBytes = Number(response.headers['content-length'] || 0) || undefined
+        let downloadedBytes = 0
+        let failure: Error | undefined
+
+        const writer = createWriteStream(targetPath)
+
+        // Record the first failure and tear both streams down; the promise itself is
+        // settled from the writer's 'close' event below.
+        const abort = (error: Error) => {
+          if (failure) return
+          failure = error
+          response.destroy()
+          writer.destroy()
+        }
+
+        response.on('data', (chunk) => {
+          downloadedBytes += chunk.length
+          const percent = totalBytes ? (downloadedBytes / totalBytes) * 100 : undefined
+          try {
+            onProgress?.({ modelName, downloadedBytes, totalBytes, percent })
+          } catch {
+            // Progress reporting is best-effort; a failing listener must not kill the download.
+          }
+        })
+
+        response.on('error', abort)
+        response.on('close', () => {
+          // The connection ended before the full body arrived: without this the writer
+          // would never be ended and the promise would stay pending.
+          if (!response.complete) abort(new Error('下载连接中断'))
+        })
+        writer.on('error', abort)
+
+        // Settle only after the file handle is released, so the caller can rename or
+        // delete the file right away.
+        writer.on('close', () => {
+          if (failure) {
+            reject(failure)
+            return
+          }
+          resolve()
+        })
+
+        response.pipe(writer)
+      })
+
+      request.on('error', reject)
+    })
+  }
+}
+
+/** Shared service instance used by the main process. */
+export const voiceTranscribeService = new VoiceTranscribeService()
--- a/electron/types/whisper-node.d.ts
+++ b/electron/types/whisper-node.d.ts
@@ -0,0 +1,22 @@
+/**
+ * Ambient typings for the `whisper-node` module.
+ * NOTE(review): the field meanings are not visible from this declaration alone; confirm
+ * them against the package before relying on them.
+ */
+declare module 'whisper-node' {
+  /** One element of the array the default export resolves to. */
+  export interface WhisperSegment {
+    start: string
+    end: string
+    speech: string
+  }
+
+  /** Optional second argument of the default export. */
+  export interface WhisperOptions {
+    modelName?: string
+    modelPath?: string
+    whisperOptions?: {
+      language?: string
+      gen_file_txt?: boolean
+      gen_file_subtitle?: boolean
+      gen_file_vtt?: boolean
+      word_timestamps?: boolean
+      timestamp_size?: number
+    }
+  }
+
+  /** Processes the file at `filePath` and resolves with its segments. */
+  export default function whisper(filePath: string, options?: WhisperOptions): Promise<WhisperSegment[]>
+}