add Xunfei Voice

Signed-off-by: njnuko <njnuko@163.com>
2026-03-18 12:40:06 +08:00 · 2024-05-20 15:04:23 +08:00
parent 99aac76618
commit 6fed719e09
5 changed files with 460 additions and 0 deletions
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -46,4 +46,8 @@ def create_voice(voice_type):
        from voice.edge.edge_voice import EdgeVoice
        return EdgeVoice()
    elif voice_type == "xunfei":
        from voice.xunfei.xunfei_voice import XunfeiVoice
        return XunfeiVoice()
    raise RuntimeError
--- a/voice/xunfei/config.json.template
+++ b/voice/xunfei/config.json.template
@@ -0,0 +1,7 @@
 {
  "APPID":"xxx71xxx",  #讯飞xfyun.cn控制台中应用的ID
  "APIKey":"xxxx69058exxxxxx",  #讯飞xfyun.cn控制台语音合成或者听写界面的APIKey
  "APISecret":"xxxx697f0xxxxxx",  #讯飞xfyun.cn控制台语音合成或者听写界面的APISecret
  "BusinessArgsTTS":{"aue": "lame", "sfl": 1, "auf": "audio/L16;rate=16000", "vcn": "xiaoyan", "tte": "utf8"}, #语音合成的参数，具体可以参考xfyun.cn的文档
  "BusinessArgsASR":{"domain": "iat", "language": "zh_cn", "accent": "mandarin", "vad_eos":10000, "dwa": "wpgs"}  #语音听写的参数，具体可以参考xfyun.cn的文档
 }
--- a/voice/xunfei/xunfei_asr.py
+++ b/voice/xunfei/xunfei_asr.py
@@ -0,0 +1,209 @@
 # -*- coding:utf-8 -*-
 #
 #  Author: njnuko 
 #  Email: njnuko@163.com 
 #
 #  这个文档是基于官方的demo来改的，具体官方demo文档请参考官网
 #
 #  语音听写流式 WebAPI 接口调用示例 接口文档（必看）：https://doc.xfyun.cn/rest_api/语音听写（流式版）.html
 #  webapi 听写服务参考帖子（必看）：http://bbs.xfyun.cn/forum.php?mod=viewthread&tid=38947&extra=
 #  语音听写流式WebAPI 服务，热词使用方式：登陆开放平台https://www.xfyun.cn/后，找到控制台--我的应用---语音听写（流式）---服务管理--个性化热词，
 #  设置热词
 #  注意：热词只能在识别的时候会增加热词的识别权重，需要注意的是增加相应词条的识别率，但并不是绝对的，具体效果以您测试为准。
 #  语音听写流式WebAPI 服务，方言试用方法：登陆开放平台https://www.xfyun.cn/后，找到控制台--我的应用---语音听写（流式）---服务管理--识别语种列表
 #  可添加语种或方言，添加后会显示该方言的参数值
 #  错误码链接：https://www.xfyun.cn/document/error-code （code返回错误码时必看）
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 import websocket
 import datetime
 import hashlib
 import base64
 import hmac
 import json
 from urllib.parse import urlencode
 import time
 import ssl
 from wsgiref.handlers import format_date_time
 from datetime import datetime
 from time import mktime
 import _thread as thread
 import os
 import wave
 STATUS_FIRST_FRAME = 0  # marker for the first audio frame
 STATUS_CONTINUE_FRAME = 1  # marker for an intermediate audio frame
 STATUS_LAST_FRAME = 2  # marker for the last audio frame
 #############
 # whole_dict stores the recognition results keyed by segment number. Because
 # dynamic correction is enabled, a dict is used: superseded entries are popped
 # when an update arrives, and the remaining values are joined at the end.
 # NOTE(review): a module-level `global` statement does not create the name;
 # whole_dict only exists once xunfei_asr() has assigned it.
 global whole_dict
 # This file is adapted from the official demo; wsParam carries the request
 # parameters from xunfei_asr() to the on_open callback.
 global wsParam
 ##############
 class Ws_Param(object):
    """Request parameters for the Xunfei streaming dictation (IAT) WebSocket API.

    Stores the application credentials, the business arguments and the audio
    file path, and builds the signed connection URL.
    """

    def __init__(self, APPID, APIKey, APISecret,BusinessArgs, AudioFile):
        """Keep the supplied values and derive the ``common`` request section."""
        self.APPID, self.APIKey, self.APISecret = APPID, APIKey, APISecret
        self.AudioFile = AudioFile
        self.BusinessArgs = BusinessArgs
        # "common" section sent with the first frame
        self.CommonArgs = {"app_id": APPID}

    def create_url(self):
        """Return the ``wss://`` URL including the HMAC-SHA256 authorization query.

        The signature covers the host, an RFC 1123 date built from the current
        time, and the request line; it is keyed with ``APISecret``.
        """
        host = "ws-api.xfyun.cn"
        # RFC 1123 formatted timestamp
        date = format_date_time(mktime(datetime.now().timetuple()))

        # Text that gets signed: host line, date line, request line
        signature_origin = "\n".join([
            "host: " + host,
            "date: " + date,
            "GET /v2/iat HTTP/1.1",
        ])
        digest = hmac.new(
            self.APISecret.encode('utf-8'),
            signature_origin.encode('utf-8'),
            digestmod=hashlib.sha256,
        ).digest()
        signature = base64.b64encode(digest).decode(encoding='utf-8')

        authorization_origin = 'api_key="{}", algorithm="{}", headers="{}", signature="{}"'.format(
            self.APIKey, "hmac-sha256", "host date request-line", signature)
        authorization = base64.b64encode(
            authorization_origin.encode('utf-8')).decode(encoding='utf-8')

        # Authentication parameters are passed in the query string
        query = urlencode({
            "authorization": authorization,
            "date": date,
            "host": host,
        })
        return 'wss://ws-api.xfyun.cn/v2/iat' + '?' + query
 # 收到websocket消息的处理
 def on_message(ws, message):
    """Handle one recognition message and merge its text into ``whole_dict``.

    ``message`` is a JSON document. A non-zero ``code`` is reported on stdout
    and nothing is stored. On success the words found under
    ``data.result.ws[*].cw[*].w`` are concatenated and stored under the
    segment number ``data.result.sn``. When the result carries an ``rg``
    range ``[start, end]``, the entries for those segment numbers are removed
    first, because the new text replaces them.

    Any exception raised while handling the message is caught and printed, so
    a malformed message never propagates to the caller.
    """
    global whole_dict
    try:
        payload = json.loads(message)  # parse once instead of once per field
        code = payload["code"]
        sid = payload["sid"]
        if code != 0:
            errMsg = payload["message"]
            print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
            return
        result = payload["data"]["result"]
        data = result["ws"]
        sn = result["sn"]
        if "rg" in result:
            rep_start, rep_end = result["rg"][0], result["rg"][1]
            # Use a dedicated loop variable: reusing ``sn`` here would store
            # the new text under the last replaced key instead of its own
            # segment number.
            for old_sn in range(rep_start, rep_end + 1):
                whole_dict.pop(old_sn, None)
        whole_dict[sn] = "".join(w["w"] for i in data for w in i["cw"])
        print("sid:%s call success!,data is:%s" % (sid, json.dumps(data, ensure_ascii=False)))
    except Exception as e:
        print("receive msg,but parse exception:", e)
 # 收到websocket错误的处理
 def on_error(ws, error):
    """Report a WebSocket error on stdout; ``ws`` itself is not used."""
    print("### error:", error)
 # 收到websocket关闭的处理
 def on_close(ws,a,b):
    """Print a marker when the WebSocket closes; all three arguments are ignored."""
    print("### closed ###")
 # 收到websocket连接建立的处理
 def on_open(ws):
    """Start a background thread that streams the audio file once connected.

    The thread reads ``wsParam.AudioFile`` with the ``wave`` module and sends
    it as JSON messages: the first message also carries the ``common`` and
    ``business`` sections, later ones only ``data``. An empty read marks the
    last frame. The connection is closed when streaming ends, including when
    reading or sending fails, so the caller's ``run_forever`` loop is not
    left waiting on a connection nobody will close.
    """
    global wsParam

    def run(*args):
        frameSize = 8000  # number of wave frames read per message
        intervel = 0.04  # pause between messages, in seconds
        status = STATUS_FIRST_FRAME  # first / intermediate / last frame marker
        try:
            with wave.open(wsParam.AudioFile, "rb") as fp:
                while True:
                    buf = fp.readframes(frameSize)
                    # End of file: this (empty) chunk is the last frame
                    if not buf:
                        status = STATUS_LAST_FRAME
                    frame = {"status": status, "format": "audio/L16;rate=16000",
                             "audio": str(base64.b64encode(buf), 'utf-8'),
                             "encoding": "raw"}
                    if status == STATUS_FIRST_FRAME:
                        # Only the first frame carries the app id and the
                        # business arguments
                        ws.send(json.dumps({"common": wsParam.CommonArgs,
                                            "business": wsParam.BusinessArgs,
                                            "data": frame}))
                        status = STATUS_CONTINUE_FRAME
                    else:
                        ws.send(json.dumps({"data": frame}))
                        if status == STATUS_LAST_FRAME:
                            # presumably leaves time for the final results to
                            # arrive before the connection is closed
                            time.sleep(1)
                            break
                    # Simulate the audio sampling interval
                    time.sleep(intervel)
        except Exception as e:
            print("### send audio error:", e)
        finally:
            ws.close()

    thread.start_new_thread(run, ())
 #提供给xunfei_voice调用的函数
 def xunfei_asr(APPID,APISecret,APIKey,BusinessArgsASR,AudioFile):
    """Recognise the speech in ``AudioFile`` and return the recognised text.

    Note the parameter order: ``APISecret`` comes before ``APIKey``.

    The call resets the module-level ``whole_dict``, publishes the request
    parameters through the module-level ``wsParam`` (read by ``on_open``),
    runs the WebSocket session until it ends, and finally joins the stored
    segments in ascending key order. An empty string is returned when nothing
    was recognised.

    NOTE(review): state is exchanged through module globals, so concurrent
    calls would presumably interfere with each other — confirm callers
    serialise access.
    """
    global whole_dict
    global wsParam
    whole_dict = {}
    # wsParam is global so that the on_open callback can read it
    wsParam = Ws_Param(APPID=APPID, APISecret=APISecret,
                       APIKey=APIKey,BusinessArgs=BusinessArgsASR,
                       AudioFile=AudioFile)
    # Tracing is off, matching the TTS module; with it enabled the library
    # logs the traffic of every session.
    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    # NOTE(review): certificate verification is disabled here (CERT_NONE);
    # consider enabling it, since the URL carries the authorization data.
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
    # Join the stored segments to produce the final recognition output
    return "".join(whole_dict[i] for i in sorted(whole_dict))
--- a/voice/xunfei/xunfei_tts.py
+++ b/voice/xunfei/xunfei_tts.py
@@ -0,0 +1,163 @@
 # -*- coding:utf-8 -*-
 #
 #  Author: njnuko
 #  Email: njnuko@163.com
 #
 #  这个文档是基于官方的demo来改的，具体官方demo文档请参考官网
 #
 #  语音听写流式 WebAPI 接口调用示例 接口文档（必看）：https://doc.xfyun.cn/rest_api/语音听写（流式版）.html
 #  webapi 听写服务参考帖子（必看）：http://bbs.xfyun.cn/forum.php?mod=viewthread&tid=38947&extra=
 #  语音听写流式WebAPI 服务，热词使用方式：登陆开放平台https://www.xfyun.cn/后，找到控制台--我的应用---语音听写（流式）---服务管理--个性化热词，
 #  设置热词
 #  注意：热词只能在识别的时候会增加热词的识别权重，需要注意的是增加相应词条的识别率，但并不是绝对的，具体效果以您测试为准。
 #  语音听写流式WebAPI 服务，方言试用方法：登陆开放平台https://www.xfyun.cn/后，找到控制台--我的应用---语音听写（流式）---服务管理--识别语种列表
 #  可添加语种或方言，添加后会显示该方言的参数值
 #  错误码链接：https://www.xfyun.cn/document/error-code （code返回错误码时必看）
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 import websocket
 import datetime
 import hashlib
 import base64
 import hmac
 import json
 from urllib.parse import urlencode
 import time
 import ssl
 from wsgiref.handlers import format_date_time
 from datetime import datetime
 from time import mktime
 import _thread as thread
 import os
 STATUS_FIRST_FRAME = 0  # marker for the first frame
 STATUS_CONTINUE_FRAME = 1  # marker for an intermediate frame
 STATUS_LAST_FRAME = 2  # marker for the last frame
 #############
 # outfile holds the path of the output audio file.
 # NOTE(review): a module-level `global` statement does not create the name;
 # outfile only exists once xunfei_tts() has assigned it.
 global outfile
 # This file is adapted from the official demo; wsParam carries the request
 # parameters from xunfei_tts() to the on_open callback.
 global wsParam
 ##############
 class Ws_Param(object):
    """Request parameters for the Xunfei online text-to-speech WebSocket API.

    Stores the application credentials, the business arguments and the text
    to synthesise, and builds the signed connection URL.
    """

    def __init__(self, APPID, APIKey, APISecret,BusinessArgs,Text):
        """Keep the supplied values and derive the ``common`` and ``data`` sections."""
        self.APPID, self.APIKey, self.APISecret = APPID, APIKey, APISecret
        self.BusinessArgs = BusinessArgs
        self.Text = Text
        # "common" section of the request
        self.CommonArgs = {"app_id": APPID}
        # The text travels base64-encoded from its UTF-8 bytes. The original
        # demo notes that minority languages need UTF-16LE bytes instead.
        encoded_text = base64.b64encode(Text.encode('utf-8')).decode("UTF8")
        self.Data = {"status": 2, "text": encoded_text}

    def create_url(self):
        """Return the ``wss://`` URL including the HMAC-SHA256 authorization query.

        The signature covers the host, an RFC 1123 date built from the current
        time, and the request line; it is keyed with ``APISecret``.
        """
        # NOTE(review): the signed/query host is ws-api.xfyun.cn while the URL
        # host is tts-api.xfyun.cn — kept as is; confirm this is intended.
        host = "ws-api.xfyun.cn"
        # RFC 1123 formatted timestamp
        date = format_date_time(mktime(datetime.now().timetuple()))

        # Text that gets signed: host line, date line, request line
        signature_origin = "\n".join([
            "host: " + host,
            "date: " + date,
            "GET /v2/tts HTTP/1.1",
        ])
        digest = hmac.new(
            self.APISecret.encode('utf-8'),
            signature_origin.encode('utf-8'),
            digestmod=hashlib.sha256,
        ).digest()
        signature = base64.b64encode(digest).decode(encoding='utf-8')

        authorization_origin = 'api_key="{}", algorithm="{}", headers="{}", signature="{}"'.format(
            self.APIKey, "hmac-sha256", "host date request-line", signature)
        authorization = base64.b64encode(
            authorization_origin.encode('utf-8')).decode(encoding='utf-8')

        # Authentication parameters are passed in the query string
        query = urlencode({
            "authorization": authorization,
            "date": date,
            "host": host,
        })
        return 'wss://tts-api.xfyun.cn/v2/tts' + '?' + query
 def on_message(ws, message):
    """Append the audio chunk of one synthesis message to ``outfile``.

    ``message`` is a JSON document. The ``code`` field is checked first: a
    non-zero code is reported on stdout and the connection is closed, without
    touching the ``data`` section (which such a response may not carry).
    On success ``data.audio`` is base64-decoded and appended to the output
    file; the connection is closed after the chunk whose ``data.status`` is 2
    has been written.

    Any exception raised while handling the message is caught and printed.
    """
    # Output file path, set by xunfei_tts()
    global outfile
    try:
        message = json.loads(message)
        code = message["code"]
        sid = message.get("sid")
        if code != 0:
            errMsg = message.get("message")
            print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
            # No audio will follow an error, so end the session right away
            ws.close()
            return
        data = message["data"]
        audio = base64.b64decode(data["audio"])
        with open(outfile, 'ab') as f:
            f.write(audio)
        # Close only after the final chunk has been written
        if data["status"] == 2:
            print("ws is closed")
            ws.close()
    except Exception as e:
        print("receive msg,but parse exception:", e)
 # 收到websocket连接建立的处理
 def on_open(ws):
    """Send the synthesis request from a background thread once connected.

    Any existing file at ``outfile`` is removed *before* the request is sent.
    ``on_message`` appends audio chunks to that file, so removing it after
    sending could delete a chunk that had already arrived.
    """
    global outfile
    global wsParam

    def run(*args):
        # Start from a clean output file before any audio can come back
        if os.path.exists(outfile):
            os.remove(outfile)
        request = json.dumps({"common": wsParam.CommonArgs,
                              "business": wsParam.BusinessArgs,
                              "data": wsParam.Data,
                              })
        print("------>开始发送文本数据")
        ws.send(request)

    thread.start_new_thread(run, ())
 # 收到websocket错误的处理
 def on_error(ws, error):
    """Report a WebSocket error on stdout; ``ws`` itself is not used."""
    print("### error:", error)
 # 收到websocket关闭的处理
 def on_close(ws, *args):
    """Print a marker when the WebSocket closes.

    Extra positional arguments are accepted and ignored, so the handler works
    whether the WebSocket library passes only the connection or also a close
    status and message (the ASR module's handler takes three arguments).
    """
    print("### closed ###")
 def xunfei_tts(APPID, APIKey, APISecret,BusinessArgsTTS, Text, OutFile):
    """Synthesise ``Text`` through the Xunfei TTS WebSocket API.

    The output path and the request parameters are published through the
    module-level ``outfile`` and ``wsParam`` names, which the WebSocket
    callbacks read. The session runs until it ends, then the output path is
    returned.
    """
    global outfile
    global wsParam
    outfile = OutFile
    wsParam = Ws_Param(APPID,APIKey,APISecret,BusinessArgsTTS,Text)
    websocket.enableTrace(False)
    app = websocket.WebSocketApp(
        wsParam.create_url(),
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )
    app.on_open = on_open
    # Certificate verification is disabled for this connection
    app.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
    return outfile
--- a/voice/xunfei/xunfei_voice.py
+++ b/voice/xunfei/xunfei_voice.py
@@ -0,0 +1,77 @@
 #####################################################################
 #    xunfei voice service
 #     Auth: njnuko
 #     Email: njnuko@163.com
 #
 #    要使用本模块, 首先到 xfyun.cn 注册一个开发者账号,
 #    之后创建一个新应用, 然后在应用管理的语音识别或者语音合成右边可以查看APPID API Key 和 Secret Key
 #    然后在 config.json 中填入这三个值
 #####################################################################
 import json
 import os
 import time
 from bridge.reply import Reply, ReplyType
 from common.log import logger
 from common.tmp_dir import TmpDir
 from config import conf
 from voice.voice import Voice
 from .xunfei_asr import xunfei_asr
 from .xunfei_tts import xunfei_tts
 from voice.audio_convert import any_to_mp3
 import shutil
 from pydub import AudioSegment
 class XunfeiVoice(Voice):
    """Voice backend that delegates to the Xunfei ASR and TTS helpers.

    Credentials and business arguments are read from ``config.json`` located
    next to this module.
    """

    def __init__(self):
        """Load the credentials and the TTS/ASR arguments from ``config.json``.

        A failure to read or parse the file is logged and ignored; the
        attributes then keep their ``None`` defaults.
        """
        # Defaults so later attribute access does not raise AttributeError
        # when the configuration could not be loaded.
        self.APPID = None
        self.APIKey = None
        self.APISecret = None
        self.BusinessArgsTTS = None
        self.BusinessArgsASR = None
        try:
            curdir = os.path.dirname(__file__)
            config_path = os.path.join(curdir, "config.json")
            with open(config_path, "r") as fr:
                cfg = json.load(fr)
            # The parsed configuration is deliberately not printed: it holds
            # the API key and secret.
            self.APPID = str(cfg.get("APPID"))
            self.APIKey = str(cfg.get("APIKey"))
            self.APISecret = str(cfg.get("APISecret"))
            self.BusinessArgsTTS = cfg.get("BusinessArgsTTS")
            self.BusinessArgsASR = cfg.get("BusinessArgsASR")
        except Exception as e:
            logger.warning("XunfeiVoice init failed: %s, ignore " % e)

    def voiceToText(self, voice_file):
        """Recognise the speech in a local audio file.

        Returns a TEXT reply holding the recognised text, or an ERROR reply
        containing the exception text when recognition raised.
        """
        try:
            logger.debug("[Xunfei] voice file name={}".format(voice_file))
            # xunfei_asr takes APISecret before APIKey
            text = xunfei_asr(self.APPID,self.APISecret,self.APIKey,self.BusinessArgsASR,voice_file)
            logger.info("讯飞语音识别到了: {}".format(text))
            reply = Reply(ReplyType.TEXT, text)
        except Exception as e:
            logger.warning("[Xunfei] voiceToText error={}".format(e))
            reply = Reply(ReplyType.ERROR, "讯飞语音识别出错了；{0}".format(e))
        return reply

    def textToVoice(self, text):
        """Synthesise ``text`` into an mp3 file in the temporary directory.

        Returns a VOICE reply holding the file path, or an ERROR reply when
        synthesis raised or produced no file.
        """
        try:
            # Avoid the same filename under multithreading
            fileName = TmpDir().path() + "reply-" + str(int(time.time())) + "-" + str(hash(text) & 0x7FFFFFFF) + ".mp3"
            xunfei_tts(self.APPID,self.APIKey,self.APISecret,self.BusinessArgsTTS,text,fileName)
            # xunfei_tts returns the path even when no audio was written
            # (for example after an API error), so check the file itself.
            if not os.path.exists(fileName):
                raise RuntimeError("no audio file was produced: {}".format(fileName))
            logger.info("[Xunfei] textToVoice text={} voice file name={}".format(text, fileName))
            reply = Reply(ReplyType.VOICE, fileName)
        except Exception as e:
            logger.error("[Xunfei] textToVoice error={}".format(e))
            reply = Reply(ReplyType.ERROR, "抱歉，讯飞语音合成失败")
        return reply