最近需要完成一个实时语音识别的项目,因此将讯飞的实时语音识别接口简单封装了一下,代码如下:
import threading
import time
import wave
import pyaudio
import json
import websocket
import hashlib
import hmac
import base64
from urllib.parse import quote
# ================== Credentials ==================
# Fill in with the APPID / API key of your iFLYTEK (xfyun) RTASR application.
APPID = ""
API_KEY = ""
# ================== Audio parameters ==================
CHUNK = 1024 # bytes read from the microphone per frame
FORMAT = pyaudio.paInt16 # 16-bit signed PCM, as required by the RTASR service
CHANNELS = 1 # mono
RATE = 16000 # sample rate in Hz (RTASR expects 16 kHz)
INTERVAL = 0.04 # pause between sends, in seconds (paces uploads to real time)
# ================== 实时识别客户端 ==================
class RealtimeASRClient:
    """Thin client for the iFLYTEK (xfyun) real-time ASR WebSocket API.

    Typical workflow: call connect(), then run send_audio_stream() in one
    thread and recv() in another. Recognized text is printed as it arrives
    and, when *output_file* is given, appended to it line by line.
    """

    def __init__(self, output_file=None):
        # output_file: an already-open, writable text-mode file object (or
        # None). Recognized lines are written and flushed to it immediately.
        self.ws = None            # websocket connection, set by connect()
        self.connected = False    # True only between connect() and close()
        self.output_file = output_file
        # Serializes ws.send() calls: audio frames and the end tag may be
        # issued from different threads, and the underlying socket is not
        # safe for concurrent writes.
        self.lock = threading.Lock()

    def connect(self):
        """Open the signed WebSocket connection to the RTASR endpoint.

        Signature per the xfyun auth scheme:
        signa = base64(HMAC-SHA1(API_KEY, hex(md5(APPID + ts)))).
        Sets self.connected to reflect the outcome; never raises.
        """
        try:
            ts = str(int(time.time()))
            base_string = hashlib.md5((APPID + ts).encode('utf-8')).hexdigest()
            digest = hmac.new(API_KEY.encode('utf-8'),
                              base_string.encode('utf-8'),
                              hashlib.sha1).digest()
            signa = base64.b64encode(digest).decode('utf-8')
            base_url = "ws://rtasr.xfyun.cn/v1/ws"
            self.ws = websocket.create_connection(
                f"{base_url}?appid={APPID}&ts={ts}&signa={quote(signa)}"
            )
            print("WebSocket 连接成功")
            self.connected = True
        except Exception as e:
            print("WebSocket 连接失败:", e)
            self.connected = False

    def send_audio_stream(self, stream):
        """Read PCM frames from *stream* (a pyaudio input stream) and push
        them to the server until the stream ends or the client disconnects.

        On exit the {"end": true} marker is sent on a best-effort basis so
        the server finalizes the session; a failure to send it (e.g. the
        socket already broke) no longer crashes the sender thread.
        """
        if not self.connected:
            return
        end_tag = "{\"end\": true}"
        try:
            while self.connected:
                data = stream.read(CHUNK, exception_on_overflow=False)
                if not data:
                    break
                with self.lock:
                    self.ws.send(data, opcode=websocket.ABNF.OPCODE_BINARY)
                time.sleep(INTERVAL)  # pace uploads to roughly real time
        except Exception as e:
            print("音频发送错误:", e)
        finally:
            # Guarded: the original sent the end tag unprotected here, so a
            # dead socket raised out of the finally and masked the real error.
            try:
                with self.lock:
                    self.ws.send(end_tag.encode('utf-8'))
                print("已发送结束标记")
            except Exception as e:
                print("音频发送错误:", e)

    def recv(self):
        """Receive loop: parse server messages and emit recognized text.

        Messages with action == "result" carry a JSON payload in "data";
        extracted text is printed and optionally appended to output_file.
        Always closes the connection when the loop ends, for any reason.
        """
        if not self.connected:
            return
        try:
            while self.connected:
                result = self.ws.recv()
                if not result:
                    print("接收结束")
                    break
                try:
                    result_dict = json.loads(result)
                    if result_dict.get("action") == "result":
                        data = json.loads(result_dict.get("data", ""))
                        text = self.extract_text_from_result(data)
                        if text:
                            print("实时识别:", text)
                            if self.output_file:
                                self.output_file.write(text + "\n")
                                self.output_file.flush()
                except json.JSONDecodeError:
                    print("JSON 解析失败:", result)
        except Exception as e:
            print("接收异常:", e)
        finally:
            self.close()

    def close(self):
        """Mark the client disconnected and close the socket (idempotent)."""
        self.connected = False
        if self.ws:
            # Closing is best-effort: the socket may already be dead, and
            # close() is invoked from recv()'s finally block.
            try:
                self.ws.close()
            except Exception:
                pass
            print("WebSocket 已关闭")

    def extract_text_from_result(self, result_dict):
        """Concatenate the best-candidate words from an RTASR result payload.

        Walks every segment in cn.st.rt (the original read only rt[0] and
        dropped text when the server returned multiple segments) and joins
        the first candidate word (cw[0].w) of each token.
        Returns "" on any unexpected payload shape.
        """
        try:
            segments = result_dict.get("cn", {}).get("st", {}).get("rt", [])
            words = []
            for segment in segments:
                for token in segment.get("ws", []):
                    candidates = token.get("cw")
                    if candidates:
                        words.append(candidates[0].get("w", ""))
            return "".join(words).strip()
        except Exception as e:
            print("解析失败:", e)
            return ""
此外,博主还需要在开启语音识别的同时另起一个线程进行录音,并将识别结果保存到 txt 文件中,最后把录音与图像合并,生成一段完整视频。在实验过程中,博主发现识别速度较慢;查询相关技术文档后发现,只需调整数据传输块的大小和发送间隔,便可以显著提升速度,即:
CHUNK = 1024 # 每帧大小
INTERVAL = 0.04 # 发送间隔(秒)
默认配置是每 40 毫秒发送一块 1024 B 的数据;经过博主测试,调整为每 10 毫秒发送 4096 B 也能稳定运行,处理速度明显加快。
至于识别精度,说实话,一言难尽,这个实时的识别效果比起上传读取音频的识别效果要差很多。
不过毕竟它能够实现实时识别的效果,如何取舍就看大家的实际需求了。