使用VOSK实现语音识别(语音转文字输出)
以下代码读取麦克风输入并输出中文:
#encoding:utf-8
import wave
import time
import json
import threading
from vosk import Model,KaldiRecognizer,SetLogLevel
# Suppress Vosk log output.
SetLogLevel(-1)
# Directory of the Vosk model to load (relative to the working directory).
model_path = "models/vosk-model-cn-0.1"
# Module-level model instance, used below to build the recognizer.
model = Model(model_path)
def resume_microphone(rec):
    """Un-pause a capture device if it is currently paused.

    Args:
        rec: Object exposing ``state()`` and ``pause()``; the script passes
            the ALSA capture PCM device here (not the Vosk recognizer).
    """
    paused_state = alsaaudio.PCM_STATE_PAUSED
    # Debug output: current device state next to the "paused" constant.
    print(rec.state(), paused_state)
    if rec.state() != paused_state:
        return
    rec.pause(False)
import alsaaudio

# Recognizer fed with 16 kHz audio, matching the capture rate configured below.
rec = KaldiRecognizer(model, 16000)
rec.SetWords(True)
rec.SetPartialWords(True)

# Non-blocking mono capture, signed 16-bit little-endian samples.
inp = alsaaudio.PCM(
    alsaaudio.PCM_CAPTURE,
    alsaaudio.PCM_NONBLOCK,
    channels=1,
    rate=16000,
    format=alsaaudio.PCM_FORMAT_S16_LE,
    periodsize=4096,
)

# Pending auto-resume timer (started by the "停止" command), kept so it can
# be cancelled on shutdown instead of firing on an already closed device.
resume_timer = None

try:
    while 1:
        length, data = inp.read()
        if length <= 0:
            # In non-blocking mode read() reports a zero (no period ready) or
            # negative (overrun) length; there is nothing to recognize, so
            # wait briefly instead of re-printing the same partial result.
            time.sleep(0.01)
            continue

        if rec.AcceptWaveform(data):
            # A complete utterance was recognized.
            result = json.loads(rec.Result())
            text = result["text"]
            print("You say:{}".format(text))
            if "停止" in text:
                print("program will be pause")
                inp.pause(True)
                # Automatically resume capture after 5 seconds; replace any
                # timer that is still pending from an earlier command.
                if resume_timer is not None:
                    resume_timer.cancel()
                resume_timer = threading.Timer(5, resume_microphone, args=(inp,))
                resume_timer.start()
            if "退出" in text:
                print("program will be exist.")
                break
            if "恢复" in text:
                inp.pause(False)
                print("program will be resume")
        else:
            # Utterance still in progress: show the partial hypothesis.
            res = json.loads(rec.PartialResult())
            partial = res["partial"]
            if partial:
                print("Say:{}".format(partial))
                if "退出" in partial:
                    print("program will be exist.")
                    break
        time.sleep(0.01)
except KeyboardInterrupt:
    print("KeyboardInterrupt...")
finally:
    # Stop a pending resume timer first so it cannot touch the closed device
    # (and so the interpreter does not wait for it before exiting).
    if resume_timer is not None:
        resume_timer.cancel()
    inp.close()
对已存在的音频文件进行识别,要求文件为16kHz采样率、单声道、16位PCM的WAV格式:
#encoding:utf-8
import wave
import json
from vosk import Model,KaldiRecognizer,SetLogLevel
def recognize_speech_from_file(filename):
    """Recognize speech in a WAV file and print the recognized text.

    The file must be mono, 16-bit, uncompressed PCM; otherwise a message is
    printed and the function returns without recognizing anything. The
    recognizer is created with the file's own frame rate.

    Relies on a module-level ``model`` (a loaded ``vosk.Model``) that must
    exist before this function is called.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        None. Recognized text is printed to standard output.
    """
    SetLogLevel(-1)
    # The context manager closes the file on every path, including the early
    # return for an unsupported format.
    with wave.open(filename, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM")
            return
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        rec.SetPartialWords(True)
        while True:
            data = wf.readframes(2048)
            if not data:
                break
            # Only completed utterances are printed; partial results
            # (rec.PartialResult()) are intentionally not shown.
            if rec.AcceptWaveform(data):
                jres = json.loads(rec.Result())
                print(jres["text"])
    # Flush whatever the recognizer still holds after the last chunk.
    final_result = rec.FinalResult()
    if final_result:
        final_result = json.loads(final_result)
        text = final_result.get("text")
        if text:
            print(text)
# Example invocation: recognize the speech stored in "output.wav".
recognize_speech_from_file("output.wav")
使用pyaudio读取麦克风输入进行识别的情况:
#encoding:utf-8
import wave
import json
from pyaudio import PyAudio, paInt16
import time

from vosk import Model, KaldiRecognizer, SetLogLevel

# Suppress Vosk log output.
SetLogLevel(-1)
model_path = "models/vosk-model-cn-0.1"
model = Model(model_path)

# Recognizer fed with 16 kHz audio, matching the input stream opened below.
rec = KaldiRecognizer(model, 16000)
rec.SetWords(True)
rec.SetPartialWords(True)

# Mono 16-bit input stream delivering 4096-frame buffers.
p = PyAudio()
stream = p.open(format=paInt16, channels=1, rate=16000, input=True, frames_per_buffer=4096)
stream.start_stream()

try:
    while stream.is_active():
        data = stream.read(4096)
        if rec.AcceptWaveform(data):
            # A complete utterance was recognized.
            result = json.loads(rec.Result())
            text = result["text"]
            print("You say:{}".format(text))
            if "退出" in text:
                print("program will be exist.")
                break
        else:
            # Utterance still in progress: show the partial hypothesis.
            res = json.loads(rec.PartialResult())
            partial = res["partial"]
            if partial:
                print("Say:{}".format(partial))
                if "退出" in partial:
                    print("program will be exist.")
                    break
        time.sleep(0.01)
except KeyboardInterrupt:
    print("KeyboardInterrupt...")
finally:
    # Release the audio stream and the PyAudio instance on every exit path.
    stream.stop_stream()
    stream.close()
    p.terminate()
参考文章:
https://blog.csdn.net/weixin_48967543/article/details/142338862