使用VOSK实现语音识别(语音转文字输出)

以下代码通过alsaaudio读取麦克风输入,并输出识别出的中文文本:

#encoding:utf-8
import wave
import time
import json
import threading
from vosk import Model,KaldiRecognizer,SetLogLevel

# Disable log output from the vosk library.
SetLogLevel(-1)
# Model directory, given relative to the current working directory.
# NOTE(review): presumably a Chinese model, judging by the "cn" in the name -- confirm.
model_path = "models/vosk-model-cn-0.1"
# Load the model once at module level; the recognizer created later reuses it.
model = Model(model_path)

def resume_microphone(rec):
    """Un-pause the capture device ``rec`` when it reports the paused state.

    Despite the parameter name, the timer in this script passes the ALSA
    capture object (``inp``), not the speech recognizer.

    The current device state is printed next to
    ``alsaaudio.PCM_STATE_PAUSED`` first; ``rec.pause(False)`` is then
    called only if the two are equal.
    """
    print(rec.state(), alsaaudio.PCM_STATE_PAUSED)
    is_paused = rec.state() == alsaaudio.PCM_STATE_PAUSED
    if not is_paused:
        return
    rec.pause(False)

# Recognizer for 16 kHz audio; also request word-level details in both the
# final and the partial results.
rec = KaldiRecognizer(model,16000)
rec.SetWords(True)
rec.SetPartialWords(True)
import alsaaudio
# Non-blocking mono capture, 16-bit little-endian samples at 16 kHz, matching
# the sample rate given to the recognizer above.
inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE,alsaaudio.PCM_NONBLOCK,channels=1,rate=16000,format=alsaaudio.PCM_FORMAT_S16_LE,periodsize=4096)
# Pending auto-resume timer started by the "停止" command, if any.
resume_timer = None
try:
    while True:
        length, data = inp.read()
        if length <= 0 or not data:
            # The device is opened non-blocking, so a read may deliver no
            # audio (zero or negative length).  Skip the recognizer instead
            # of feeding it an empty buffer and re-printing a stale partial.
            time.sleep(0.01)
            continue
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            text = result["text"]
            print("You say:{}".format(text))
            if "停止" in text:
                print("program will be pause")
                inp.pause(True)
                # Keep a single pending timer: drop the previous one so
                # repeated commands do not stack up resume callbacks.
                if resume_timer is not None:
                    resume_timer.cancel()
                resume_timer = threading.Timer(5, resume_microphone,args=(inp,))
                resume_timer.start()
            if "退出" in text:
                print("program will be exist.")
                break
            if "恢复" in text:
                inp.pause(False)
                print("program will be resume")
        else:
            res = json.loads(rec.PartialResult())
            partial = res["partial"]
            if partial:
                print("Say:{}".format(partial))
                if "退出" in partial:
                    print("program will be exist.")
                    break
        time.sleep(0.01)
except KeyboardInterrupt:
    print("KeyboardInterrupt...")
finally:
    # Cancel a still-pending resume timer first: otherwise it would fire on
    # the closed device and keep the process alive until it expires.
    if resume_timer is not None:
        resume_timer.cancel()
    inp.close()

对已存在的音频文件进行识别,输入需为16kHz采样率、单声道、16位PCM的WAV文件:

#encoding:utf-8
import wave
import json
from vosk import Model,KaldiRecognizer,SetLogLevel


def recognize_speech_from_file(filename):
    """Transcribe a WAV file with Vosk and print the recognized text.

    The file must be mono, 16-bit (2-byte samples) and uncompressed PCM;
    otherwise a message is printed and the function returns without
    recognizing anything.  The recognizer is created with the file's own
    frame rate and uses the module-level ``model``, which must already be
    loaded when this function is called.

    Args:
        filename: Path of the WAV file to transcribe.

    Returns:
        None. Each completed utterance, and any non-empty final result, is
        printed to standard output.
    """
    SetLogLevel(-1)
    # The context manager closes the file on every path, including the
    # early return for unsupported formats.
    with wave.open(filename, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM")
            return
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        rec.SetPartialWords(True)
        while True:
            data = wf.readframes(2048)
            if not data:
                break
            if rec.AcceptWaveform(data):
                jres = json.loads(rec.Result())
                print(jres["text"])
    # Collect whatever the recognizer still holds after the last chunk.
    final_result = rec.FinalResult()
    if final_result:
        final_result = json.loads(final_result)
        text = final_result.get("text")
        if text:
            print(text)

# recognize_speech_from_file() reads the module-level `model`, which this
# script never created; load it before the call so the script runs on its own.
SetLogLevel(-1)
model = Model("models/vosk-model-cn-0.1")
recognize_speech_from_file("output.wav")

使用pyaudio的情况:

#encoding:utf-8
import wave
import json
import time

from pyaudio import PyAudio, paInt16
from vosk import Model, KaldiRecognizer, SetLogLevel

# Disable log output from the vosk library, then load the model.
SetLogLevel(-1)
model_path = "models/vosk-model-cn-0.1"
model = Model(model_path)

# Recognizer for 16 kHz audio with word-level details in final and partial
# results.
rec = KaldiRecognizer(model,16000)
rec.SetWords(True)
rec.SetPartialWords(True)
p = PyAudio()
# Opened inside the try block so PyAudio is terminated even if opening fails.
stream = None
try:
    # Mono 16-bit input at 16 kHz, matching the recognizer's sample rate.
    stream = p.open(format=paInt16, channels=1, rate=16000, input=True, frames_per_buffer=4096)
    stream.start_stream()
    while stream.is_active():
        # Do not raise when the input buffer overflowed while the previous
        # chunk was being recognized; losing a little audio is preferable to
        # aborting the whole session.
        data = stream.read(4096, exception_on_overflow=False)
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            text = result["text"]
            print("You say:{}".format(text))
            if "退出" in text:
                print("program will be exist.")
                break
        else:
            res = json.loads(rec.PartialResult())
            partial = res["partial"]
            if partial:
                print("Say:{}".format(partial))
                if "退出" in partial:
                    print("program will be exist.")
                    break
        time.sleep(0.01)
except KeyboardInterrupt:
    print("KeyboardInterrupt...")
finally:
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

参考文章:

https://blog.csdn.net/weixin_48967543/article/details/142338862

posted @ 2024-10-16 13:45  yafeile  阅读(17)  评论(0)  编辑  收藏  举报