使用SenseVoiceSmall进行声音转文字

1、环境

Windows 10 + Python 3.11 + PyCharm + torch 2.3.0

2、下载模型

git clone https://www.modelscope.cn/iic/SenseVoiceSmall.git

3、启动模型和对外API

import base64
import os
import tempfile

import uvicorn
from fastapi import FastAPI
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from pydantic import BaseModel

# Local directory that holds the downloaded SenseVoiceSmall model files.
model_dir = "G:\\py_workspace\\qwen\\models\\models\\SenseVoiceSmall"

# Build the recognition model once at import time so that every request
# handled by this process reuses the same instance.
model = AutoModel(
    model=model_dir,
    device="cuda:0",
    vad_model="fsmn-vad",
    vad_kwargs={
        # NOTE(review): presumably milliseconds (i.e. 30 s) — confirm
        # against the FunASR documentation.
        "max_single_segment_time": 30000,
    },
    remote_code="./model.py",
    trust_remote_code=True,
)

# FastAPI application instance that the routes are registered on.
app = FastAPI()


# Pydantic model used to validate the JSON body received by the /asr route.
class ASRItem(BaseModel):
    # Despite its name this is not a filesystem path: the /asr handler
    # Base64-decodes this string and treats the result as audio file data.
    audio_path: str


# POST route served at /asr.
@app.post("/asr")
async def asr(item: ASRItem):
    """Transcribe Base64-encoded audio and return the recognised text.

    Args:
        item: Request body. ``item.audio_path`` carries the Base64-encoded
            audio payload (not a filesystem path, despite the name).

    Returns:
        ``{"code": 0, "msg": "ok", "res": <text>}`` on success, or
        ``{"code": 1, "msg": <error message>}`` when any step fails.
    """
    # NOTE(review): model.generate is called synchronously inside an async
    # handler, so the event loop is blocked for the duration of inference.
    tmp_path = None
    try:
        # Decode the Base64 payload into the raw audio bytes.
        data = base64.b64decode(item.audio_path)
        if not data:
            raise ValueError("audio_path decoded to empty audio data")

        # Write the audio to a per-request temporary file instead of a shared
        # "test.wav", so parallel workers cannot overwrite each other's input.
        # delete=False plus leaving the ``with`` block closes the handle first:
        # on Windows a still-open NamedTemporaryFile cannot be reopened by name.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_path = f.name
            f.write(data)

        # Run speech recognition on the temporary file.
        res = model.generate(
            tmp_path,
            language="auto",  # auto-detect; may also be fixed, e.g. "zh" or "en"
            use_itn=True,  # apply inverse text normalization
            batch_size_s=60,  # batch size, in seconds
            merge_vad=True,  # merge VAD segments
            merge_length_s=15,  # merge length, in seconds
        )
        if not res:
            raise ValueError("speech recognition returned no result")

        # The first result's "text" entry is post-processed into the final
        # transcript.
        text = rich_transcription_postprocess(res[0]["text"])

        result_dict = {"code": 0, "msg": "ok", "res": text}
    except Exception as e:
        # Route boundary: report every failure in the same JSON envelope
        # rather than letting it propagate as an HTTP 500.
        result_dict = {"code": 1, "msg": str(e)}
    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not turn a
                # finished request into a failure.
                pass

    return result_dict


# Start the HTTP server only when this file is executed as a script.
if __name__ == "__main__":
    # 0.0.0.0 binds every network interface, so the API is reachable from
    # other machines on port 2002.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=2002,
    )

4、模型测试代码

from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Local directory that holds the downloaded SenseVoiceSmall model files.
model_dir = "G:\\py_workspace\\qwen\\models\\models\\SenseVoiceSmall"

# Load the recognition model together with the "fsmn-vad" VAD model.
model = AutoModel(
    model=model_dir,
    device="cuda:0",
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},
    remote_code="./model.py",
    trust_remote_code=True,
)

# Transcribe the sample recording (marked "en" by the author).
res = model.generate(
    input="G:/py_workspace/qwen/media/demo2.mp3",
    language="auto",  # or one of "zh", "en", "yue", "ja", "ko", "nospeech"
    cache={},
    use_itn=True,  # apply inverse text normalization
    merge_vad=True,  # merge VAD segments
    merge_length_s=15,  # merge length, in seconds
    batch_size_s=60,  # batch size, in seconds
)

# Post-process the first result's text and print the final transcript.
text = rich_transcription_postprocess(res[0]["text"])
print(text)