使用sherpa + go 来做语音识别
参考
1.预训练模型:https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
2.sherpa-onnx介绍 https://k2-fsa.github.io/sherpa/onnx/index.html
3. go api 介绍 https://k2-fsa.github.io/sherpa/onnx/go-api/index.html
术语解释
在线、离线、流式 streaming、非流式 non-streaming 的区别
在线 等同于流式,离线 等同于非流式。
在线 即流式,是边说边识别;响应速度快、延迟小。
离线 即非流式,是把所有待识别的数据,一次性送给模型;特点是需要 等待所有的数据都到齐, 然后才能开始识别。
语音转文字
语音转文字涉及 2 个步骤:识别为文字、加入标点符号;下表同时列出了后文用到的文字转语音(TTS)模型。
模型名 | 用途 | 地址 |
---|---|---|
sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 | 语音转文字 | https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english |
sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12 | 加入标点符号 | https://k2-fsa.github.io/sherpa/onnx/punctuation/pretrained_models.html#sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12 |
sherpa-onnx-vits-zh-ll | 文字转语音 | https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#csukuangfj-sherpa-onnx-vits-zh-ll-chinese-5-speakers |
测试流程
如果需要在win系统运行,请在wsl内运行!
1. 下载上述3个模型包 到 /your_path
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/punctuation-models/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
tar xvf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
rm sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
rm sherpa-onnx-vits-zh-ll.tar.bz2
2. 复制如下代码到 /your_path/sherpa/sherpa.go
package sherpa
import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path"
	"strings"
	"time"

	sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
	"github.com/youpy/go-wav"
)
const (
	// supportAudioType is the only container fed to the recognizer directly;
	// any other input type is converted to wav via ffmpeg first.
	supportAudioType = "wav"
	// CMD_AMR_TO_WAV is the ffmpeg argument template for the conversion,
	// e.g.: ffmpeg -i test.amr -y -ar 22050 test.wav
	CMD_AMR_TO_WAV = "-i %s -y -ar 22050 %s"
)
// SherpaOnnx bundles the three sherpa-onnx engines (streaming speech
// recognition, punctuation restoration, text-to-speech) plus the path of the
// ffmpeg binary used for audio format conversion.
type SherpaOnnx struct {
	cfg        *SherpaConfig
	recognizer *sherpa.OnlineRecognizer // online (streaming) ASR
	tts        *sherpa.OfflineTts       // offline text-to-speech
	// offline punctuation model (field name keeps the original spelling)
	offlinePuncuation *sherpa.OfflinePunctuation
	// ffmpeg executable path. NOTE(review): never assigned in this file —
	// non-wav input would exec an empty command; confirm callers set it.
	ffmpeg string
}
// SherpaConfig holds the model file paths and runtime parameters for all
// three sherpa-onnx components.
type SherpaConfig struct {
	// Streaming speech-to-text transducer model files.
	Decoder string
	Encoder string
	Joiner  string
	Tokens  string
	// Punctuation restoration model (CT-Transformer).
	CtTransformer string
	// TTS (VITS) model files and synthesis parameters.
	VitsModel       string
	VitsDictDir     string
	VitsLexicon     string
	VitsTokens      string
	VitsLengthScale float32
	Sid             int // speaker id passed to tts.Generate
	TtsNumThreads   int
}
// RecognitionAudio is the input payload for HandleRecognize: the raw audio
// bytes plus their container/extension (e.g. "wav", "amr").
type RecognitionAudio struct {
	Buffer    *bytes.Buffer
	AudioType string
}
// NewSherpa constructs a SherpaOnnx instance from cfg: a streaming
// recognizer (greedy search, 16 kHz / 80-dim features), an offline
// punctuation model, and an offline VITS TTS engine, all running on CPU.
// Model loading may take several seconds.
func NewSherpa(cfg *SherpaConfig) *SherpaOnnx {
	inst := &SherpaOnnx{cfg: cfg}

	// Streaming transducer recognizer configuration.
	recCfg := sherpa.OnlineRecognizerConfig{
		FeatConfig: sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80},
		ModelConfig: sherpa.OnlineModelConfig{
			Transducer: sherpa.OnlineTransducerModelConfig{
				Encoder: cfg.Encoder,
				Decoder: cfg.Decoder,
				Joiner:  cfg.Joiner,
			},
			Tokens:     cfg.Tokens,
			NumThreads: 1,
			Provider:   "cpu",
		},
		DecodingMethod: "greedy_search",
		MaxActivePaths: 4,
	}

	println("Initializing recognizer (may take several seconds)")
	inst.recognizer = sherpa.NewOnlineRecognizer(&recCfg)
	println("Recognizer created!")

	// Punctuation restoration model.
	punctCfg := sherpa.OfflinePunctuationConfig{
		Model: sherpa.OfflinePunctuationModelConfig{
			CtTransformer: cfg.CtTransformer,
		},
	}
	inst.offlinePuncuation = sherpa.NewOfflinePunctuation(&punctCfg)

	println("Initializing tts (may take several seconds)")
	// VITS text-to-speech engine.
	ttsCfg := sherpa.OfflineTtsConfig{
		Model: sherpa.OfflineTtsModelConfig{
			Vits: sherpa.OfflineTtsVitsModelConfig{
				Model:       cfg.VitsModel,
				Lexicon:     cfg.VitsLexicon,
				Tokens:      cfg.VitsTokens,
				NoiseScale:  0.667,
				NoiseScaleW: 0.8,
				LengthScale: cfg.VitsLengthScale,
				DictDir:     cfg.VitsDictDir,
			},
			NumThreads: cfg.TtsNumThreads,
			Debug:      0,
			Provider:   "cpu",
		},
		MaxNumSentences: 1,
	}
	inst.tts = sherpa.NewOfflineTts(&ttsCfg)
	println("Tts created!")

	return inst
}
// HandleRecognize transcribes the audio held in audio.Buffer and returns the
// recognized text with punctuation restored. Non-wav input is converted to
// wav with the configured ffmpeg binary first. Temporary files are removed
// before the method returns.
func (s *SherpaOnnx) HandleRecognize(ctx context.Context, audio *RecognitionAudio) (textResult string, err error) {
	// 1. Persist the raw payload to a temp file so ffmpeg / the wav reader
	// can operate on it.
	audioFile, err := os.CreateTemp(os.TempDir(), "*."+audio.AudioType)
	if err != nil {
		return
	}
	defer os.Remove(audioFile.Name())
	defer audioFile.Close()
	if _, err = io.Copy(audioFile, audio.Buffer); err != nil {
		return
	}
	// 2. Convert to wav when the input is another format (e.g. amr).
	if audio.AudioType != supportAudioType {
		tmpWav, cerr := os.CreateTemp(os.TempDir(), "*."+supportAudioType)
		if cerr != nil {
			// BUG FIX: previously this path returned a nil error.
			err = cerr
			return
		}
		defer os.Remove(tmpWav.Name())
		defer tmpWav.Close()
		args := fmt.Sprintf(CMD_AMR_TO_WAV, audioFile.Name(), tmpWav.Name())
		if _, cerr = exec.Command(s.ffmpeg, strings.Split(args, " ")...).CombinedOutput(); cerr != nil {
			err = cerr
			return
		}
		audioFile = tmpWav
	}
	// BUG FIX: rewind before decoding. io.Copy above left the handle's
	// offset at EOF, so readWave would have seen an empty stream on the
	// wav fast path.
	if _, err = audioFile.Seek(0, io.SeekStart); err != nil {
		return
	}
	// 3. Feed the samples through the streaming recognizer.
	stream := sherpa.NewOnlineStream(s.recognizer)
	defer sherpa.DeleteOnlineStream(stream)
	samples, sampleRate, err := readWave(ctx, audioFile)
	if err != nil {
		return
	}
	stream.AcceptWaveform(sampleRate, samples)
	// 0.3 s of trailing silence flushes the decoder's internal buffer.
	tailPadding := make([]float32, int(float32(sampleRate)*0.3))
	stream.AcceptWaveform(sampleRate, tailPadding)
	for s.recognizer.IsReady(stream) {
		s.recognizer.Decode(stream)
	}
	// 4. Collect the text and restore punctuation.
	textResult = s.recognizer.GetResult(stream).Text
	textResult = s.offlinePuncuation.AddPunct(textResult)
	return
}
// readWave decodes a mono, 16-bit PCM wav file into normalized float32
// samples (range [-1, 1)) and returns them with the file's sample rate.
// Only PCM / 1 channel / 16-bit files are accepted.
func readWave(ctx context.Context, file *os.File) (samples []float32, sampleRate int, err error) {
	reader := wav.NewReader(file)
	format, err := reader.Format()
	if err != nil {
		// BUG FIX: wrap instead of discarding the underlying error.
		err = fmt.Errorf("read wave format: %w", err)
		return
	}
	if format.AudioFormat != 1 {
		err = fmt.Errorf("support only PCM format, given: %v", format.AudioFormat)
		return
	}
	if format.NumChannels != 1 {
		err = fmt.Errorf("support only 1 channel wave file, given: %v", format.NumChannels)
		return
	}
	if format.BitsPerSample != 16 {
		err = fmt.Errorf("support only 16-bit per sample, given: %v", format.BitsPerSample)
		return
	}
	reader.Duration() // side effect: populates reader.Size
	buf := make([]byte, reader.Size)
	// BUG FIX: a single Read call is not guaranteed to fill buf
	// (io.Reader contract); ReadFull retries until buf is full or errors.
	if _, err = io.ReadFull(reader, buf); err != nil {
		err = fmt.Errorf("read %v bytes of samples: %w", reader.Size, err)
		return
	}
	samples, err = samplesInt16ToFloat(buf)
	if err != nil {
		return
	}
	sampleRate = int(format.SampleRate)
	return
}
// samplesInt16ToFloat converts little-endian 16-bit PCM bytes into float32
// samples normalized to [-1, 1). A trailing odd byte, if any, is ignored.
// The error result is kept for interface compatibility and is always nil.
func samplesInt16ToFloat(inSamples []byte) ([]float32, error) {
	numSamples := len(inSamples) / 2
	outSamples := make([]float32, numSamples)
	for i := 0; i < numSamples; i++ {
		// Decode directly instead of binary.Read: no reflection, no
		// per-sample reader allocation, and the conversion cannot fail.
		s16 := int16(binary.LittleEndian.Uint16(inSamples[i*2:]))
		outSamples[i] = float32(s16) / 32768
	}
	return outSamples, nil
}
// HandleGenerateAudio synthesizes text into a wav file under the OS temp
// directory and returns the file path. The caller owns the file and should
// remove it when done.
func (s *SherpaOnnx) HandleGenerateAudio(ctx context.Context, text string) (result string, err error) {
	// Generate the audio (speed factor fixed at 1.0).
	audioResult := s.tts.Generate(text, s.cfg.Sid, 1.0)
	// BUG FIX: the previous fixed name "generate_wav.wav" made concurrent
	// or repeated calls overwrite each other's output; make it unique.
	tmpFile := path.Join(os.TempDir(),
		fmt.Sprintf("generate_wav_%d.%s", time.Now().UnixNano(), supportAudioType))
	if ok := audioResult.Save(tmpFile); !ok {
		err = fmt.Errorf("save audio failed")
		return
	}
	result = tmpFile
	return
}
3. 复制如下代码到 /your_path/main.go
package main
import (
"bytes"
"context"
"fmt"
"io"
"os"
"sherpa-demo/sherpa"
)
// main runs a round trip through the demo: recognize a sample wav shipped
// with the streaming zipformer model, print the text, then synthesize a
// short Chinese sentence back to a wav file.
func main() {
	testwav, err := os.Open("./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav")
	if err != nil {
		// BUG FIX: removed the unreachable `return` that followed panic.
		panic(fmt.Sprintf("open wav: %s", err.Error()))
	}
	defer testwav.Close()
	bs, err := io.ReadAll(testwav)
	if err != nil {
		// BUG FIX: this error was previously discarded with `_`.
		panic(fmt.Sprintf("read wav: %s", err.Error()))
	}
	audio := &sherpa.RecognitionAudio{
		Buffer:    bytes.NewBuffer(bs),
		AudioType: "wav",
	}
	sherpaInstance := sherpa.NewSherpa(&sherpa.SherpaConfig{
		Decoder:         "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx",
		Encoder:         "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx",
		Joiner:          "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx",
		Tokens:          "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt",
		CtTransformer:   "./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx",
		VitsModel:       "./sherpa-onnx-vits-zh-ll/model.onnx",
		VitsDictDir:     "./sherpa-onnx-vits-zh-ll/dict",
		VitsLexicon:     "./sherpa-onnx-vits-zh-ll/lexicon.txt",
		VitsTokens:      "./sherpa-onnx-vits-zh-ll/tokens.txt",
		VitsLengthScale: 1,
		Sid:             0,
		TtsNumThreads:   1,
	})
	result, err := sherpaInstance.HandleRecognize(context.Background(), audio)
	if err != nil {
		panic(fmt.Errorf("handle recognize:%s", err.Error()))
	}
	println(fmt.Sprintf("result: %s", result))
	wavName, err := sherpaInstance.HandleGenerateAudio(context.Background(), "今天是周一,明天是周二")
	// BUG FIX: this error was previously ignored.
	if err != nil {
		panic(fmt.Errorf("handle generate audio:%s", err.Error()))
	}
	println(fmt.Sprintf("wav file name:%s", wavName))
}
输出
Initializing recognizer (may take several seconds)
Recognizer created!
Initializing tts (may take several seconds)
Tts created!
result: 昨天是MONDAY TODAY IS LIBR THE DAY AFTER TOMORROW是星期三。
wav file name:/var/folders/m_/76_5t5ws7zzc2bn3d30fq6s40000gn/T/*.wav
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通