使用sherpa + go 来做语音识别

参考

1.预训练模型:https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
2.sherpa-onnx介绍 https://k2-fsa.github.io/sherpa/onnx/index.html
3. go api 介绍 https://k2-fsa.github.io/sherpa/onnx/go-api/index.html

术语解释

在线、离线、流式 streaming、非流式 non-streaming 的区别

在线 等同于流式,离线 等同于非流式。

在线 即流式,是边说边识别;响应速度快、延迟小。

离线 即非流式,是把所有待识别的数据,一次性送给模型;特点是需要 等待所有的数据都到齐, 然后才能开始识别。

语音转文字

语音转文字涉及到2个步骤:识别为文字、加入标点符号

测试流程

如果需要在win系统运行,请在wsl内运行!

1. 下载如下 3 个模型包到 /your_path

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2

tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/punctuation-models/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2

tar xvf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
rm sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
rm sherpa-onnx-vits-zh-ll.tar.bz2

2. 复制如下代码到 /your_path/sherpa/sherpa.go

package sherpa

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
	"github.com/youpy/go-wav"
	"io"
	"os"
	"os/exec"
	"path"
	"strings"
)

const (
	// Only wav input is decoded directly; any other format is converted
	// to wav via ffmpeg first (see HandleRecognize).
	supportAudioType   = "wav"
	// Argument template for the ffmpeg conversion, e.g.:
	// ffmpeg -i test.amr -y -ar 22050 test.wav
	// NOTE(review): this resamples to 22050 Hz while the recognizer's
	// FeatConfig declares 16000 Hz — confirm sherpa-onnx resamples internally.
	CMD_AMR_TO_WAV   = "-i %s -y -ar 22050 %s"
)

// SherpaOnnx bundles the three sherpa-onnx engines used by this package —
// a streaming (online) speech recognizer, an offline punctuation model and
// an offline TTS engine — plus the ffmpeg executable used to convert
// non-wav input.
type SherpaOnnx struct {
	cfg               *SherpaConfig
	recognizer        *sherpa.OnlineRecognizer   // streaming speech-to-text
	tts               *sherpa.OfflineTts         // text-to-speech (VITS)
	offlinePuncuation *sherpa.OfflinePunctuation // adds punctuation to transcripts
	ffmpeg            string                     // name or path of the ffmpeg executable
}

// SherpaConfig holds the model file paths and runtime options for the
// recognizer, the punctuation model and the TTS engine. Paths must point
// into the extracted model packages.
type SherpaConfig struct {
	// Speech-to-text: streaming transducer model files.
	Decoder string
	Encoder string
	Joiner  string
	Tokens  string

	// Punctuation: ct-transformer model file.
	CtTransformer string

	// TTS: VITS model files and synthesis options.
	VitsModel       string
	VitsDictDir     string
	VitsLexicon     string
	VitsTokens      string
	VitsLengthScale float32 // VITS length_scale — presumably >1 slows speech; confirm
	Sid             int     // speaker id passed to tts.Generate
	TtsNumThreads   int
}

// RecognitionAudio is an in-memory audio payload to be recognized.
type RecognitionAudio struct {
	Buffer    *bytes.Buffer // raw audio file bytes
	AudioType string        // file extension without the dot, e.g. "wav" or "amr"
}

// NewSherpa loads the streaming recognizer, the punctuation model and the
// TTS engine described by cfg and returns a ready-to-use SherpaOnnx.
// Model loading may take several seconds.
func NewSherpa(cfg *SherpaConfig) *SherpaOnnx {
	sherpaInstance := &SherpaOnnx{
		cfg: cfg,
		// Bug fix: ffmpeg was never initialized, so the non-wav conversion
		// in HandleRecognize always executed an empty command and failed.
		// exec.Command resolves a bare name via PATH.
		ffmpeg: "ffmpeg",
	}

	// Streaming recognizer: 16 kHz features, greedy-search decoding on CPU.
	config := sherpa.OnlineRecognizerConfig{}
	config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}
	config.ModelConfig = sherpa.OnlineModelConfig{
		Transducer: sherpa.OnlineTransducerModelConfig{
			Encoder: cfg.Encoder,
			Decoder: cfg.Decoder,
			Joiner:  cfg.Joiner,
		},
		Tokens:     cfg.Tokens,
		NumThreads: 1,
		Provider:   "cpu",
	}
	config.DecodingMethod = "greedy_search"
	config.MaxActivePaths = 4

	soopmc := sherpa.OfflinePunctuationModelConfig{
		CtTransformer: cfg.CtTransformer,
	}
	println("Initializing recognizer (may take several seconds)")
	recognizer := sherpa.NewOnlineRecognizer(&config)
	sherpaInstance.recognizer = recognizer
	println("Recognizer created!")
	soopc := sherpa.OfflinePunctuationConfig{
		Model: soopmc,
	}
	sherpaInstance.offlinePuncuation = sherpa.NewOfflinePunctuation(&soopc)

	println("Initializing tts (may take several seconds)")
	ttsConfig := sherpa.OfflineTtsConfig{
		Model: sherpa.OfflineTtsModelConfig{
			Vits: sherpa.OfflineTtsVitsModelConfig{
				Model:       cfg.VitsModel,
				Lexicon:     cfg.VitsLexicon,
				Tokens:      cfg.VitsTokens,
				NoiseScale:  0.667,
				NoiseScaleW: 0.8,
				LengthScale: cfg.VitsLengthScale,
				DictDir:     cfg.VitsDictDir,
			},
			NumThreads: cfg.TtsNumThreads,
			Debug:      0,
			Provider:   "cpu",
		},
		MaxNumSentences: 1,
	}
	sherpaInstance.tts = sherpa.NewOfflineTts(&ttsConfig)
	println("Tts created!")
	return sherpaInstance
}

// HandleRecognize transcribes the audio payload into text and appends
// punctuation. Non-wav input is converted to wav with ffmpeg first.
// It returns the punctuated transcript, or an error if temp-file handling,
// the ffmpeg conversion, or wav decoding fails. ctx is currently unused
// by the decoding loop.
func (s *SherpaOnnx) HandleRecognize(ctx context.Context, audio *RecognitionAudio) (textResult string, err error) {
	// 1. Persist the raw payload to a temp file so ffmpeg / the wav
	// decoder can work on it.
	audioFile, err := os.CreateTemp(os.TempDir(), "*."+audio.AudioType)
	if err != nil {
		return
	}
	defer os.Remove(audioFile.Name())
	defer audioFile.Close() // bug fix: the handle was previously leaked

	if _, err = io.Copy(audioFile, audio.Buffer); err != nil {
		return
	}

	// 2. Convert to wav when the input is another format (e.g. amr).
	if audio.AudioType != supportAudioType {
		tmpWav, cerr := os.CreateTemp(os.TempDir(), "*."+supportAudioType)
		if cerr != nil {
			// Bug fix: this error was dropped, returning ("", nil).
			err = cerr
			return
		}
		defer os.Remove(tmpWav.Name())
		defer tmpWav.Close()

		args := fmt.Sprintf(CMD_AMR_TO_WAV, audioFile.Name(), tmpWav.Name())
		if _, cerr = exec.Command(s.ffmpeg, strings.Split(args, " ")...).CombinedOutput(); cerr != nil {
			err = cerr
			return
		}
		audioFile = tmpWav
	}

	// Bug fix: io.Copy left the file offset at EOF, and the wav reader
	// consumes from the current position — rewind before decoding.
	if _, err = audioFile.Seek(0, io.SeekStart); err != nil {
		return
	}

	// 3. Feed the samples through the online recognizer.
	stream := sherpa.NewOnlineStream(s.recognizer)
	defer sherpa.DeleteOnlineStream(stream)

	samples, sampleRate, err := readWave(ctx, audioFile)
	if err != nil {
		return
	}
	stream.AcceptWaveform(sampleRate, samples)

	// 0.3 s of trailing silence so the final words are flushed from the model.
	tailPadding := make([]float32, int(float32(sampleRate)*0.3))
	stream.AcceptWaveform(sampleRate, tailPadding)

	for s.recognizer.IsReady(stream) {
		s.recognizer.Decode(stream)
	}

	// 4. Collect the transcript and add punctuation.
	textResult = s.recognizer.GetResult(stream).Text
	textResult = s.offlinePuncuation.AddPunct(textResult)
	return
}

// readWave decodes a 16-bit mono PCM wav file into float32 samples
// normalized to [-1, 1) and reports the file's sample rate.
// ctx is accepted for signature symmetry but is not currently consulted.
func readWave(ctx context.Context, file *os.File) (samples []float32, sampleRate int, err error) {
	reader := wav.NewReader(file)
	format, err := reader.Format()
	if err != nil {
		err = fmt.Errorf("reading wave format: %w", err)
		return
	}

	// The sample decoder below only understands uncompressed mono 16-bit PCM.
	if format.AudioFormat != 1 {
		err = fmt.Errorf("support only PCM format, given: %v", format.AudioFormat)
		return
	}
	if format.NumChannels != 1 {
		err = fmt.Errorf("support only 1 channel wave file, given: %v", format.NumChannels)
		return
	}
	if format.BitsPerSample != 16 {
		err = fmt.Errorf("support only 16-bit per sample, given: %v", format.BitsPerSample)
		return
	}

	reader.Duration() // side effect: initializes reader.Size

	// Bug fix: a single Read call may legally return fewer bytes than
	// requested; io.ReadFull keeps reading until the buffer is filled
	// (and surfaces the underlying error instead of masking it).
	buf := make([]byte, reader.Size)
	if _, err = io.ReadFull(reader, buf); err != nil {
		err = fmt.Errorf("reading %v sample bytes: %w", reader.Size, err)
		return
	}

	samples, err = samplesInt16ToFloat(buf)
	if err != nil {
		return
	}
	sampleRate = int(format.SampleRate)

	return
}

// samplesInt16ToFloat converts little-endian 16-bit PCM bytes into
// float32 samples normalized to [-1, 1). It returns an error if the
// input length is not a multiple of 2 (previously a trailing odd byte
// was silently dropped).
func samplesInt16ToFloat(inSamples []byte) ([]float32, error) {
	if len(inSamples)%2 != 0 {
		return nil, fmt.Errorf("sample data length %d is not a multiple of 2", len(inSamples))
	}

	numSamples := len(inSamples) / 2
	outSamples := make([]float32, numSamples)

	// Decode directly with binary.LittleEndian instead of per-sample
	// binary.Read, which goes through reflection on this hot path.
	for i := 0; i < numSamples; i++ {
		s16 := int16(binary.LittleEndian.Uint16(inSamples[i*2:]))
		outSamples[i] = float32(s16) / 32768
	}

	return outSamples, nil
}

// HandleGenerateAudio synthesizes text into a wav file via the VITS TTS
// engine and returns the path of the generated file. The caller owns the
// file (and its directory) and should remove them when done.
// ctx is currently unused.
func (s *SherpaOnnx) HandleGenerateAudio(ctx context.Context, text string) (result string, err error) {
	// Synthesize with the configured speaker id at normal speed (1.0).
	audioResult := s.tts.Generate(text, s.cfg.Sid, 1.0)

	// Bug fix: the output previously used a fixed name directly under
	// os.TempDir(), so concurrent or repeated calls overwrote each
	// other's files. A per-call unique directory avoids collisions.
	tmpDir, err := os.MkdirTemp(os.TempDir(), "sherpa_tts_")
	if err != nil {
		return
	}
	tmpFile := path.Join(tmpDir, "generate_wav."+supportAudioType)
	if ok := audioResult.Save(tmpFile); !ok {
		err = fmt.Errorf("save audio failed")
		return
	}
	result = tmpFile
	return
}

3. 复制如下代码到 /your_path/main.go

package main

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"sherpa-demo/sherpa"
)

// main runs a round-trip demo: recognize a bundled sample wav into
// punctuated text, then synthesize a short sentence back into a wav file.
func main() {
	testwav, err := os.Open("./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav")
	if err != nil {
		// Bug fix: removed the unreachable return that followed panic.
		panic(fmt.Sprintf("open wav: %s", err.Error()))
	}
	defer testwav.Close()

	// Bug fix: the read error was previously discarded with _.
	bs, err := io.ReadAll(testwav)
	if err != nil {
		panic(fmt.Sprintf("read wav: %s", err.Error()))
	}
	audio := &sherpa.RecognitionAudio{
		Buffer:    bytes.NewBuffer(bs),
		AudioType: "wav",
	}

	// All model paths point into the packages extracted in the setup step.
	sherpaInstance := sherpa.NewSherpa(&sherpa.SherpaConfig{
		Decoder:         "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx",
		Encoder:         "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx",
		Joiner:          "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx",
		Tokens:          "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt",
		CtTransformer:   "./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx",
		VitsModel:       "./sherpa-onnx-vits-zh-ll/model.onnx",
		VitsDictDir:     "./sherpa-onnx-vits-zh-ll/dict",
		VitsLexicon:     "./sherpa-onnx-vits-zh-ll/lexicon.txt",
		VitsTokens:      "./sherpa-onnx-vits-zh-ll/tokens.txt",
		VitsLengthScale: 1,
		Sid:             0,
		TtsNumThreads:   1,
	})

	result, err := sherpaInstance.HandleRecognize(context.Background(), audio)
	if err != nil {
		panic(fmt.Errorf("handle recognize:%s", err.Error()))
	}
	println(fmt.Sprintf("result: %s", result))

	wavName, err := sherpaInstance.HandleGenerateAudio(context.Background(), "今天是周一,明天是周二")
	// Bug fix: this error was previously never checked.
	if err != nil {
		panic(fmt.Errorf("handle generate audio:%s", err.Error()))
	}
	println(fmt.Sprintf("wav file name:%s", wavName))
}

输出

Initializing recognizer (may take several seconds)
Recognizer created!
Initializing tts (may take several seconds)
Tts created!
result: 昨天是MONDAY TODAY IS LIBR THE DAY AFTER TOMORROW是星期三。
wav file name:/var/folders/m_/76_5t5ws7zzc2bn3d30fq6s40000gn/T/*.wav

posted @   bytehello  阅读(220)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通
点击右上角即可分享
微信分享提示