七种python读取语音文件的方法

本文主要记录 Python 读取 wav 文件的常用第三方库及其优缺点对比。以一段采样率 16 kHz、时长 4.99 秒的单声道测试语音为例，音频文件读取后主要有以下几种形式：

#格式一：列表 [-0.00015259 -0.00021362 -0.00021362 -0.00027466 -0.00015259] float32
#格式二：列表 [-5, -7, -7, -9, -5] int16
#格式三：字节流 b'\xfb\xff\xf9\xff\xf9'   pcm编码格式
#格式四：字节流带文件头 b'RIFF\xb2\xbb\x00\x00WAVEfmt \x12\x00\x00\x00...' pcm编码格式
#格式五：字符串 'UklGRhIdAwBXQVZFZm10IBIAAAADAA' 通常是base64编码后的信息

下面介绍7种python读取语音文件的方法；

1、librosa

该方法较为经典，缺点是 librosa.load 默认会把音频重采样到 22050 Hz，因此需要显式指定采样率（传入 sr=None 可保留文件原始采样率）；mono 默认是 True，双轨录音需注意改为 False

import librosa
# Path of the test recording; the later snippets reuse this name.
filepath = './zh.wav'
# Load at 16 kHz; mono=False keeps the channels of a multi-track file separate.
wav_list, sr = librosa.load(filepath, sr=16000, mono=False)
print(wav_list[:5])
# [-0.00015259 -0.00021362 -0.00021362 -0.00027466 -0.00015259]

2、torchaudio

该方法在语音领域较为成熟，可读取采样率，返回tensor，方便后期进行特征抽取，也自带重采样模块

import torchaudio
# Example settings for the resample/save steps below; adjust to your needs.
target_sample_rate = 8000
output_file = './zh_resampled.wav'

# load() returns the waveform tensor and the file's sample rate.
# With normalize=False the int16 sample values are returned instead of floats.
wav_list, sr = torchaudio.load(filepath, normalize=True)
print(wav_list[0][:5])
# First five samples of channel 0, approximately:
# [-0.00015259 -0.00021362 -0.00021362 -0.00027466 -0.00015259]

# Resample from the file's rate to the target rate.
resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
resampled_waveform = resampler(wav_list)

# Write the resampled audio back to disk.
torchaudio.save(output_file, resampled_waveform, target_sample_rate)

# Extract fbank features from the waveform at its original rate.
import torchaudio.compliance.kaldi as kaldi
mat = kaldi.fbank(
    waveform=wav_list,
    num_mel_bins=23,
    frame_length=25,
    frame_shift=10,
    dither=0.0,
    energy_floor=0.0,
    sample_frequency=sr,  # the rate returned by torchaudio.load above
)

3、soundfile

无需知道采样率，采样率是返回值

import soundfile
# Read as float32; the sample rate comes back as the second return value.
wav_list, sr = soundfile.read(filepath, dtype='float32')
print("sr:" + str(sr))
print(wav_list[:5])
# [-0.00015259 -0.00021362 -0.00021362 -0.00027466 -0.00015259]

# Read the int16 samples and scale them by 2**15 by hand.
wav_list2, sr = soundfile.read(filepath, dtype='int16')
print("sr:" + str(sr))
wav_list = wav_list2 / 2**15
print(wav_list[:5])
# [-0.00015259 -0.00021362 -0.00021362 -0.00027466 -0.00015259]

# Write back with the rate read from the file rather than a hard-coded 16000,
# so a recording at another rate is not mislabeled.
soundfile.write('zh1.wav', wav_list, sr, subtype='PCM_16')

4、wave

该方法能返回最多语音相关信息

import wave
import numpy as np
# Read the stream parameters and every frame; the context manager closes the
# reader, which the bare wave.open() call never did.
with wave.open(filepath, 'rb') as fp:
    params = fp.getparams()
    nchannels, sampwidth, sr, nframes = params[:4]
    byte_data = fp.readframes(nframes)  # raw PCM bytes, no file header
print("音轨数：" + str(nchannels))
print("位宽：" + str(sampwidth))
print("采样率：" + str(sr))
# nframes is the frame count; len(byte_data) is the byte count
# (frames * channels * sample width), so the two are labeled separately.
print("帧数：" + str(nframes))
print("字节数：" + str(len(byte_data)))
# Interpret the bytes as int16 and scale by 2**15 (assumes sampwidth == 2).
wav_list = np.frombuffer(byte_data, dtype=np.int16).astype(np.float32) / 2**15
print(byte_data[:5])
print(wav_list[:5])

# 音轨数：1
# 位宽：2
# 采样率：16000
# 帧数：79949
# 字节数：159898
# b'\xfb\xff\xf9\xff\xf9'
# [-0.00015259 -0.00021362 -0.00021362 -0.00027466 -0.00015259]


# Write the PCM bytes back out, reusing the parameters read above instead of
# hard-coded values.
with wave.open('zh2.wav', 'wb') as wf:
    wf.setnchannels(nchannels)
    wf.setsampwidth(sampwidth)
    wf.setframerate(sr)
    wf.writeframes(byte_data)

5、open

该方法缺点是需要先读取成byte数据然后剔除文件头，再转换成语音list；下面的代码假设文件头固定为44字节，若wav文件含有扩展的fmt块或其他附加块，该偏移量需相应调整

# Peek at the first 20 bytes only; the handle is closed when the block exits.
with open(filepath, 'rb') as f:
    print("文件头内容：" + str(f.read(20)))  # bytes including the file header
# Load the whole file, header included, as int16 words.
with open(filepath, 'rb') as f:
    data = np.fromfile(f, dtype=np.int16)
# Drop the first 22 int16 words (44 bytes); this assumes a 44-byte header.
wav_list = data[22:]
wav_list = wav_list / 2**15
print(wav_list[:5])
# 文件头内容：b'RIFF\xbep\x02\x00WAVEfmt \x10\x00\x00\x00'
# [-0.00015259 -0.00021362 -0.00021362 -0.00027466 -0.00015259]

#如若已知文件头信息，可使用以下函数复原
import io
def audiobytes_header(audio_bytes, sample_rate=16000, fileheader=False,
                      nchannels=1, sampwidth=2):
    """Return ``audio_bytes`` as a complete WAV byte string.

    Args:
        audio_bytes: Audio data as bytes; headerless PCM frames unless
            ``fileheader`` is true.
        sample_rate: Frame rate written into the header.
        fileheader: Set to True when ``audio_bytes`` already carries a file
            header; the input is then returned unchanged.
        nchannels: Channel count written into the header (default 1).
        sampwidth: Sample width in bytes written into the header (default 2).

    Returns:
        The WAV file content as bytes.
    """
    if fileheader:
        return audio_bytes

    buffer = io.BytesIO()
    with wave.open(buffer, mode='wb') as waveobj:
        waveobj.setnchannels(nchannels)
        waveobj.setframerate(sample_rate)
        waveobj.setsampwidth(sampwidth)
        waveobj.setcomptype('NONE', 'NONE')
        waveobj.writeframes(audio_bytes)
    # Read the buffer only after the writer has been closed, so the header
    # fields are final.
    return buffer.getvalue()
    
#base64的编码与解码
import base64
# Read the whole file (header included); the handle is closed on exit.
with open(filepath, 'rb') as f:
    input_bytes = f.read()
# base64 is an encoding, not encryption: bytes -> base64 bytes -> str.
base64_bytes = base64.b64encode(input_bytes)
audio = base64_bytes.decode('utf-8')

# Reverse direction: str -> base64 bytes -> original file bytes.
base64_bytes = audio.encode('utf-8')
byte_file = base64.b64decode(base64_bytes)
# Skip the first 44 bytes, which assumes a 44-byte file header.
wav_list = np.frombuffer(byte_file[44:], dtype=np.int16).astype(np.float32) / 2**15
print(wav_list[:5])

6、scipy

import scipy.io.wavfile as wavfile
# wavfile.read returns the sample rate first, then the sample array.
sr, data = wavfile.read(filepath)
wav_list = data / 2**15  # scaling assumes int16 samples
print("采样率为：" + str(sr))
print(wav_list[:5])

# 采样率为：16000
# [-0.00015259 -0.00021362 -0.00021362 -0.00027466 -0.00015259]

# Write back with the rate read above.
audio_array = np.array(data, dtype=np.int16)
wavfile.write('output.wav', sr, audio_array)

7、pydub

from pydub import AudioSegment
audio = AudioSegment.from_file(filepath)
sr = audio.frame_rate
data = audio.get_array_of_samples()
# Scale by 2**15, which assumes 16-bit samples.
wav_list = [i / 2**15 for i in data]
print("采样率:" + str(sr))
print(wav_list[:5])
# 采样率:16000
# [-0.000152587890625, -0.000213623046875, -0.000213623046875, -0.000274658203125, -0.000152587890625]

# Build a new AudioSegment from the samples, taking the format parameters
# from the segment loaded above.
audio_segment = AudioSegment(
    data=bytes(data),
    sample_width=audio.sample_width,
    frame_rate=sr,
    channels=audio.channels,
)

# Write the audio data to a file.
audio_segment.export('output.wav', format='wav')

posted @ 2023-08-23 19:03 glowwormss 阅读(593) 评论(0) 编辑收藏举报

刷新页面返回顶部

永远抽象派