MFCC特征提取(C语言版本)
音频分析中,MFCC参数是经典参数之一。之前对于它的计算流程和原理,大体上是比较清楚的,所以仿真的时候,都是直接调用matlab的voicebox工具或者开发的时候直接调用第三方库。最近想整理一个纯C语言版本的MFCC函数,发现第三方开源的一部分是C++的,有些纯C的开源代码是针对语音固定了某些参数,不太灵活。干脆自己动手写一下,发现matlab写习惯了,都弱化了写C的思维,磕磕碰碰弄了2天,初版总算是完成了。
计算的大体流程:预加重->分帧->加窗->FFT->能量->Mel滤波器组滤波->log->dct解卷积->倒谱提升->差分。
在编码的过程中碰到了一些小问题,跟大家分享一下。
问题1:预加重也就高通滤波,它是针对整段语音的。而如果先进行分帧,然后按帧预加重,这种做法比较符合实时流,一帧一帧的实时处理。但本处实现是针对整段语音,
做预加重,然后一帧一帧的输出特征。
问题2:Mel尺度转换,两个公式都可以。
问题3:能量
幅度谱和用幅度谱的平方都可以。
问题4:相关公式
DCT变换,有好几个形式,见wiki。
http://en.wikipedia.org/wiki/Discrete_cosine_transform。
此处用到:
Mel滤波器组: 三角滤波器
倒谱提升窗:
w=1+0.5*K*sin(pi*m/K); 1<=m<=K
差分计算: 也有不同的形式,但是基本上是一致的,只是系数倍数不同而已。
差分计算比较麻烦,涉及到前后好几帧,也有对此处简化的处理。
,有的此处分母是开方。
FFT变换:调用了第三方库FFTW,也可以直接使用轻量级的源码FFT计算方法。
头文件:
#ifndef MFCC_H #define MFCC_H #include <stdio.h> typedef struct AudioInfo { int sample_rate; int frame_len; int frame_shift; float* data_in; int data_len; AudioInfo():sample_rate(0),frame_len(0),frame_shift(0),data_in(NULL),data_len(0){};//构造函数注释掉,变成纯C版本 }; typedef struct MelBankInfo { float **filter; int nfilters; int nfft; int low; int high; MelBankInfo():filter(NULL),nfilters(0),nfft(0),low(0),high(0){}; }; typedef struct DctInfo { float **coeff; int dctlen; DctInfo():coeff(NULL),dctlen(0){}; }; typedef enum MFCC_TYPE { MFCC_STD, MFCC_DIFF_1, MFCC_DIFF_2, }; typedef struct MfccInfo { MelBankInfo melbank; DctInfo dct; int nframes; int out_nframes;//可输出的特征数 float *frame_data; float *data_in; float *window; float *lift_window; int frame_len; int frame_shift; float *pre1; float *pre2; float *cur; float *next1; float *next2; float *diff_pre1; float *diff_pre2; float *diff_cur; float *diff_next1; float *diff_next2; MFCC_TYPE m_type; MfccInfo():nframes(0),out_nframes(0),window(NULL),lift_window(NULL),cur(NULL),frame_len(0),pre1(NULL),pre2(NULL),next1(NULL),next2(NULL),m_type(MFCC_STD),frame_data(NULL),frame_shift(0){}; }; MfccInfo* MfccInit(AudioInfo audioctx, int nfft, int low, int high, int nfilters, int ndcts, MFCC_TYPE type); int Mfcc_Frame_std(MfccInfo *p, int iframe, float *out, int len); int Mfcc_Frame_diff1(MfccInfo *p, int iframe, float *out, int len); int Mfcc_Frame_diff2(MfccInfo *p, int iframe, float *out, int len); void MfccDestroy(MfccInfo *data); #endif 源文件: #include "mfcc.h" #include "Spl.h" #include "fftw3.h" #include <malloc.h> #include <math.h> #include <string.h> #include <assert.h> #define PI 3.1415926 #define EPS 0.0000001 #pragma comment(lib, "libfftw3f-3.lib") using namespace std; void PreEmphasise(const float *data, int len, float *out, float preF)//预加重 { for(int i = len - 1; i <= 1; i--) { out[i] = data[i] - preF * data[i-1]; } out[0] = data[0]; } float HzToMel(float f) { return 1127*log(1.0 + f/700); } float MelToHz(float data) { return 700 * (exp(data/1127) - 1); } int HzToN(float f, int fs, int nfft) { return f/fs *nfft+1; } void MelBank( int fs, int nfft, int low, int high, int nfilters, float** coeff )//三角滤波器组。 { float fre_bin = (float)fs / nfft; float low_mel = HzToMel(low); float high_mel = HzToMel(high); float mel_bw = (high_mel - low_mel)/(nfilters + 1); int valid_nfft = nfft/2 + 1; for(int j = 1; j <= nfilters; j++) { float mel_cent = j * mel_bw + low_mel; float mel_left = mel_cent - mel_bw; float mel_right = mel_cent + mel_bw; float freq_cent = MelToHz(mel_cent); float freq_left = MelToHz(mel_left); float freq_bw_left = freq_cent - freq_left; float freq_right = MelToHz(mel_right); float freq_bw_right = freq_right - freq_cent; for(int i = 1; i <= valid_nfft; i++) { float freq = (i-1) * fre_bin ; if( freq > freq_left && freq < freq_right ) { if( freq <= freq_cent) { coeff[j-1][i-1] = (freq - freq_left) / freq_bw_left; } else { coeff[j-1][i-1] = (freq_right - freq) / freq_bw_right; } } } } } void DctCoeff( int m, int n, float** coeff )//标准DCT变换。 { for( int i = 1; i <= m; i++) { for(int j = 0; j < n; j++) { coeff[i-1][j] = cos( (2*j + 1) * i *PI / (2 * n)); } } } void lift_window(float* p, int m)//倒谱提升窗归一化。 { float max_value = 0.0f; for(int i = 1; i <= m; i++) { p[i-1] = 1+ 0.5 * m * sin( PI * i/m ); if( p[i-1] > max_value) { max_value = p[i-1]; } } for(int i = 1; i <= m; i++) { p[i-1] /= max_value; } } float Product(float *data1, float* data2, int len) { float result = 0.0; for(int i = 0; i < len; i++) { result += data1[i] * data2[i]; } return result; } float** MallocMatrix(int m, int n) { float **in = (float**)malloc(m * sizeof(float*)); float* data = (float*)malloc( m*n*sizeof(float)); memset( data, 0, sizeof(float)*m*n ); for(int i = 1; i <= m; i++) { in[i-1] = &data[(i-1)*n]; } return in; } void FreeMatrix(float **in) { float *data = *in; if(data != NULL) { free(data); } if(in != NULL) { free(in); } } int Mfcc_Frame_diff1_temp(MfccInfo *p, int iframe, float *out, int len); //初始化,预加重,获取滤波器组系数,DCT系数,倒谱提升窗系数等。 MfccInfo* MfccInit(AudioInfo audioctx, int nfft, int low, int high, int nfilters, int ndcts, MFCC_TYPE type) { MfccInfo* p = (MfccInfo*)malloc(sizeof(MfccInfo)); p->melbank.nfft = nfft; p->melbank.low = low; p->melbank.high = high; p->melbank.nfilters = nfilters; p->dct.dctlen = ndcts; p->pre1 = NULL; p->pre2 = NULL; p->cur = NULL; p->next1 = NULL; p->next2 = NULL; p->m_type = type; p->data_in = audioctx.data_in;//整段语音的数据流 p->frame_shift = audioctx.frame_shift; int valid_nfft = nfft/2 + 1; p->melbank.filter = MallocMatrix( nfilters, valid_nfft); MelBank( audioctx.sample_rate, nfft, low, high, nfilters, p->melbank.filter);//Mel滤波器系数 p->dct.coeff = MallocMatrix( ndcts, nfilters); DctCoeff( ndcts, nfilters, p->dct.coeff );//DCT系数 float preF = 0.9375; //整段语音高通滤波,预加重 PreEmphasise( audioctx.data_in, audioctx.data_len, audioctx.data_in, preF); int nframes = (audioctx.data_len - audioctx.frame_len)/audioctx.frame_shift + 1; p->nframes = nframes; p->out_nframes = nframes; p->frame_len = audioctx.frame_len; p->window = (float*) malloc( audioctx.frame_len * sizeof(float)); hamming( p->window, audioctx.frame_len);//加窗 p->lift_window = (float*)malloc( ndcts * sizeof(float)); lift_window(p->lift_window, ndcts);//倒谱提升窗 int buffer_len = audioctx.frame_len > nfft ? audioctx.frame_len:nfft; p->frame_data = (float*) malloc( buffer_len * sizeof(float)); switch(type) { case MFCC_DIFF_1: { p->out_nframes = nframes - 4; p->pre1 = (float*)malloc( ndcts*sizeof(float)); p->pre2 = (float*)malloc( ndcts*sizeof(float)); p->cur = (float*)malloc( ndcts*sizeof(float)); p->next1 = (float*)malloc( ndcts*sizeof(float)); p->next2 = (float*)malloc( ndcts*sizeof(float)); Mfcc_Frame_std(p, 1, p->pre1, ndcts); Mfcc_Frame_std(p, 2, p->pre2, ndcts); Mfcc_Frame_std(p, 3, p->cur, ndcts); Mfcc_Frame_std(p, 4, p->next1, ndcts); //一阶差分需要相邻两帧数据 break; } case MFCC_DIFF_2: { p->out_nframes = nframes - 8; p->pre1 = (float*)malloc( ndcts*sizeof(float)); p->pre2 = (float*)malloc( ndcts*sizeof(float)); p->cur = (float*)malloc( ndcts*sizeof(float)); p->next1 = (float*)malloc( ndcts*sizeof(float)); p->next2 = (float*)malloc( ndcts*sizeof(float)); Mfcc_Frame_std(p, 1, p->pre1, ndcts); Mfcc_Frame_std(p, 2, p->pre2, ndcts); Mfcc_Frame_std(p, 3, p->cur, ndcts); Mfcc_Frame_std(p, 4, p->next1, ndcts); //一阶差分需要相邻两帧数据 p->diff_pre1 = (float*)malloc( ndcts*sizeof(float)); p->diff_pre2 = (float*)malloc( ndcts*sizeof(float)); p->diff_cur = (float*)malloc( ndcts*sizeof(float)); p->diff_next1 = (float*)malloc( ndcts*sizeof(float)); p->diff_next2 = (float*)malloc( ndcts*sizeof(float)); Mfcc_Frame_diff1_temp(p,1,p->diff_pre1,ndcts); Mfcc_Frame_diff1_temp(p,2,p->diff_pre2,ndcts); Mfcc_Frame_diff1_temp(p,3,p->diff_cur,ndcts); Mfcc_Frame_diff1_temp(p,4,p->diff_next1,ndcts);//二阶差分需要相邻一阶差分数据 } } return p; } int Mfcc_Frame_std(MfccInfo *p, int iframe, float *out, int len)//输出mfcc,任意帧输出 { if(iframe > p->nframes) { return -1; } memcpy(p->frame_data, p->data_in + (iframe - 1) * p->frame_shift, sizeof(float) * p->frame_len); apply_window( p->frame_data, p->window, p->frame_len); int nfft = p->melbank.nfft; int valid_nfft = nfft/2 + 1; fftwf_plan r2cP; fftwf_complex* temp = (fftwf_complex*)fftwf_malloc(sizeof( fftwf_complex ) * valid_nfft); r2cP = fftwf_plan_dft_r2c_1d( p->frame_len, p->frame_data, temp, FFTW_ESTIMATE ); //完成FFT运算 fftwf_execute( r2cP ); for (int j = 0; j < valid_nfft; ++j) { p->frame_data[j] = pow( temp[j][0], 2 ) + pow( temp[j][1], 2 );//平方能量值,也可以用谱幅度值 } fftwf_destroy_plan( r2cP ); for(int i = 1; i <= p->dct.dctlen; i++) { float temp = 0.0; for(int j = 1; j <= p->melbank.nfilters; j++) { //DCT变换,解卷积 temp += p->dct.coeff[i-1][j-1] * log(Product(p->frame_data, p->melbank.filter[j-1], valid_nfft)+ EPS)/log(10.0); } out[i-1] = temp * p->lift_window[i-1];//倒谱提升 } fftwf_free(temp); return 0; } int Mfcc_Frame_diff1(MfccInfo *p, int iframe, float *out, int len)//标准一阶差分,输出 mfcc + 一阶差分。 逐帧输出 { assert(p->nframes >= 5 && iframe <= p->nframes -4 && p->m_type == MFCC_DIFF_1); int ret = Mfcc_Frame_std(p, iframe + 4, p->next2, len); int dctlen = p->dct.dctlen; memcpy( out, p->cur, sizeof(float)* dctlen);//mfcc float factor = sqrt(10.0); for(int i = 0; i < dctlen; i++) { out[i + dctlen] = (2 * p->next2[i] + p->next1[i] - 2*p->pre1[i] - p->pre2[i])/factor ;//一阶差分 } float *temp = p->pre1; p->pre1 = p->pre2; p->pre2 = p->cur; p->cur = p->next1; p->next1 = p->next2; p->next2 = temp; return ret; } int Mfcc_Frame_diff1_temp(MfccInfo *p, int iframe, float *out, int len)//输出一阶差分 { int ret = Mfcc_Frame_std(p, iframe + 4, p->next2, len); int dctlen = p->dct.dctlen; float factor = sqrt(10.0); for(int i = 0; i < dctlen; i++) { out[i] = (2 * p->next2[i] + p->next1[i] - 2*p->pre1[i] - p->pre2[i])/factor ;//一阶差分 } float *temp = p->pre1; p->pre1 = p->pre2; p->pre2 = p->cur; p->cur = p->next1; p->next1 = p->next2; p->next2 = temp; return ret; } int Mfcc_Frame_diff2(MfccInfo *p, int iframe, float *out, int len)//输出mfcc+1+2 { assert(p->nframes >= 9 && iframe <= p->nframes -8 && p->m_type == MFCC_DIFF_2); int ret = Mfcc_Frame_diff1_temp(p, iframe + 8, p->diff_next2, len); int dctlen = p->dct.dctlen; memcpy( out, p->next2, sizeof(float)* dctlen);//mfcc memcpy( out + dctlen, p->diff_cur, sizeof(float)* dctlen);//一阶差分 float factor = sqrt(10.0); for(int i = 0; i < dctlen; i++) { out[i + 2*dctlen] = (2 * p->diff_next2[i] + p->diff_next1[i] - 2*p->diff_pre1[i] - p->diff_pre2[i])/factor ;//二阶差分 } float *temp = p->diff_pre1; p->diff_pre1 = p->diff_pre2; p->diff_pre2 = p->diff_cur; p->diff_cur = p->diff_next1; p->diff_next1 = p->diff_next2; p->diff_next2 = temp; return ret; } void MfccDestroy(MfccInfo *data) { FreeMatrix(data->melbank.filter); FreeMatrix(data->dct.coeff); if(data->window) { free(data->window); data->window = NULL; } if(data->lift_window) { free(data->lift_window); data->lift_window = NULL; } if(data->pre1) { free(data->pre1); data->pre1 = NULL; } if(data->pre2) { free(data->pre2); data->pre2 = NULL; } if(data->cur) { free(data->cur); data->cur = NULL; } if(data->next1) { free(data->next1); data->next1 = NULL; } if(data->next2) { free(data->next2); data->next2 = NULL; } if(data->frame_data) { free(data->frame_data); data->frame_data = NULL; } if(data->diff_pre1) { free(data->pre1); data->pre1 = NULL; } if(data->diff_pre2) { free(data->pre2); data->pre2 = NULL; } if(data->diff_cur) { free(data->cur); data->cur = NULL; } if(data->diff_next1) { free(data->next1); data->next1 = NULL; } if(data->diff_next2) { free(data->next2); data->next2 = NULL; } } 其它文件: #include "Spl.h" #include <math.h> #define TWOPI 6.283185307179586 void hanning( float *win, int N) { int half = 0; if ( N % 2 == 0 ) { half = N / 2; for (int i = 1; i <= half; ++i) { win[i - 1] = 0.5 - 0.5*cos(TWOPI*i / (N + 1.0)); } int index = half + 1; for (int i = half; i >= 1; i--) { win[index - 1] = win[i - 1]; index++; } } else { half = (N + 1) / 2; for (int i = 1; i <= half; ++i) { win[i - 1] = 0.5 - 0.5*cos(TWOPI*i / (N + 1.0)); } int index = half + 1; for (int i = half-1; i >= 1; i--) { win[index - 1] = win[i - 1]; index++; } } } void hamming( float *win, int N) { int half = 0; if ( N % 2 == 0 ) { half = N / 2; for (int i = 1; i <= half; ++i) { win[i - 1] = 0.54 - 0.46*cos(TWOPI*i / (N + 1.0)); } int index = half + 1; for (int i = half; i >= 1; i--) { win[index - 1] = win[i - 1]; index++; } } else { half = (N + 1) / 2; for (int i = 1; i <= half; ++i) { win[i - 1] = 0.54 - 0.46*cos(TWOPI*i / (N + 1.0)); } int index = half + 1; for (int i = half-1; i >= 1; i--) { win[index - 1] = win[i - 1]; index++; } } } void apply_window(float* data, float* window, int window_len) { for(int i = 0; i< window_len; i++) { data[i] = data[i] * window[i]; } } 测试代码: #include "mfcc.h" #include <malloc.h> int main(int argc, char* argv[]) { int samples = 1024*4; float *data = (float*)malloc(samples * sizeof(float)); for(int i = 0; i < samples; i++) { data[i] = i; } AudioInfo audioctx; audioctx.data_in = data; audioctx.data_len = samples; audioctx.frame_len = 512; audioctx.frame_shift = 128; audioctx.sample_rate = 8000; int nfft = audioctx.frame_len; int low = 0; int high = audioctx.sample_rate/2; int nfilters = 24; int ndcts = 12; #if 0 MFCC_TYPE type = MFCC_STD; MfccInfo* p = MfccInit( audioctx, nfft, low, high, nfilters, ndcts, type); int len = ndcts * 1; float *out = (float*) malloc( sizeof(float)* (len)); for(int j = 0; j < p->out_nframes; j++)//能指定输出第i帧的mfcc { Mfcc_Frame_std(p, j+1, out, len); printf("第 %d 帧:\n", j+1); for(int i = 0; i < len; i++) { printf("%f ", out[i]); } printf("\n"); } #endif #if 0 MFCC_TYPE type = MFCC_DIFF_1; MfccInfo* p = MfccInit( audioctx, nfft, low, high, nfilters, ndcts, type); int len = ndcts * 2; float *out = (float*) malloc( sizeof(float)* (len)); for(int j = 0; j < p->out_nframes; j++)//必须按顺序输出,不能直接获取指定帧的mfcc差分 { Mfcc_Frame_diff1(p, j+1, out, len); printf("第 %d 帧:\n", j+1); for(int i = 0; i < len; i++) { printf("%f ", out[i]); } printf("\n"); } #endif #if 1 MFCC_TYPE type = MFCC_DIFF_2; MfccInfo* p = MfccInit( audioctx, nfft, low, high, nfilters, ndcts, type); int len = ndcts * 3; float *out = (float*) malloc( sizeof(float)* (len)); for(int j = 0; j < p->out_nframes; j++)//必须按顺序输出,不能直接获取指定帧的mfcc差分 { Mfcc_Frame_diff2(p, j+1, out, len); printf("第 %d 帧:\n", j+1); for(int i = 0; i < len; i++) { printf("%f ", out[i]); } printf("\n"); } #endif free(data); data = NULL; free(out); out = NULL; MfccDestroy(p); return 0; }