librosa.filters.mel
https://qiita.com/tmtakashi_dist/items/eecb705ea48260db0b62
Create a Mel filter-bank.
def mel(*, sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
norm="slaney", dtype=np.float32, ):
if fmax is None:
fmax = float(sr) / 2
# Initialize the weights
n_mels = int(n_mels)
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
if norm == "slaney":
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]
else:
weights = util.normalize(weights, norm=norm, axis=-1)
# Only check weights if f_mel[0] is positive
if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
# This means we have an empty channel somewhere
warnings.warn(
"Empty filters detected in mel frequency basis. "
"Some channels will produce empty responses. "
"Try increasing your sampling rate (and fmax) or "
"reducing n_mels.",
stacklevel=2,
)
return weights
def fft_frequencies(*, sr: float = 22050, n_fft: int = 2048) -> np.ndarray:
"""Alternative implementation of `np.fft.fftfreq`
Parameters
----------
sr : number > 0 [scalar]
Audio sampling rate
n_fft : int > 0 [scalar]
FFT window size
Returns
-------
freqs : np.ndarray [shape=(1 + n_fft/2,)]
Frequencies ``(0, sr/n_fft, 2*sr/n_fft, ..., sr/2)``
Examples
--------
>>> librosa.fft_frequencies(sr=22050, n_fft=16)
array([ 0. , 1378.125, 2756.25 , 4134.375,
5512.5 , 6890.625, 8268.75 , 9646.875, 11025. ])
"""
return np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
def mel_frequencies(
n_mels: int = 128, *, fmin: float = 0.0, fmax: float = 11025.0, htk: bool = False
) -> np.ndarray:
"""Compute an array of acoustic frequencies tuned to the mel scale.
The mel scale is a quasi-logarithmic function of acoustic frequency
designed such that perceptually similar pitch intervals (e.g. octaves)
appear equal in width over the full hearing range.
Because the definition of the mel scale is conditioned by a finite number
of subjective psychoaoustical experiments, several implementations coexist
in the audio signal processing literature [#]_. By default, librosa replicates
the behavior of the well-established MATLAB Auditory Toolbox of Slaney [#]_.
According to this default implementation, the conversion from Hertz to mel is
linear below 1 kHz and logarithmic above 1 kHz. Another available implementation
replicates the Hidden Markov Toolkit [#]_ (HTK) according to the following formula::
mel = 2595.0 * np.log10(1.0 + f / 700.0).
The choice of implementation is determined by the ``htk`` keyword argument: setting
``htk=False`` leads to the Auditory toolbox implementation, whereas setting it ``htk=True``
leads to the HTK implementation.
.. [#] Umesh, S., Cohen, L., & Nelson, D. Fitting the mel scale.
In Proc. International Conference on Acoustics, Speech, and Signal Processing
(ICASSP), vol. 1, pp. 217-220, 1998.
.. [#] Slaney, M. Auditory Toolbox: A MATLAB Toolbox for Auditory
Modeling Work. Technical Report, version 2, Interval Research Corporation, 1998.
.. [#] Young, S., Evermann, G., Gales, M., Hain, T., Kershaw, D., Liu, X.,
Moore, G., Odell, J., Ollason, D., Povey, D., Valtchev, V., & Woodland, P.
The HTK book, version 3.4. Cambridge University, March 2009.
See Also
--------
hz_to_mel
mel_to_hz
librosa.feature.melspectrogram
librosa.feature.mfcc
Parameters
----------
n_mels : int > 0 [scalar]
Number of mel bins.
fmin : float >= 0 [scalar]
Minimum frequency (Hz).
fmax : float >= 0 [scalar]
Maximum frequency (Hz).
htk : bool
If True, use HTK formula to convert Hz to mel.
Otherwise (False), use Slaney's Auditory Toolbox.
Returns
-------
bin_frequencies : ndarray [shape=(n_mels,)]
Vector of ``n_mels`` frequencies in Hz which are uniformly spaced on the Mel
axis.
Examples
--------
>>> librosa.mel_frequencies(n_mels=40)
array([ 0. , 85.317, 170.635, 255.952,
341.269, 426.586, 511.904, 597.221,
682.538, 767.855, 853.173, 938.49 ,
1024.856, 1119.114, 1222.042, 1334.436,
1457.167, 1591.187, 1737.532, 1897.337,
2071.84 , 2262.393, 2470.47 , 2697.686,
2945.799, 3216.731, 3512.582, 3835.643,
4188.417, 4573.636, 4994.285, 5453.621,
5955.205, 6502.92 , 7101.009, 7754.107,
8467.272, 9246.028, 10096.408, 11025. ])
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(fmin, htk=htk)
max_mel = hz_to_mel(fmax, htk=htk)
mels = np.linspace(min_mel, max_mel, n_mels)
hz: np.ndarray = mel_to_hz(mels, htk=htk)
return hz