MFCC特征提取及Griffin-Lim算法(librosa实现)

发布于:2025-05-14 ⋅ 阅读:(14) ⋅ 点赞:(0)

在这里插入图片描述

MFCC特征提取

当S存在,接下来的步骤就到了离散余弦变换(DCT)步。
melspectrogram:根据名字可知mel语谱图。
power_to_db:显然就是对数功率的步骤。

def mfcc(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_mfcc: int = 20,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    lifter: float = 0,
    mel_norm: Optional[Union[Literal["slaney"], float]] = "slaney",
    **kwargs: Any,
) -> np.ndarray:

    if S is None:
        # multichannel behavior may be different due to relative noise floor differences between channels
        S = power_to_db(melspectrogram(y=y, sr=sr, norm = mel_norm, **kwargs))

    fft = get_fftlib()
    M: np.ndarray = fft.dct(S, axis=-2, type=dct_type, norm=norm)[
        ..., :n_mfcc, :
    ]

    if lifter > 0:
        # shape lifter for broadcasting
        LI = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) / lifter)
        LI = util.expand_to(LI, ndim=S.ndim, axes=-2)

        M *= 1 + (lifter / 2) * LI
        return M
    elif lifter == 0:
        return M
    else:
        raise ParameterError(f"MFCC lifter={lifter} must be a non-negative number")

进入melspectrogram函数可以看到S的计算过程,对应stft和幅值的平方计算。

def _spectrogram(
    *,
    y: Optional[np.ndarray] = None,
    S: Optional[np.ndarray] = None,
    n_fft: Optional[int] = 2048,
    hop_length: Optional[int] = 512,
    power: float = 1,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, int]:
    if S is not None:
        # Infer n_fft from spectrogram shape, but only if it mismatches
        if n_fft is None or n_fft // 2 + 1 != S.shape[-2]:
            n_fft = 2 * (S.shape[-2] - 1)
    else:
        # Otherwise, compute a magnitude spectrogram from input
        if n_fft is None:
            raise ParameterError(f"Unable to compute spectrogram with n_fft={n_fft}")
        if y is None:
            raise ParameterError(
                "Input signal must be provided to compute a spectrogram"
            )
        S = (
            np.abs(
                stft(
                    y,
                    n_fft=n_fft,
                    hop_length=hop_length,
                    win_length=win_length,
                    center=center,
                    window=window,
                    pad_mode=pad_mode,
                )
            )
            ** power
        )

    return S, n_fft

Griffin-Lim 算法(vocoder)

给出声谱图,还原音频
Griffin-Lim 算法基于以下核心思想:

  • 幅度谱保留:直接使用已知的幅度谱信息
  • 迭代优化相位:通过迭代逐步逼近真实相位

算法流程可以概括为:

  1. 初始化相位(通常为随机相位)
  2. 通过当前相位和已知幅度谱合成复数谱
  3. 进行逆傅里叶变换得到时域信号
  4. 对时域信号进行短时傅里叶变换,提取新的相位
  5. 保留新相位,但使用原始已知幅度谱构建新的复数谱
  6. 重复步骤 3-5 直到收敛或达到最大迭代次数
import librosa
import librosa.display
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt

# 1. 加载音频文件
audio_file = 'your_audio_file.wav'  # 替换为实际音频文件路径
y, sr = librosa.load(audio_file, sr=None)

# 2. 提取音频特征 - 短时傅里叶变换(STFT)
n_fft = 2048
hop_length = 512
win_length = 1024

D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

# 3. 可视化原始音频的频谱图
plt.figure(figsize=(10, 4))
librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='log')
plt.colorbar(format='%+2.0f dB')
plt.title('Original Audio Spectrogram')
plt.tight_layout()
plt.savefig('original_spectrogram.png')
plt.close()

# 4. 使用Griffin-Lim算法从幅度谱重建音频
n_iter = 32  # Griffin-Lim迭代次数
y_reconstructed = librosa.griffinlim(
    np.abs(D), 
    n_iter=n_iter,
    hop_length=hop_length,
    win_length=win_length
)

# 5. 保存重建的音频
output_file = 'reconstructed_audio.wav'
sf.write(output_file, y_reconstructed, sr)

# 6. 计算原始音频和重建音频之间的均方误差(MSE)
mse = np.mean((y[:len(y_reconstructed)] - y_reconstructed) ** 2)
print(f"原始音频长度: {len(y)} 样本")
print(f"重建音频长度: {len(y_reconstructed)} 样本")
print(f"均方误差(MSE): {mse:.6f}")

# 7. 可视化重建音频的频谱图
D_reconstructed = librosa.stft(y_reconstructed, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
S_reconstructed_db = librosa.amplitude_to_db(np.abs(D_reconstructed), ref=np.max)

plt.figure(figsize=(10, 4))
librosa.display.specshow(S_reconstructed_db, sr=sr, x_axis='time', y_axis='log')
plt.colorbar(format='%+2.0f dB')
plt.title('Reconstructed Audio Spectrogram')
plt.tight_layout()
plt.savefig('reconstructed_spectrogram.png')
plt.close()    

网站公告

今日签到

点亮在社区的每一天
去签到