在语音分析、合成、转换中,第一步往往是提取语音特征参数。
利用机器学习方法处理上述语音任务时,常用到梅尔频谱(Mel频谱)。本文介绍如何从音频文件提取梅尔频谱,以及如何从梅尔频谱重建音频波形。

从音频波形提取Mel频谱:
对音频信号预加重、分帧和加窗
对每帧信号进行短时傅立叶变换(STFT),得到短时幅度谱
短时幅度谱通过Mel滤波器组得到Mel频谱

从Mel频谱重建音频波形:
Mel频谱转换成幅度谱
griffin_lim声码器算法重建波形
去加重

声码器有很多种,比如WORLD、STRAIGHT等,但griffin_lim比较特殊:它不需要相位信息就可以从频谱重建波形,实际上它是根据帧之间的关系来估计相位信息。合成的音频质量也较高,代码也比较简单。

音频波形 到 mel-spectrogram

sr = 24000 # Sample rate.
n_fft = 2048  # fft points (samples)
frame_shift = 0.0125  # seconds
frame_length = 0.05  # seconds
hop_length = int(sr * frame_shift)  # samples.
win_length = int(sr * frame_length)  # samples.
n_mels = 512  # Number of Mel banks to generate
# NOTE(review): `power` is not referenced by any function in this excerpt.
power = 1.2  # Exponent for amplifying the predicted magnitude
n_iter = 100  # Number of inversion iterations
preemphasis = .97  # or None
max_db = 100
ref_db = 20
top_db = 15


def get_spectrograms(fpath):
    """Return the normalized log mel-spectrogram and log magnitude of a sound file.

    Args:
      fpath: A string. The full path of a sound file.

    Returns:
      mel: A 2d float32 array of shape (T, n_mels) <- Transposed
      mag: A 2d float32 array of shape (T, 1+n_fft//2) <- Transposed
    """
    # Loading sound file. The returned rate is discarded on purpose: binding
    # it to `sr` would make `sr` local to this function, and the `sr=sr`
    # argument would then raise UnboundLocalError.
    y, _ = librosa.load(fpath, sr=sr)

    # Trimming leading/trailing silence
    y, _ = librosa.effects.trim(y, top_db=top_db)

    # Preemphasis (skipped when the module-level setting is None)
    if preemphasis is not None:
        y = np.append(y[0], y[1:] - preemphasis * y[:-1])

    # stft
    linear = librosa.stft(y=y,
                          n_fft=n_fft,
                          hop_length=hop_length,
                          win_length=win_length)

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # mel spectrogram; keyword arguments work on both old and new librosa
    # (they are keyword-only in recent releases).
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, T)

    # to decibel; the 1e-5 floor avoids log10(0)
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # normalize into [1e-8, 1]
    mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)
    mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)

    # Transpose
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    return mel, mag
# mel-spectrogram -> audio waveform
def melspectrogram2wav(mel):
    """Reconstruct a waveform from a normalized log mel-spectrogram.

    Args:
      mel: A 2d array of shape (T, n_mels) with values normalized to [0, 1];
        values outside that range are clipped.

    Returns:
      The reconstructed waveform as a float32 array.
    """
    # transpose to (n_mels, T)
    mel = mel.T

    # de-normalize back to decibels
    mel = (np.clip(mel, 0, 1) * max_db) - max_db + ref_db

    # to amplitude: 10 ** (dB / 20)
    mel = np.power(10.0, mel * 0.05)

    # map mel bands back to linear-frequency magnitudes
    m = _mel_to_linear_matrix(sr, n_fft, n_mels)
    mag = np.dot(m, mel)

    # wav reconstruction
    wav = griffin_lim(mag)

    # de-preemphasis (skipped when the module-level setting is None)
    if preemphasis is not None:
        wav = signal.lfilter([1], [1, -preemphasis], wav)

    # trim leading/trailing silence
    wav, _ = librosa.effects.trim(wav)

    return wav.astype(np.float32)
def spectrogram2wav(mag):
    """Reconstruct a waveform from a normalized log magnitude spectrogram.

    Args:
      mag: A 2d array with time on the first axis (it is transposed before
        use) and values normalized to [0, 1]; values outside that range
        are clipped.

    Returns:
      The reconstructed waveform as a float32 array.
    """
    # transpose so that frequency is the first axis
    mag = mag.T

    # de-normalize back to decibels
    mag = (np.clip(mag, 0, 1) * max_db) - max_db + ref_db

    # to amplitude: 10 ** (dB / 20)
    mag = np.power(10.0, mag * 0.05)

    # wav reconstruction
    wav = griffin_lim(mag)

    # de-preemphasis (skipped when the module-level setting is None)
    if preemphasis is not None:
        wav = signal.lfilter([1], [1, -preemphasis], wav)

    # trim leading/trailing silence
    wav, _ = librosa.effects.trim(wav)

    return wav.astype(np.float32)
# Helper functions
def _mel_to_linear_matrix(sr, n_fft, n_mels):
    """Build an approximate mel-to-linear-frequency mapping matrix.

    The result is the transposed mel filterbank with each column scaled by
    the reciprocal of the matching column sum of (m @ m.T); sums whose
    magnitude is at most 1e-8 are used as-is instead of being inverted.

    Args:
      sr: Sample rate passed to the mel filterbank.
      n_fft: Number of fft points passed to the mel filterbank.
      n_mels: Number of mel bands.

    Returns:
      A 2d array, the product of the transposed filterbank and a diagonal
      scaling matrix.
    """
    # Keyword arguments work on both old and new librosa (they are
    # keyword-only in recent releases).
    m = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    m_t = np.transpose(m)
    p = np.matmul(m, m_t)
    d = [1.0 / x if np.abs(x) > 1.0e-8 else x for x in np.sum(p, axis=0)]
    return np.matmul(m_t, np.diag(d))


def griffin_lim(spectrogram):
    """Estimate a waveform from a magnitude spectrogram (Griffin-Lim).

    Runs `n_iter` rounds of: invert the current complex estimate, re-analyse
    it with an STFT, keep only the unit-magnitude phase, and recombine that
    phase with the given magnitudes.

    Args:
      spectrogram: A 2d magnitude array, [f, t].

    Returns:
      The real part of the final inverted signal.
    """
    X_best = copy.deepcopy(spectrogram)
    for _ in range(n_iter):
        X_t = invert_spectrogram(X_best)
        est = librosa.stft(X_t,
                           n_fft=n_fft,
                           hop_length=hop_length,
                           win_length=win_length)
        # The 1e-8 floor guards against division by zero in silent bins.
        phase = est / np.maximum(1e-8, np.abs(est))
        X_best = spectrogram * phase
    X_t = invert_spectrogram(X_best)
    y = np.real(X_t)

    return y
def invert_spectrogram(spectrogram):
    """Invert a spectrogram to a time-domain signal with an inverse STFT.

    Uses the module-level `hop_length` and `win_length` and a Hann window.

    Args:
      spectrogram: [f, t]

    Returns:
      Whatever `librosa.istft` returns for the given spectrogram.
    """
    # hop_length is passed by keyword: it is keyword-only in recent librosa.
    return librosa.istft(spectrogram,
                         hop_length=hop_length,
                         win_length=win_length,
                         window="hann")
预加重:语音信号的平均功率谱受声门激励和口鼻辐射的影响,高频端大约在800 Hz以上按6 dB/倍频程衰减。预加重的目的是提升高频成分,使信号频谱平坦化,以便于进行频谱分析或声道参数分析。
---------------------