语音增强,频谱映射算法


import os
import torch
import numpy as np
from torch.utils.data import Dataset,DataLoader
from hparams import hparams
import librosa
import random
import soundfile as sf

def feature_stft(wav,para):
    spec = librosa.stft(wav,
                      n_fft=para["N_fft"],
                      win_length = para["win_length"],
                      hop_length = para["hop_length"],
                      window =para["window"])
                      
    mag =   np.abs(spec)    #幅度谱
    # print(mag)
    LPS =   np.log(mag**2)  #输入的是幅度谱平方后的Log
    phase = np.angle(spec)    
    # print(LPS.T,phase.T)
    return LPS.T, phase.T    #  T x D

def feature_contex(feature,expend):
    feature = feature.unfold(0,2*expend+1,1)  # T x D x  2*expand+1
    feature = feature.transpose(1,2)           # T x  2*n_expand+1  x D 
    feature = feature.view([-1,(2*expend+1)*feature.shape[-1]]) # T x  D * 2*n_expand+1
    return feature
'''# 这里调用了Tensor.unfold(dimension,size,step)函数
    # dimension 是沿着哪个维度重叠取帧 (T维度 ,所以是 第0维)
    # size 重复取帧大小 (2*左右扩展数 +1 )
    # step 步长
    # 输出维度 # (T-4) x D x  2*expand+1
    feature = feature.transpose(1,2)           # (T-4) x  2*n_expand+1  x D 
    # 把后两个维度“切换”一下
    feature = feature.view([-1,(2*expend+1)*feature.shape[-1]]) # T x  (D *( 2*n_expand+1))
    # 这一步,相当于保持第一维(帧 )不变,后面两维合并成了一维
'''
    
    

class TIMIT_Dataset(Dataset):
    
    def __init__(self,para):

        self.file_scp = para.file_scp
        self.para_stft = para.para_stft
        self.n_expand = para.n_expand

        files = np.loadtxt(self.file_scp,dtype = 'str')
        self.clean_files = files[:,1].tolist()
        self.noisy_files = files[:,0].tolist()
         
        print(len(self.clean_files))
    
    def __len__(self):
        return len(self.clean_files)

    def __getitem__(self,idx):
        
        # 读取干净语音
        clean_wav,fs = sf.read(self.clean_files[idx],dtype = 'int16')
        clean_wav = clean_wav.astype('float32')
        
        #  读取含噪语音
        noisy_wav,fs = sf.read(self.noisy_files[idx],dtype = 'int16')
        noisy_wav = noisy_wav.astype('float32')
        
        # 提取stft特征
        clean_LPS,_ = feature_stft(clean_wav,self.para_stft) # T x D
        noisy_LPS,_= feature_stft(noisy_wav,self.para_stft)  # T x D
        
        # 转为torch格式
        X_train = torch.from_numpy(noisy_LPS)
        Y_train = torch.from_numpy(clean_LPS)
        
        # 拼帧
        X_train = feature_contex(X_train,self.n_expand)
        Y_train = Y_train[self.n_expand:-self.n_expand,:]
        return X_train, Y_train

def my_collect(batch):
    # 神经网络训练时需要每一个batch大小相同
    # 由于语音数据 每次训练的feasture 大小= T x  (D *( 2*n_expand+1))
    batch_X = [item[0] for item in batch] # 帧数可能不一样 所以需要重写,实现batch的拼接
    batch_Y = [item[1] for item in batch]
    batch_X = torch.cat(batch_X,0) # 由于 T维度 可能不一样,所以沿着 T维度(第零维度)进行拼接,下同
    batch_Y = torch.cat(batch_Y,0)
    return[batch_X.float(),batch_Y.float()]
    
    
if __name__ == '__main__':
    
    # 数据加载测试
    para = hparams()
    
    m_Dataset= TIMIT_Dataset(para)
    
    m_DataLoader = DataLoader(m_Dataset,batch_size =2,shuffle = True, num_workers = 4, collate_fn = my_collect)
    # shuffle:随机打乱  num_workers:多线程选取  collate_fn:特征选取函数
    for i_batch, sample_batch in enumerate(m_DataLoader):
        train_X = sample_batch[0]
        train_Y = sample_batch[1]
        print(train_X.shape)
        print(train_Y.shape)

代码能跑 但是运行一段时间后会出现问题。求解。

img

img

这个算法出问题了吗

它跑着跑着,有些值慢慢调整,就如报错所言,有的太小计算时分母都接近0了