音频数据的自定义DataLoader及其AutoEncoder降噪算法
DataLoader要求每一个Batch里面的数据的 shape 都一样,但是语音数据显然不可能都是等长的,因为每一条语音长度都不一样,因此在定制DataLoader的时候还要对每一个 batch 的数据进行剪裁(crop)或者填充(padding)处理。
这里采用 padding来对齐数据,方法采用 PyTorch Discussion 的网友Felix Kreuk的代码 (https://discuss.pytorch.org/t/dataloader-for-various-length-of-data/6418/7)
含义为:定义一个函数来处理一个 batch 的 tensor,在DataLoader 实例化的时候调用: train_loader = DataLoader(ds, ..., collate_fn=PadCollate(dim=0))
但是Felix的代码运行起来bug还挺多,可能是因为我的语音数据是单通道的,导致数据的 dimension为1,此外lambda函数和map函数也出了各种bug,所以重新写了一下如下:
def pad_tensor(vec, pad, dim):
    """Zero-pad ``vec`` along ``dim`` up to length ``pad``.

    Args:
        vec: tensor to pad.
        pad: the size to pad to.
        dim: dimension to pad.

    Returns:
        A new tensor padded to ``pad`` in dimension ``dim``.
    """
    pad_size = list(vec.shape)
    pad_size[dim] = pad - vec.size(dim)
    return torch.cat([vec, torch.zeros(*pad_size)], dim=dim)


class PadCollate:
    """A variant of collate_fn that zero-pads every sequence in a batch
    to the length of the longest sequence in that batch.
    """

    def __init__(self, dim=0):
        """
        Args:
            dim: the dimension to be padded (dimension of time in sequences).
        """
        self.dim = dim

    def pad_collate(self, batch):
        """
        Args:
            batch: list of (tensor, label) pairs.

        Returns:
            xs: a tensor of all examples in ``batch`` after padding.
            ys: a LongTensor of all labels in ``batch``.
        """
        # Find the longest sequence in the batch.
        max_len = max(x[0].shape[self.dim] for x in batch)
        # Pad each example to max_len.
        # NOTE: the original used ``lambda (x, y): ...`` — Python-2-only
        # tuple-parameter syntax (SyntaxError on Python 3) — and passed
        # one-shot ``map`` objects to torch.stack / torch.LongTensor,
        # which reject non-sequence iterables. List comprehensions fix both.
        padded = [(pad_tensor(x, pad=max_len, dim=self.dim), y) for x, y in batch]
        # Stack padded examples and collect labels.
        xs = torch.stack([x for x, _ in padded], dim=0)
        ys = torch.LongTensor([y for _, y in padded])
        return xs, ys

    def __call__(self, batch):
        return self.pad_collate(batch)
完整的 Dataset定义、Padding方法定义、DataLoader定义的程序如下:
class SpokenMnistSet(Dataset):
    """Dataset over free-spoken-digit recordings.

    Each item is ``(waveform, label)`` where the waveform is loaded with
    librosa and the digit label is parsed from the file name
    (file names look like ``<digit>_<speaker>_<idx>.wav``).
    """

    def __init__(self, audios):
        """
        Args:
            audios: list of recording file names (relative to the root dir).
        """
        self._root = "../free-spoken-digit-dataset-1.0.10/free-spoken-digit-dataset-1.0.10/recordings"
        self.audios = audios
        # Duration bounds in seconds; reserved for cropping, currently unused.
        self.max_duration = 3.0
        self.min_duration = 0.5
        self.mode = 'train'

    def __getitem__(self, index):
        # librosa.load returns (waveform ndarray, sample_rate); the rate is
        # unused here because padding handles the variable lengths downstream.
        audio_segment, sr = librosa.load(self._root + '/' + self.audios[index])
        # The leading "<digit>_" of the file name is the label.
        return audio_segment, int(self.audios[index].split('_')[0])

    def __len__(self):
        return len(self.audios)


# DataLoader for various lengths of data
# https://discuss.pytorch.org/t/dataloader-for-various-length-of-data/6418/7
def pad_tensor(vec, pad, dim):
    """Zero-pad ``vec`` along ``dim`` up to length ``pad``.

    Args:
        vec: tensor (or numpy waveform from librosa) to pad.
        pad: the size to pad to.
        dim: dimension to pad.

    Returns:
        A new tensor padded to ``pad`` in dimension ``dim``.
    """
    pad_size = list(vec.shape)
    pad_size[dim] = pad - vec.shape[dim]
    # as_tensor accepts both numpy arrays and tensors without the
    # "copy construct" warning that torch.tensor(tensor) emits.
    return torch.cat([torch.as_tensor(vec), torch.zeros(*pad_size)], dim=dim)


class PadCollate:
    """A variant of collate_fn that zero-pads every sequence in a batch
    to the length of the longest sequence in that batch.
    """

    def __init__(self, dim=0):
        """
        Args:
            dim: the dimension to be padded (dimension of time in sequences).
        """
        self.dim = dim

    def pad_collate(self, batch):
        """
        Args:
            batch: list of (tensor, label) pairs.

        Returns:
            xs: a tensor of all examples in ``batch`` after padding.
            ys: a LongTensor of all labels in ``batch``.
        """
        # Pad every example to the longest length in this batch.
        # (Removed the leftover debug print of max_len.)
        max_len = max(x[0].shape[self.dim] for x in batch)
        xs = torch.stack(
            [pad_tensor(x, pad=max_len, dim=self.dim) for x, _ in batch],
            dim=0,
        )
        ys = torch.LongTensor([y for _, y in batch])
        return xs, ys

    def __call__(self, batch):
        return self.pad_collate(batch)
def get_train_valid_test():
    """Shuffle the recordings, split 80/10/10, and build the three loaders.

    Returns:
        dict with keys "train", "val", "test", each mapping to a DataLoader
        whose batches are zero-padded to the batch max length by PadCollate.
    """
    _root = "../free-spoken-digit-dataset-1.0.10/free-spoken-digit-dataset-1.0.10/recordings"
    all_data_path = os.listdir(_root)
    random.shuffle(all_data_path)
    si = len(all_data_path)
    # 80% train, 10% validation, 10% test — plain slices replace the
    # original append loops (and the unused *_labels lists are dropped).
    train_datas = all_data_path[:int(si * 0.8)]
    val_datas = all_data_path[int(si * 0.8):int(si * 0.9)]
    test_datas = all_data_path[int(si * 0.9):]
    train_loader = torch.utils.data.DataLoader(
        SpokenMnistSet(train_datas), batch_size=BATCH_SIZE, drop_last=True,
        shuffle=True, collate_fn=PadCollate(dim=0))
    val_loader = torch.utils.data.DataLoader(
        SpokenMnistSet(val_datas), batch_size=BATCH_SIZE, drop_last=True,
        shuffle=True, collate_fn=PadCollate(dim=0))
    # No shuffle / no drop_last for evaluation.
    test_loader = torch.utils.data.DataLoader(
        SpokenMnistSet(test_datas), batch_size=BATCH_SIZE, shuffle=False,
        collate_fn=PadCollate(dim=0))
    return {"train": train_loader, "val": val_loader, "test": test_loader}


loaders = get_train_valid_test()
train_loader = loaders["train"]
val_loader = loaders["val"]
test_loader = loaders["test"]

# Sanity check: plot the first four waveforms of the first test batch.
plt.figure(0)
for item in test_loader:
    for idx, it in enumerate(item[0][:4], start=1):
        plt.subplot(4, 1, idx)
        plt.plot(it.squeeze())
        plt.xticks([])
        plt.yticks([])
    break
通过上述的程序进行AutoEncoder无监督降噪算法,特征采用MFCC。