HW4
When building the Dataset, long sequences can simply be truncated to a fixed segment length; short sequences can be left alone for now:
import os
import json
import torch
import random
from pathlib import Path
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence


class myDataset(Dataset):
    def __init__(self, data_dir, segment_len=128):
        self.data_dir = data_dir
        self.segment_len = segment_len

        # Load the mapping from speaker name to their corresponding id.
        mapping_path = Path(data_dir) / "mapping.json"
        mapping = json.load(mapping_path.open())
        self.speaker2id = mapping["speaker2id"]

        # Load metadata of training data.
        metadata_path = Path(data_dir) / "metadata.json"
        metadata = json.load(open(metadata_path))["speakers"]

        # Get the total number of speakers.
        self.speaker_num = len(metadata.keys())
        self.data = []
        for speaker in metadata.keys():
            for utterances in metadata[speaker]:
                self.data.append([utterances["feature_path"], self.speaker2id[speaker]])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        feat_path, speaker = self.data[index]
        # Load preprocessed mel-spectrogram.
        mel = torch.load(os.path.join(self.data_dir, feat_path))

        # Segment mel-spectrogram into "segment_len" frames.
        if len(mel) > self.segment_len:
            # Randomly get the starting point of the segment.
            start = random.randint(0, len(mel) - self.segment_len)
            # Get a segment with "segment_len" frames.
            mel = torch.FloatTensor(mel[start:start+self.segment_len])
        else:
            mel = torch.FloatTensor(mel)

        # Turn the speaker id into long for computing loss later.
        speaker = torch.FloatTensor([speaker]).long()
        return mel, speaker

    def get_speaker_number(self):
        return self.speaker_num
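As a quick sanity check, the Dataset can be instantiated and one sample inspected. This is a minimal sketch; "./Dataset" is an assumed path to the folder containing mapping.json, metadata.json, and the preprocessed mel features.

# Minimal usage sketch; "./Dataset" is an assumed data directory.
dataset = myDataset("./Dataset", segment_len=128)
print("number of speakers:", dataset.get_speaker_number())

mel, speaker = dataset[0]
# mel: (length, 40) with length <= 128 after random truncation;
# speaker: a 1-element long tensor holding the speaker id.
print(mel.shape, speaker)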
random_split can split a Dataset directly. In addition, the collate_fn argument of DataLoader can be used to pad the sequences in a batch before stacking them;
The padding is done with torch.nn.utils.rnn.pad_sequence:
A blog post introducing the collate_fn argument: 「pytorch中collate_fn函数的使用&如何向collate_fn函数传参」 (XJTU-Qidong's blog on CSDN)
import torch
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence


def collate_batch(batch):
    """Collate a batch of data."""
    # Process features within a batch.
    mel, speaker = zip(*batch)
    # Because we train the model batch by batch, we need to pad the features
    # in the same batch to make their lengths the same.
    mel = pad_sequence(mel, batch_first=True, padding_value=-20)  # pad with log 10^(-20), which is a very small value.
    # mel: (batch size, length, 40)
    return mel, torch.FloatTensor(speaker).long()


def get_dataloader(data_dir, batch_size, n_workers):
    """Generate dataloaders."""
    dataset = myDataset(data_dir)
    speaker_num = dataset.get_speaker_number()
    # Split dataset into a training set and a validation set.
    trainlen = int(0.9 * len(dataset))
    lengths = [trainlen, len(dataset) - trainlen]
    trainset, validset = random_split(dataset, lengths)

    train_loader = DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    valid_loader = DataLoader(
        validset,
        batch_size=batch_size,
        num_workers=n_workers,
        drop_last=True,
        pin_memory=True,
        collate_fn=collate_batch,
    )

    return train_loader, valid_loader, speaker_num
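To see what pad_sequence actually does, here is a small sketch with made-up segment lengths; the padding value -20 matches the collate function above:

import torch
from torch.nn.utils.rnn import pad_sequence

# Three mel segments of different lengths, each of shape (length, 40),
# like the tensors returned by myDataset.__getitem__.
batch = [torch.randn(128, 40), torch.randn(96, 40), torch.randn(57, 40)]

# Pad every sequence to the longest length in the batch, then stack.
padded = pad_sequence(batch, batch_first=True, padding_value=-20)
print(padded.shape)  # torch.Size([3, 128, 40])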
Building the Transformer model and using mean pooling; note that the Transformer encoder expects input sequences of shape (length, batch_size, d_model);
The tensor's permute method can be used to rearrange the dimensions; also note how mean pooling is applied:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Classifier(nn.Module):
    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        super().__init__()
        # Project the dimension of features from that of the input into d_model.
        self.prenet = nn.Linear(40, d_model)
        # TODO:
        #   Change Transformer to Conformer.
        #   https://arxiv.org/abs/2005.08100
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, dim_feedforward=256, nhead=2
        )
        # self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

        # Project the dimension of features from d_model into the number of speakers.
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, n_spks),
        )

    def forward(self, mels):
        """
        args:
            mels: (batch size, length, 40)
        return:
            out: (batch size, n_spks)
        """
        # out: (batch size, length, d_model)
        out = self.prenet(mels)
        # out: (length, batch size, d_model)
        out = out.permute(1, 0, 2)
        # The encoder layer expects features in the shape of (length, batch size, d_model).
        out = self.encoder_layer(out)
        # out: (batch size, length, d_model)
        out = out.transpose(0, 1)
        # Mean pooling over the length dimension.
        stats = out.mean(dim=1)

        # out: (batch size, n_spks)
        out = self.pred_layer(stats)
        return out
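A quick shape check of the forward pass (a sketch; the batch size of 32 and segment length of 128 are assumed values, and n_spks=600 is the default above):

import torch

model = Classifier(d_model=80, n_spks=600)
# A fake batch of mel features: (batch size, length, 40).
mels = torch.randn(32, 128, 40)

out = model(mels)
print(out.shape)  # torch.Size([32, 600]) -- one score per speaker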
Using the learning rate schedule: scheduler.step() is called for every batch, after optimizer.step() and before optimizer.zero_grad():
import math
import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR


def get_cosine_schedule_with_warmup(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
):
    """
    Create a schedule with a learning rate that decreases following the values
    of the cosine function between the initial lr set in the optimizer to 0,
    after a warmup period during which it increases linearly between 0 and the
    initial lr set in the optimizer.

    Args:
        optimizer (:class:`~torch.optim.Optimizer`):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (:obj:`int`):
            The number of steps for the warmup phase.
        num_training_steps (:obj:`int`):
            The total number of training steps.
        num_cycles (:obj:`float`, `optional`, defaults to 0.5):
            The number of waves in the cosine schedule (the default is to just
            decrease from the max value to 0 following a half-cosine).
        last_epoch (:obj:`int`, `optional`, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        # Warmup: linearly increase the learning rate.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        # Decay: follow the cosine curve down to 0.
        progress = float(current_step - num_warmup_steps) / float(
            max(1, num_training_steps - num_warmup_steps)
        )
        return max(
            0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
        )

    return LambdaLR(optimizer, lr_lambda, last_epoch)
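A sketch of how the scheduler fits into the per-batch training loop, following the ordering described above; model, criterion, and train_loader are assumed to come from the rest of the HW4 code, and the warmup/total step counts are made-up values:

import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=1000, num_training_steps=70000  # assumed step budget
)

for mels, labels in train_loader:
    loss = criterion(model(mels), labels)
    loss.backward()
    optimizer.step()       # 1. update the parameters
    scheduler.step()       # 2. then update the learning rate
    optimizer.zero_grad()  # 3. finally clear the gradients for the next batch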