《PyTorch 深度学习实践》——刘二大人 第十三讲
同样的参数,CPU跑15min,GPU 2min43s
# Lecture 13: classify a person's country of origin from their name,
# using a character-level (bi)GRU over the ASCII codes of the name.
import csv
import gzip
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# ------------ 0 hyper-parameters ------------ #
HIDDEN_SIZE = 100
BATCH_SIZE = 256
N_LAYER = 2
N_EPOCHS = 100
N_CHARS = 128  # dictionary size: names are encoded as ASCII codes (0..127)
# FIX: was hard-coded True (with a contradictory comment); probe for CUDA so
# the script also runs on CPU-only machines instead of crashing.
USE_GPU = torch.cuda.is_available()


# ------------ 1 prepare data ------------ #
class NameDataset(Dataset):
    """Dataset of (name, country-index) pairs read from a gzipped CSV file."""

    def __init__(self, is_train_set=True):
        filename = 'names_train.csv.gz' if is_train_set else 'names_test.csv.gz'
        # The data ships as gzip-compressed CSV: one (name, country) per row.
        with gzip.open(filename, 'rt') as f:
            rows = list(csv.reader(f))

        self.names = [row[0] for row in rows]
        self.len = len(self.names)
        self.countries = [row[1] for row in rows]
        # Sorted unique country names -> deterministic label indices.
        self.country_list = sorted(set(self.countries))
        self.country_dict = self.getCountryDict()
        self.country_num = len(self.country_list)

    def __getitem__(self, index):
        # Raw name string plus its integer class label.
        return self.names[index], self.country_dict[self.countries[index]]

    def __len__(self):
        return self.len

    def getCountryDict(self):
        """Build the country-name -> index mapping."""
        return {country: idx for idx, country in enumerate(self.country_list)}

    def idx2country(self, index):
        """Return the country name for a given label index."""
        return self.country_list[index]

    def getCountriesNum(self):
        """Return the number of distinct countries (= output classes)."""
        return self.country_num


trainset = NameDataset(is_train_set=True)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
testset = NameDataset(is_train_set=False)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)
N_COUNTRY = trainset.getCountriesNum()


# ------------ 2 design model ------------ #
def create_tensor(tensor):
    """Move *tensor* to the GPU when USE_GPU is set; otherwise return it as-is."""
    if USE_GPU:
        tensor = tensor.to(torch.device("cuda:0"))
    return tensor


class RNNClassifier(torch.nn.Module):
    """Embedding -> (bi)GRU -> Linear classifier over variable-length names."""

    def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirectional=True):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_directions = 2 if bidirectional else 1
        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirectional)
        # A bidirectional GRU contributes one final hidden state per direction,
        # so the classifier input is hidden_size * n_directions wide.
        self.fc = torch.nn.Linear(hidden_size * self.n_directions, output_size)

    def _init_hidden(self, batch_size):
        """Zero-initialised h0 of shape (layers * directions, batch, hidden)."""
        hidden = torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size)
        return create_tensor(hidden)

    def forward(self, input, seq_lengths):
        input = input.t()            # B x S -> S x B (GRU expects seq-first by default)
        batch_size = input.size(1)

        hidden = self._init_hidden(batch_size)        # h0
        embedding = self.embedding(input)             # (seqLen, batchSize, hiddenSize)

        # Pack the padded batch so the GRU skips the zero padding.
        # FIX: pack_padded_sequence requires the lengths tensor on the CPU;
        # doing .cpu() here removes the need for every caller to remember it.
        gru_input = pack_padded_sequence(embedding, seq_lengths.cpu())

        output, hidden = self.gru(gru_input, hidden)
        if self.n_directions == 2:
            # Concatenate the final hidden states of both directions.
            hidden_cat = torch.cat([hidden[-1], hidden[-2]], dim=1)
        else:
            hidden_cat = hidden[-1]

        return self.fc(hidden_cat)


classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_COUNTRY, N_LAYER)

# ------------ 3 loss and optimizer ------------ #
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)


# ------------ 4 train and test ------------ #
def time_since(since):
    """Elapsed wall-clock time since *since* (a time.time() stamp) as 'Xm Ys'."""
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def name2list(name):
    """Encode a name as its ASCII codes; return (code list, length)."""
    arr = [ord(c) for c in name]
    return arr, len(arr)


def make_tensors(names, countries):
    """Turn a batch of name strings into padded, length-sorted tensors.

    Returns (seq_tensor BxS, seq_lengths, country labels), all sorted by
    descending length as required by pack_padded_sequence.
    """
    sequences_and_lengths = [name2list(name) for name in names]
    name_sequences = [sl[0] for sl in sequences_and_lengths]
    seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])
    countries = countries.long()

    # Paste each sequence onto an all-zero tensor: BatchSize x SeqLen.
    seq_tensor = torch.zeros(len(name_sequences), seq_lengths.max()).long()
    for idx, (seq, seq_len) in enumerate(zip(name_sequences, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)

    # Sort by descending length; perm_idx maps sorted position -> original row.
    seq_lengths, perm_idx = seq_lengths.sort(dim=0, descending=True)
    seq_tensor = seq_tensor[perm_idx]
    countries = countries[perm_idx]
    return create_tensor(seq_tensor), create_tensor(seq_lengths), create_tensor(countries)


def trainModel():
    """Run one training epoch over trainloader; return the accumulated loss."""
    total_loss = 0
    for i, (names, countries) in enumerate(trainloader, 1):
        inputs, seq_lengths, target = make_tensors(names, countries)
        output = classifier(inputs, seq_lengths)  # lengths moved to CPU inside forward()
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if i % 10 == 0:
            print(f'[{time_since(start)}] Epoch {epoch} ', end='')
            print(f'[{i * len(inputs)}/{len(trainset)}] ', end='')
            print(f'loss={total_loss / (i * len(inputs))}')
    return total_loss


def evalModel():
    """Evaluate on the test set; return the accuracy in [0, 1]."""
    correct = 0
    total = len(testset)
    print("evaluating trained model ...")
    with torch.no_grad():
        for i, (names, countries) in enumerate(testloader, 1):
            inputs, seq_lengths, target = make_tensors(names, countries)
            output = classifier(inputs, seq_lengths)
            pred = output.max(dim=1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()
        percent = '%.2f' % (100 * correct / total)
        print(f'Test set: Accuracy {correct}/{total} {percent}%')
    return correct / total


# Backward-compatible alias for the original (poorly named) test function.
hehe = evalModel


if __name__ == '__main__':
    if USE_GPU:
        classifier.to(torch.device("cuda:0"))
    start = time.time()
    print("Training for %d epochs..." % N_EPOCHS)
    acc_list = []
    # Each epoch: train once, then evaluate once.
    for epoch in range(1, N_EPOCHS + 1):
        trainModel()
        acc = evalModel()
        acc_list.append(acc)

    # Plot accuracy versus epoch.
    epoch = np.arange(1, len(acc_list) + 1, 1)
    acc_list = np.array(acc_list)
    plt.plot(epoch, acc_list)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid()
    plt.show()
evaluating trained model ...
Test set: Accuracy 5599/6700 83.57%
[2m 41s] Epoch 100 [2560/13374] loss=0.00011349248889018782
[2m 42s] Epoch 100 [5120/13374] loss=0.00012008407356915996
[2m 42s] Epoch 100 [7680/13374] loss=0.0001346439957463493
[2m 42s] Epoch 100 [10240/13374] loss=0.00013780106764897936
[2m 43s] Epoch 100 [12800/13374] loss=0.00014130977695458568
evaluating trained model ...
Test set: Accuracy 5607/6700 83.69%
课后作业:
'''
Sentiment Analysis on Movie Reviews - character-level RNN classifier.
Adapted from: https://blog.csdn.net/qq_39187959/article/details/121102959
'''
import math
import os
import time
from itertools import chain

import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'


class SAData(Dataset):
    """80/20 train/validation split over the Kaggle movie-review phrases."""

    def __init__(self, train):
        self.train = train
        self.data = pd.read_csv('sentiment-analysis-on-movie-reviews/train.tsv', sep='\t')

        # FIX: the original drew the validation set with an independent
        # data.sample(frac=0.2, random_state=1). With the same seed, that 20%
        # is a SUBSET of the 80% training sample (same permutation prefix), so
        # the model was validated on data it trained on - which is why the
        # reported "validation" accuracy climbed to an implausible 100%.
        # Sample the training rows once; validation is the complement.
        train_rows = self.data.sample(frac=0.8, replace=False, random_state=1, axis=0)
        if self.train:
            self.data = train_rows.reset_index(drop=True)  # re-generate indices
        else:
            self.data = self.data.drop(train_rows.index).reset_index(drop=True)
        self.len = self.data.shape[0]
        self.x_data, self.y_data = self.data['Phrase'], self.data['Sentiment']

    def __getitem__(self, index):
        # Phrase string plus its integer sentiment label.
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.len


# Train / validation dataset objects.
train_set = SAData(train=True)
validation_set = SAData(train=False)

# Hyper parameters
N_CHARS = 128      # ASCII alphabet size used as the embedding dictionary
HIDDEN_SIZE = 128
N_LAYER = 2
BATCH_SIZE = 1024
N_EPOCHS = 50      # lower this (e.g. 10) on a modest machine - training is slow
# FIX: was hard-coded True; probe so the script also runs without CUDA.
USE_GPU = torch.cuda.is_available()
N_CLASS = len(set(train_set.y_data))

train_loader = DataLoader(
    dataset=train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
validation_loader = DataLoader(
    dataset=validation_set,
    batch_size=BATCH_SIZE,
    shuffle=False,  # unshuffled validation makes results easier to inspect
)


def time_since(since):
    """Elapsed wall-clock time since *since* (a time.time() stamp) as 'Xm Ys'."""
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def phrase2list(phrase):
    """Encode a phrase as its ASCII codes; return (code list, length)."""
    arr = [ord(c) for c in phrase]
    return arr, len(arr)


def create_tensor(tensor):
    """Move *tensor* to the GPU when USE_GPU is set; otherwise return it as-is."""
    if USE_GPU:
        tensor = tensor.to(torch.device('cuda:0'))
    return tensor


def make_tensor(phrase, sentiment):
    """Turn a batch of phrase strings into padded, length-sorted tensors.

    Returns (seq_tensor BxS, seq_lengths, sentiment labels), all sorted by
    descending length as required by pack_padded_sequence.
    """
    sequences_and_lengths = [phrase2list(p) for p in phrase]
    phrase_sequences = [sl[0] for sl in sequences_and_lengths]
    seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])
    sentiment = sentiment.long()

    # Paste each sequence onto an all-zero tensor: batchSize x seqLen.
    seq_tensor = torch.zeros(len(phrase_sequences), seq_lengths.max()).long()
    for idx, (seq, seq_len) in enumerate(zip(phrase_sequences, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)

    # Sort by descending length; perm_idx maps sorted position -> original row.
    seq_lengths, perm_idx = seq_lengths.sort(dim=0, descending=True)
    seq_tensor = seq_tensor[perm_idx]
    sentiment = sentiment[perm_idx]

    return create_tensor(seq_tensor), create_tensor(seq_lengths), create_tensor(sentiment)


class RNNClassifier(torch.nn.Module):
    """Embedding -> (bi)GRU -> Linear classifier over variable-length phrases."""

    def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirection=True):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_directions = 2 if bidirection else 1

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirection)
        # Both directions' final hidden states are concatenated for the classifier.
        self.fc = torch.nn.Linear(hidden_size * self.n_directions, output_size)

    def _init_hidden(self, batch_size):
        """Zero-initialised h0 of shape (layers * directions, batch, hidden)."""
        hidden = torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size)
        return create_tensor(hidden)

    def forward(self, input, seq_lengths):
        input = input.t()            # B x S -> S x B (GRU expects seq-first by default)
        batch_size = input.size(1)

        hidden = self._init_hidden(batch_size)
        embedding = self.embedding(input)

        # Pack the padded batch so the GRU skips the zero padding.
        # pack_padded_sequence requires the lengths tensor on the CPU.
        gru_input = pack_padded_sequence(embedding, seq_lengths.to('cpu'))

        output, hidden = self.gru(gru_input, hidden)
        if self.n_directions == 2:
            hidden_cat = torch.cat([hidden[-1], hidden[-2]], dim=1)
        else:
            hidden_cat = hidden[-1]

        return self.fc(hidden_cat)


def trainModel():
    """Run one training epoch over train_loader."""
    total_loss = 0
    for i, (phrase, sentiment) in enumerate(train_loader, 1):
        inputs, seq_lengths, target = make_tensor(phrase, sentiment)
        output = classifier(inputs, seq_lengths)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if i % 10 == 0:
            print(f'[{time_since(start)}] Epoch {epoch}', end='')
            print(f'[{i * len(inputs)}/{len(train_set)}]', end='')
            print(f'loss={total_loss / (i * len(inputs))}')


def evalModel():
    """Evaluate on the validation set; return the accuracy in [0, 1]."""
    correct = 0
    total = len(validation_set)
    print("Evaluating trained model...")
    with torch.no_grad():
        for i, (phrase, sentiment) in enumerate(validation_loader, 1):
            inputs, seq_lengths, target = make_tensor(phrase, sentiment)
            # lengths are moved to CPU inside forward(); no .to('cpu') needed here
            output = classifier(inputs, seq_lengths)
            pred = output.max(dim=1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

        percent = '%.2f' % (100 * correct / total)
        print(f'Test set: Accuracy {correct}/{total} {percent}%')
    return correct / total


def get_test_set():
    """Load the Kaggle test split; return (PhraseId, Phrase) columns."""
    # FIX: the separator was passed positionally ('\t' as the 2nd argument),
    # which pandas deprecates/rejects - pass it as the sep keyword.
    test_set = pd.read_csv('sentiment-analysis-on-movie-reviews/test.tsv', sep='\t')
    PhraseId = test_set['PhraseId']
    Phrase = test_set['Phrase']
    return PhraseId, Phrase


def make_tensor_test(phrase):
    """Like make_tensor but without labels; also returns org_idx, the inverse
    permutation used to restore each batch's original phrase order."""
    sequences_and_lengths = [phrase2list(p) for p in phrase]
    phrase_sequences = [sl[0] for sl in sequences_and_lengths]
    seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])

    # Paste each sequence onto an all-zero tensor: batchSize x seqLen.
    seq_tensor = torch.zeros(len(phrase_sequences), seq_lengths.max()).long()
    for idx, (seq, seq_len) in enumerate(zip(phrase_sequences, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)

    # Sort by descending length for pack_padded_sequence; keep the inverse
    # permutation so predictions can be put back into submission order.
    seq_lengths, perm_idx = seq_lengths.sort(dim=0, descending=True)
    seq_tensor = seq_tensor[perm_idx]
    _, org_idx = perm_idx.sort(descending=False)
    return create_tensor(seq_tensor), create_tensor(seq_lengths), org_idx


def predict():
    """Predict sentiment for the test split and write SA_predict.csv."""
    PhraseId, Phrase = get_test_set()
    sentiment_list = []
    batchNum = math.ceil(PhraseId.shape[0] / BATCH_SIZE)
    # NOTE: torch.load unpickles a whole model object - only load files you
    # trust. map_location keeps this working on CPU-only machines.
    classifier = torch.load(
        'sentiment-analysis-on-movie-reviews/sentimentAnalyst.pkl',
        map_location='cuda:0' if USE_GPU else 'cpu',
    )
    if USE_GPU:
        classifier.to(torch.device("cuda:0"))
    with torch.no_grad():
        for i in range(batchNum):
            print(i)
            if i == batchNum - 1:
                phraseBatch = Phrase[BATCH_SIZE * i:]  # final, possibly short batch
            else:
                phraseBatch = Phrase[BATCH_SIZE * i:BATCH_SIZE * (i + 1)]
            inputs, seq_lengths, org_idx = make_tensor_test(phraseBatch)
            output = classifier(inputs, seq_lengths)
            sentiment = output.max(dim=1, keepdim=True)[1]
            sentiment = sentiment[org_idx].squeeze(1)  # restore original order
            sentiment_list.append(sentiment.cpu().numpy().tolist())

    # Flatten the per-batch lists into one prediction per phrase.
    sentiment_list = list(chain.from_iterable(sentiment_list))
    result = pd.DataFrame({'PhraseId': PhraseId, 'Sentiment': sentiment_list})
    result.to_csv('sentiment-analysis-on-movie-reviews/SA_predict.csv', index=False)


if __name__ == '__main__':
    classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_CLASS, N_LAYER)
    if USE_GPU:
        classifier.to(torch.device("cuda:0"))

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)

    start = time.time()
    print("Training for %d epochs..." % N_EPOCHS)
    acc_list = []
    for epoch in range(1, N_EPOCHS + 1):
        trainModel()
        acc = evalModel()
        acc_list.append(acc)

        # Persist the model whenever it matches the best accuracy so far
        # (acc is already in acc_list, so this holds iff acc is the maximum).
        if acc >= max(acc_list):
            torch.save(classifier, 'sentiment-analysis-on-movie-reviews/sentimentAnalyst.pkl')
            print('Save Model!')

    predict()  # produce predictions on the test split
    # Plot accuracy versus epoch.
    epoch = [epoch + 1 for epoch in range(len(acc_list))]
    plt.plot(epoch, acc_list)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid()
    plt.show()
Save Model!
[26m 52s] Epoch 49[10240/124848]loss=5.978458284516819e-05
[26m 55s] Epoch 49[20480/124848]loss=6.253116080188193e-05
[26m 58s] Epoch 49[30720/124848]loss=6.407238082222951e-05
[27m 0s] Epoch 49[40960/124848]loss=6.285167992245989e-05
[27m 3s] Epoch 49[51200/124848]loss=6.33556558022974e-05
[27m 5s] Epoch 49[61440/124848]loss=6.325708897444807e-05
[27m 7s] Epoch 49[71680/124848]loss=6.395567797881085e-05
[27m 10s] Epoch 49[81920/124848]loss=6.394394872586418e-05
[27m 12s] Epoch 49[92160/124848]loss=6.472848835983313e-05
[27m 15s] Epoch 49[102400/124848]loss=6.566120515344665e-05
[27m 17s] Epoch 49[112640/124848]loss=6.628994719490452e-05
[27m 20s] Epoch 49[122880/124848]loss=6.691597712536653e-05
Evaluating trained model...
Test set: Accuracy 30846/31212 98.83%
Save Model!
[27m 25s] Epoch 50[10240/124848]loss=4.8162178427446636e-05
[27m 28s] Epoch 50[20480/124848]loss=5.085184220661176e-05
[27m 30s] Epoch 50[30720/124848]loss=5.1025171342189426e-05
[27m 32s] Epoch 50[40960/124848]loss=5.125555562699446e-05
[27m 35s] Epoch 50[51200/124848]loss=5.2761899132747204e-05
[27m 37s] Epoch 50[61440/124848]loss=5.2417092532171713e-05
[27m 40s] Epoch 50[71680/124848]loss=5.285588964138047e-05
[27m 42s] Epoch 50[81920/124848]loss=5.33487543634692e-05
[27m 47s] Epoch 50[92160/124848]loss=5.4311194819294746e-05
[27m 50s] Epoch 50[102400/124848]loss=5.5293230507231786e-05
[27m 52s] Epoch 50[112640/124848]loss=5.628204975933345e-05
[27m 55s] Epoch 50[122880/124848]loss=5.696998793306799e-05
Evaluating trained model...
Test set: Accuracy 30848/31212 98.83%
epoch 60多就收敛了
Save Model!
[41m 55s] Epoch 70[10240/124848]loss=1.9616868030425396e-06
[41m 57s] Epoch 70[20480/124848]loss=1.9560199916668354e-06
[42m 0s] Epoch 70[30720/124848]loss=1.953235948803922e-06
[42m 3s] Epoch 70[40960/124848]loss=1.950406559103612e-06
[42m 7s] Epoch 70[51200/124848]loss=1.962950072993408e-06
[42m 9s] Epoch 70[61440/124848]loss=1.984759186749822e-06
[42m 12s] Epoch 70[71680/124848]loss=2.0060957646299876e-06
[42m 15s] Epoch 70[81920/124848]loss=2.02179848116657e-06
[42m 17s] Epoch 70[92160/124848]loss=2.0244396068846983e-06
[42m 20s] Epoch 70[102400/124848]loss=2.034707755456111e-06
[42m 22s] Epoch 70[112640/124848]loss=2.033167483907825e-06
[42m 25s] Epoch 70[122880/124848]loss=2.038910443502573e-06
Evaluating trained model...
Test set: Accuracy 31212/31212 100.00%