《PyTorch 深度学习实践》 (PyTorch Deep Learning Practice) by 刘二大人, Lecture 13

With the same parameters, training takes 15 min on the CPU and 2 min 43 s on the GPU.

# Classify the country of origin from a name
import math
import time
import torch
# plotting
import matplotlib.pyplot as plt
import numpy as np
# reading the dataset
import gzip
import csv

from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# ------------0 Parameters-------------#
HIDDEN_SIZE = 100
BATCH_SIZE = 256
N_LAYER = 2
N_EPOCHS = 100
N_CHARS = 128  # size of the character vocabulary (ASCII)
USE_GPU = True  # set to False to run on the CPU

# ---------------------1 Prepare Data and DataLoader-------------------------------#
class NameDataset(Dataset):
    def __init__(self, is_train_set=True):
        filename = 'names_train.csv.gz' if is_train_set else 'names_test.csv.gz'

        # read the dataset with the gzip and csv modules
        with gzip.open(filename, 'rt') as f:
            reader = csv.reader(f)
            rows = list(reader)  # read row by row: (name, country)

        self.names = [row[0] for row in rows]
        self.len = len(self.names)
        self.countries = [row[1] for row in rows]
        self.country_list = list(sorted(set(self.countries)))  # set: drop duplicates; sorted: sort; list: back to a list
        self.country_dict = self.getCountryDict()
        self.country_num = len(self.country_list)

    def __getitem__(self, index):
        # the name comes back as a string, the country as its index
        return self.names[index], self.country_dict[self.countries[index]]

    def __len__(self):
        return self.len

    def getCountryDict(self):  # Convert the country list into a dictionary.
        country_dict = dict()
        for idx, country_name in enumerate(self.country_list, 0):
            country_dict[country_name] = idx
        return country_dict

    def idx2country(self, index):  # Return the country name for a given index.
        return self.country_list[index]

    def getCountriesNum(self):  # Return the number of countries.
        return self.country_num


# DataLoader
trainset = NameDataset(is_train_set=True)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
testset = NameDataset(is_train_set=False)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)
N_COUNTRY = trainset.getCountriesNum()


# ------------------------------2 Design Model-----------------------------------#
def create_tensor(tensor):
    if USE_GPU:
        device = torch.device("cuda:0")
        tensor = tensor.to(device)
    return tensor


class RNNClassifier(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirectional=True):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_directions = 2 if bidirectional else 1  # a bidirectional RNN has two directions
        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirectional)
        self.fc = torch.nn.Linear(hidden_size * self.n_directions, output_size)

    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size)
        return create_tensor(hidden)

    def forward(self, input, seq_lengths):
        input = input.t()  # transpose: input shape B x S -> S x B
        batch_size = input.size(1)

        hidden = self._init_hidden(batch_size)  # h0
        embedding = self.embedding(input)  # (seqLen, batchSize, hiddenSize)

        # PackedSequence: drop the zero padding, record each sample's length,
        # sort by length, and concatenate the sequences together
        gru_input = pack_padded_sequence(embedding, seq_lengths)

        output, hidden = self.gru(gru_input, hidden)
        if self.n_directions == 2:  # a bidirectional RNN has two final hidden states
            hidden_cat = torch.cat([hidden[-1], hidden[-2]], dim=1)
        else:
            hidden_cat = hidden[-1]

        fc_output = self.fc(hidden_cat)
        return fc_output


classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_COUNTRY, N_LAYER)

#----------------------3 Construct Loss and Optimizer------------------------------------#
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)


#-----------------------------------4 Train and Test----------------------------------------------------#
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def name2list(name):
    arr = [ord(c) for c in name]  # ord() returns the character's ASCII code
    return arr, len(arr)  # return a tuple: the list itself and its length


def make_tensors(names, countries):
    sequences_and_lengths = [name2list(name) for name in names]
    name_sequences = [sl[0] for sl in sequences_and_lengths]
    seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])
    countries = countries.long()  # countries: country indices

    # make a tensor of names, BatchSize x SeqLen:
    # create an all-zero tensor first, then paste each name onto it
    seq_tensor = torch.zeros(len(name_sequences), seq_lengths.max()).long()
    for idx, (seq, seq_len) in enumerate(zip(name_sequences, seq_lengths), 0):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)

    # sort by length to use pack_padded_sequence;
    # sort returns two values: seq_lengths, the sorted lengths (before padding),
    # and perm_idx, the original index of each sorted element
    seq_lengths, perm_idx = seq_lengths.sort(dim=0, descending=True)
    seq_tensor = seq_tensor[perm_idx]  # reorder the padded sequences
    countries = countries[perm_idx]  # reorder the labels
    return create_tensor(seq_tensor), create_tensor(seq_lengths), create_tensor(countries)


def trainModel():
    total_loss = 0
    for i, (names, countries) in enumerate(trainloader, 1):
        inputs, seq_lengths, target = make_tensors(names, countries)
        output = classifier(inputs, seq_lengths.to('cpu'))  # pack_padded_sequence needs lengths on the CPU
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if i % 10 == 0:
            print(f'[{time_since(start)}] Epoch {epoch} ', end='')
            print(f'[{i * len(inputs)}/{len(trainset)}] ', end='')
            print(f'loss={total_loss / (i * len(inputs))}')
    return total_loss


# test routine
def testModel():
    correct = 0
    total = len(testset)
    print("evaluating trained model ...")
    with torch.no_grad():
        for i, (names, countries) in enumerate(testloader, 1):
            inputs, seq_lengths, target = make_tensors(names, countries)
            output = classifier(inputs, seq_lengths.to('cpu'))
            pred = output.max(dim=1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()
        percent = '%.2f' % (100 * correct / total)
        print(f'Test set: Accuracy {correct}/{total} {percent}%')
    return correct / total


if __name__ == '__main__':
    if USE_GPU:
        device = torch.device("cuda:0")
        classifier.to(device)
    start = time.time()
    print("Training for %d epochs..." % N_EPOCHS)
    acc_list = []
    # Train cycle: in every epoch, train and then test the model once.
    for epoch in range(1, N_EPOCHS + 1):
        trainModel()
        acc = testModel()
        acc_list.append(acc)

    # plot accuracy over epochs
    epoch = np.arange(1, len(acc_list) + 1, 1)
    acc_list = np.array(acc_list)
    plt.plot(epoch, acc_list)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid()
    plt.show()

evaluating trained model ...
Test set: Accuracy 5599/6700 83.57%
[2m 41s] Epoch 100 [2560/13374] loss=0.00011349248889018782
[2m 42s] Epoch 100 [5120/13374] loss=0.00012008407356915996
[2m 42s] Epoch 100 [7680/13374] loss=0.0001346439957463493
[2m 42s] Epoch 100 [10240/13374] loss=0.00013780106764897936
[2m 43s] Epoch 100 [12800/13374] loss=0.00014130977695458568
evaluating trained model ...
Test set: Accuracy 5607/6700 83.69%
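
The core of this example is the pad -> sort -> pack pipeline in make_tensors and forward above. Below is a minimal sketch of the same steps on a hand-made two-name batch; the names and the embedding/GRU sizes are invented for illustration, and only plain PyTorch is assumed:

# A minimal sketch of the pad -> sort -> pack pipeline used above.
# The toy batch and layer sizes are made up for illustration.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

names = ['Adam', 'Li']
seqs = [[ord(c) for c in n] for n in names]          # ASCII codes per name
lengths = torch.LongTensor([len(s) for s in seqs])   # tensor([4, 2])

# pad to BatchSize x MaxLen with zeros
seq_tensor = torch.zeros(len(seqs), lengths.max()).long()
for i, (s, l) in enumerate(zip(seqs, lengths)):
    seq_tensor[i, :l] = torch.LongTensor(s)

# sort by length, descending, as pack_padded_sequence expects by default
lengths, perm_idx = lengths.sort(dim=0, descending=True)
seq_tensor = seq_tensor[perm_idx]

# embed (SeqLen x Batch x 8) and pack: only non-padding steps are kept
emb = torch.nn.Embedding(128, 8)(seq_tensor.t())
packed = pack_padded_sequence(emb, lengths)
print(packed.batch_sizes)  # tensor([2, 2, 1, 1]): both names active for 2 steps, then only 'Adam'

# run a 2-layer bidirectional GRU, then concatenate the last layer's two final hidden states
gru = torch.nn.GRU(8, 16, num_layers=2, bidirectional=True)
out, hidden = gru(packed)
print(hidden.shape)        # torch.Size([4, 2, 16]): n_layers * n_directions x batch x hidden
hidden_cat = torch.cat([hidden[-1], hidden[-2]], dim=1)
print(hidden_cat.shape)    # torch.Size([2, 32]): the input to the final Linear layer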

Homework:

'''
Sentiment Analysis on Movie Reviews with an RNN
From: https://blog.csdn.net/qq_39187959/article/details/121102959
'''
import math
import torch
from itertools import chain
import pandas as pd
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
import time
import matplotlib.pyplot as plt
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

class SAData(Dataset):
    def __init__(self, train):
        # build the data samples
        self.train = train
        self.data = pd.read_csv('sentiment-analysis-on-movie-reviews/train.tsv', sep='\t')

        # Randomly pick 80% of the rows as the training set; slicing by index
        # order would not give a representative sample. Note that sampling again
        # with frac=0.2 and the same random_state would largely overlap the
        # training rows, so the validation set is taken as the complement.
        train_part = self.data.sample(frac=0.8, replace=False, random_state=1, axis=0)
        if self.train:
            self.data = train_part.reset_index(drop=True)  # regenerate the index
            ### for the final run, train on all of the data instead ###
        else:
            # the remaining 20% is the validation set
            self.data = self.data.drop(train_part.index).reset_index(drop=True)
        self.len = self.data.shape[0]
        self.x_data, self.y_data = self.data['Phrase'], self.data['Sentiment']

    def __getitem__(self, index):
        # fetch a sample by index
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        # return the dataset length
        return self.len


# training and validation dataset objects
train_set = SAData(train=True)
validation_set = SAData(train=False)

# Hyper Parameters
N_CHARS = 128  # number of ASCII codes
HIDDEN_SIZE = 128
N_LAYER = 2
BATCH_SIZE = 1024
N_EPOCHS = 50  # on a personal machine, consider 10 instead: training takes a long time, and 100 epochs exhausted my hardware
USE_GPU = True
N_CLASS = len(set(train_set.y_data))

# DataLoaders for the training and validation sets
train_loader = DataLoader(
    dataset=train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    # num_workers=2
)

validation_loader = DataLoader(
    dataset=validation_set,
    batch_size=BATCH_SIZE,
    shuffle=False,  # keeping the validation set unshuffled makes the results easier to inspect
    # num_workers=2
)


def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def phrase2list(phrase):
    arr = [ord(c) for c in phrase]  # ord() returns the character's ASCII code
    return arr, len(arr)


def create_tensor(tensor):
    if USE_GPU:
        device = torch.device('cuda:0')
        tensor = tensor.to(device)
    return tensor


def make_tensor(phrase, sentiment):
    sequences_and_lengths = [phrase2list(p) for p in phrase]  # phrase string -> char list -> ASCII codes
    phrase_sequences = [sl[0] for sl in sequences_and_lengths]
    seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])
    sentiment = sentiment.long()

    # make a tensor of phrases, batchSize x seqLen
    seq_tensor = torch.zeros(len(phrase_sequences), seq_lengths.max()).long()
    for idx, (seq, seq_len) in enumerate(zip(phrase_sequences, seq_lengths)):  # zero padding
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)  # positions beyond each phrase's length stay zero

    # sort by length to use pack_padded_sequence
    seq_lengths, perm_idx = seq_lengths.sort(dim=0, descending=True)  # perm_idx holds each sorted element's original index
    seq_tensor = seq_tensor[perm_idx]  # reorder the padded sequences by length
    sentiment = sentiment[perm_idx]

    return create_tensor(seq_tensor), create_tensor(seq_lengths), create_tensor(sentiment)


class RNNClassifier(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirectional=True):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_directions = 2 if bidirectional else 1

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirectional)
        self.fc = torch.nn.Linear(hidden_size * self.n_directions, output_size)

    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size)
        return create_tensor(hidden)

    def forward(self, input, seq_lengths):
        input = input.t()  # transpose: B x S -> S x B
        batch_size = input.size(1)

        hidden = self._init_hidden(batch_size)
        embedding = self.embedding(input)

        # 'pack' is best read as 'compress': a padded variable-length batch is
        # packed tight, since padding adds redundant zero steps
        gru_input = pack_padded_sequence(embedding, seq_lengths.to('cpu'))  # pack them up

        output, hidden = self.gru(gru_input, hidden)
        if self.n_directions == 2:
            hidden_cat = torch.cat([hidden[-1], hidden[-2]], dim=1)
        else:
            hidden_cat = hidden[-1]

        fc_output = self.fc(hidden_cat)
        return fc_output


def trainModel():
    total_loss = 0
    for i, (phrase, sentiment) in enumerate(train_loader, 1):
        inputs, seq_lengths, target = make_tensor(phrase, sentiment)
        output = classifier(inputs, seq_lengths)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if i % 10 == 0:
            print(f'[{time_since(start)}] Epoch {epoch}', end='')
            print(f'[{i * len(inputs)}/{len(train_set)}]', end='')
            print(f'loss={total_loss / (i * len(inputs))}')


def evalModel():
    correct = 0
    total = len(validation_set)
    print("Evaluating trained model...")
    with torch.no_grad():
        for i, (phrase, sentiment) in enumerate(validation_loader, 1):
            inputs, seq_lengths, target = make_tensor(phrase, sentiment)
            output = classifier(inputs, seq_lengths.to('cpu'))
            pred = output.max(dim=1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

        percent = '%.2f' % (100 * correct / total)
        print(f'Test set: Accuracy {correct}/{total} {percent}%')
    return correct / total


# load the test set
def get_test_set():
    test_set = pd.read_csv('sentiment-analysis-on-movie-reviews/test.tsv', sep='\t')
    PhraseId = test_set['PhraseId']
    Phrase = test_set['Phrase']
    return PhraseId, Phrase


# text-processing function written for the test set
def make_tensor_test(phrase):
    sequences_and_lengths = [phrase2list(p) for p in phrase]  # phrase string -> char list -> ASCII codes
    phrase_sequences = [sl[0] for sl in sequences_and_lengths]
    seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])

    # make a tensor of phrases, batchSize x seqLen
    seq_tensor = torch.zeros(len(phrase_sequences), seq_lengths.max()).long()
    for idx, (seq, seq_len) in enumerate(zip(phrase_sequences, seq_lengths)):  # zero padding
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)  # positions beyond each phrase's length stay zero

    # sort by length to use pack_padded_sequence
    seq_lengths, perm_idx = seq_lengths.sort(dim=0, descending=True)  # perm_idx holds each sorted element's original index
    seq_tensor = seq_tensor[perm_idx]  # reorder the padded sequences by length
    # Sorting shuffles the phrases within each test batch, so record org_idx,
    # the inverse permutation, to restore the predictions to their original order.
    _, org_idx = perm_idx.sort(descending=False)
    return create_tensor(seq_tensor), create_tensor(seq_lengths), org_idx


def predict():
    # run the trained model on the test set
    PhraseId, Phrase = get_test_set()  # with sep passed by keyword, this no longer triggers a pandas warning
    sentiment_list = []  # list of predictions
    batchNum = math.ceil(PhraseId.shape[0] / BATCH_SIZE)  # total number of batches
    classifier = torch.load('sentiment-analysis-on-movie-reviews/sentimentAnalyst.pkl')
    if USE_GPU:
        device = torch.device("cuda:0")
        classifier.to(device)
    with torch.no_grad():
        for i in range(batchNum):
            print(i)
            if i == batchNum - 1:
                phraseBatch = Phrase[BATCH_SIZE * i:]  # the last batch may be smaller than BATCH_SIZE
            else:
                phraseBatch = Phrase[BATCH_SIZE * i:BATCH_SIZE * (i + 1)]
            inputs, seq_lengths, org_idx = make_tensor_test(phraseBatch)
            output = classifier(inputs, seq_lengths)
            sentiment = output.max(dim=1, keepdim=True)[1]
            sentiment = sentiment[org_idx].squeeze(1)
            sentiment_list.append(sentiment.cpu().numpy().tolist())

    sentiment_list = list(chain.from_iterable(sentiment_list))  # flatten the per-batch lists into one list
    result = pd.DataFrame({'PhraseId': PhraseId, 'Sentiment': sentiment_list})
    result.to_csv('sentiment-analysis-on-movie-reviews/SA_predict.csv', index=False)  # save the predictions


# Main cycle
if __name__ == '__main__':
    classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_CLASS, N_LAYER)
    if USE_GPU:
        device = torch.device("cuda:0")
        classifier.to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)

    start = time.time()
    print("Training for %d epochs..." % N_EPOCHS)
    acc_list = []
    for epoch in range(1, N_EPOCHS + 1):
        trainModel()
        acc = evalModel()
        acc_list.append(acc)

        # save the model whenever it reaches the best accuracy so far
        if acc >= max(acc_list):
            torch.save(classifier, 'sentiment-analysis-on-movie-reviews/sentimentAnalyst.pkl')
            print('Save Model!')

    predict()  # predict on the test set
    # Plot Accuracy
    epoch = [epoch + 1 for epoch in range(len(acc_list))]
    plt.plot(epoch, acc_list)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid()
    plt.show()
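
One step in make_tensor_test worth isolating is how the original batch order comes back: sorting perm_idx itself yields the inverse permutation. A minimal sketch with made-up lengths and predictions:

# A minimal sketch of the inverse-permutation trick used in make_tensor_test.
# All values are made up for illustration.
import torch

seq_lengths = torch.LongTensor([2, 5, 3])        # lengths of samples 0, 1, 2
sorted_lengths, perm_idx = seq_lengths.sort(descending=True)
print(perm_idx)                                  # tensor([1, 2, 0]): sorted batch is samples 1, 2, 0

# pretend predictions in sorted-batch order: 11 for sample 1, 12 for sample 2, 10 for sample 0
preds_sorted = torch.tensor([11, 12, 10])

# sorting perm_idx gives the inverse permutation
_, org_idx = perm_idx.sort(descending=False)
print(preds_sorted[org_idx])                     # tensor([10, 11, 12]): original order restored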

Save Model!
[26m 52s] Epoch 49[10240/124848]loss=5.978458284516819e-05
[26m 55s] Epoch 49[20480/124848]loss=6.253116080188193e-05
[26m 58s] Epoch 49[30720/124848]loss=6.407238082222951e-05
[27m 0s] Epoch 49[40960/124848]loss=6.285167992245989e-05
[27m 3s] Epoch 49[51200/124848]loss=6.33556558022974e-05
[27m 5s] Epoch 49[61440/124848]loss=6.325708897444807e-05
[27m 7s] Epoch 49[71680/124848]loss=6.395567797881085e-05
[27m 10s] Epoch 49[81920/124848]loss=6.394394872586418e-05
[27m 12s] Epoch 49[92160/124848]loss=6.472848835983313e-05
[27m 15s] Epoch 49[102400/124848]loss=6.566120515344665e-05
[27m 17s] Epoch 49[112640/124848]loss=6.628994719490452e-05
[27m 20s] Epoch 49[122880/124848]loss=6.691597712536653e-05
Evaluating trained model...
Test set: Accuracy 30846/31212 98.83%
Save Model!
[27m 25s] Epoch 50[10240/124848]loss=4.8162178427446636e-05
[27m 28s] Epoch 50[20480/124848]loss=5.085184220661176e-05
[27m 30s] Epoch 50[30720/124848]loss=5.1025171342189426e-05
[27m 32s] Epoch 50[40960/124848]loss=5.125555562699446e-05
[27m 35s] Epoch 50[51200/124848]loss=5.2761899132747204e-05
[27m 37s] Epoch 50[61440/124848]loss=5.2417092532171713e-05
[27m 40s] Epoch 50[71680/124848]loss=5.285588964138047e-05
[27m 42s] Epoch 50[81920/124848]loss=5.33487543634692e-05
[27m 47s] Epoch 50[92160/124848]loss=5.4311194819294746e-05
[27m 50s] Epoch 50[102400/124848]loss=5.5293230507231786e-05
[27m 52s] Epoch 50[112640/124848]loss=5.628204975933345e-05
[27m 55s] Epoch 50[122880/124848]loss=5.696998793306799e-05
Evaluating trained model...
Test set: Accuracy 30848/31212 98.83%

Training converges after 60-odd epochs.

Save Model!
[41m 55s] Epoch 70[10240/124848]loss=1.9616868030425396e-06
[41m 57s] Epoch 70[20480/124848]loss=1.9560199916668354e-06
[42m 0s] Epoch 70[30720/124848]loss=1.953235948803922e-06
[42m 3s] Epoch 70[40960/124848]loss=1.950406559103612e-06
[42m 7s] Epoch 70[51200/124848]loss=1.962950072993408e-06
[42m 9s] Epoch 70[61440/124848]loss=1.984759186749822e-06
[42m 12s] Epoch 70[71680/124848]loss=2.0060957646299876e-06
[42m 15s] Epoch 70[81920/124848]loss=2.02179848116657e-06
[42m 17s] Epoch 70[92160/124848]loss=2.0244396068846983e-06
[42m 20s] Epoch 70[102400/124848]loss=2.034707755456111e-06
[42m 22s] Epoch 70[112640/124848]loss=2.033167483907825e-06
[42m 25s] Epoch 70[122880/124848]loss=2.038910443502573e-06
Evaluating trained model...
Test set: Accuracy 31212/31212 100.00%

 
