PyTorch: Notes on Some Techniques from My Summer Experiments
1. In a Dataset subclass, pad each batch to the longest sentence in that batch (instead of the longest sentence in the whole training set), which saves a little GPU memory.
Taking the data format needed for word segmentation with BiLSTM+CRF as an example, override the collate_fn method:
```python
import numpy as np
import torch
from torch.utils.data import Dataset


class SegDataset(Dataset):
    def __init__(self, data_path, word2idx, word_pad_idx=0, label_pad_idx=-1):
        super(SegDataset, self).__init__()
        self.texts = load_data(data_path)  # load_data returns one tokenized line per sample
        self.word2idx = word2idx

        self.word_pad_idx = word_pad_idx
        self.label_pad_idx = label_pad_idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]  # one line of data, e.g. ['西充县', '人民', '法院'], no BOS/EOS tokens

        token_ids, label = [], []
        for item in text:
            tokens = list(item)
            token_ids.extend([self.word2idx.get(token, 1) for token in tokens])  # index 1 is <unk>
            if len(tokens) == 1:
                label.append(1)
            else:
                label.extend([0] * (len(tokens) - 1) + [1])  # label: e.g. [0, 0, 1, 0, 1, 0, 1]

        return torch.tensor(token_ids), torch.tensor(label)

    # pad every sample to the longest length within the batch
    def collate_fn(self, batch):
        token_ids = [x[0] for x in batch]
        labels = [x[1] for x in batch]

        batch_len = len(token_ids)  # batch size
        max_len = max([len(label) for label in labels])  # longest length in this batch

        # initialize the padded arrays
        batch_data = self.word_pad_idx * np.ones((batch_len, max_len))
        batch_labels = self.label_pad_idx * np.ones((batch_len, max_len))

        # padding and aligning
        for j in range(batch_len):
            cur_len = len(token_ids[j])
            batch_data[j][:cur_len] = token_ids[j]
            batch_labels[j][:cur_len] = labels[j]

        # convert data to torch LongTensors
        batch_data = torch.tensor(batch_data, dtype=torch.long)
        batch_labels = torch.tensor(batch_labels, dtype=torch.long)
        return [batch_data, batch_labels]
```
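For reference, a minimal sketch of wiring this dataset into a DataLoader; the data path and word2idx below are placeholders, not part of the original code:

```python
from torch.utils.data import DataLoader

# hypothetical path/vocabulary; adjust to your own setup
dataset = SegDataset('./data/train.txt', word2idx)
loader = DataLoader(dataset, batch_size=32, shuffle=True,
                    collate_fn=dataset.collate_fn)  # pass the custom collate_fn here

for batch_data, batch_labels in loader:
    # batch_data / batch_labels are padded only to this batch's max length
    pass
```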
2. Building the embedding matrix from a word-vector file
In the code below the word-vector dimension is 300 (EMBEDDING_DIM = 300):
```python
import numpy as np
import torch
from tqdm import tqdm

EMBEDDING_DIM = 300


def load_embedding(fpath):
    word2idx = {}
    wordemb = []
    word2idx['<pad>'] = 0
    wordemb.append(np.random.uniform(-0.01, 0.01, EMBEDDING_DIM).tolist())
    word2idx['<unk>'] = 1
    wordemb.append(np.random.uniform(-0.01, 0.01, EMBEDDING_DIM).tolist())
    with open(fpath, 'r') as f:
        for line in tqdm(f):
            splt = line.split()
            if len(splt) != 301:  # skip malformed lines (word + 300 floats expected)
                continue
            vector = list(map(float, splt[-EMBEDDING_DIM:]))
            word = splt[0]
            if word not in word2idx:
                word2idx[word] = len(word2idx)
                wordemb.append(vector)
    return word2idx, np.asarray(wordemb, np.float32)


DICT_PATH = './data/embedding/token_vec_300.txt'
word2idx, wordemb = load_embedding(DICT_PATH)
pretrained_embedding = torch.from_numpy(wordemb)
print('pretrained_embedding=', pretrained_embedding.shape)

# model.embedding.weight.data.copy_(pretrained_embedding)
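A short sketch (assuming the shapes built above) of plugging the matrix into a model's embedding layer:

```python
import torch.nn as nn

# either copy the matrix into an existing layer (as in the commented line above) ...
embedding = nn.Embedding(len(word2idx), EMBEDDING_DIM, padding_idx=0)
embedding.weight.data.copy_(pretrained_embedding)

# ... or build the layer directly from the pretrained matrix
embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=False)
```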
3. Using torchcrf
3.1 Computing the loss
Taking the model definition for Chinese word segmentation as an example:
```python
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from torchcrf import CRF


class BertSeg(BertPreTrainedModel):
    def __init__(self, config):
        super(BertSeg, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.layernorm = nn.LayerNorm(config.hidden_size)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)

        self.init_weights()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, inputs_embeds=None, head_mask=None):

        outputs = self.bert(input_ids,  # [batch, max_batch_len]
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)
        sequence_output = outputs[0]  # last_hidden_state, shape = [batch, max_batch_len, hidden_size]

        # keep only the positions selected by attention_mask, then re-pad to the batch max length
        origin_sequence_output = [layer[starts.nonzero().squeeze(1)]
                                  for layer, starts in zip(sequence_output, attention_mask)]
        padded_sequence_output = pad_sequence(origin_sequence_output, batch_first=True)
        padded_sequence_output = self.dropout(padded_sequence_output)
        logits = self.classifier(padded_sequence_output)

        # sequence_output = self.dropout(sequence_output)
        # logits = self.classifier(sequence_output)  # [batch, max_batch_len, 2]
        outputs = (logits,)

        '''
        # alternative: plain cross-entropy loss instead of the CRF
        if labels is not None:  # labels present -> training mode
            loss_fct = nn.CrossEntropyLoss()

            loss_mask = labels.gt(-1)  # only positions with a valid label contribute to the loss
            active_loss = loss_mask.view(-1) == 1

            active_labels = labels.view(-1)[active_loss]                    # [batch*max_batch_len]
            active_logits = logits.view(-1, self.num_labels)[active_loss]  # [batch*max_batch_len, 2]

            loss = loss_fct(active_logits, active_labels)
            outputs = (loss,) + outputs
        '''

        # CRF: labels padded with -1 are masked out; torchcrf returns the
        # log-likelihood, so negate it to get a loss
        if labels is not None:
            loss_mask = labels.gt(-1)
            loss = self.crf(logits, labels, loss_mask) * (-1)
            outputs = (loss,) + outputs

        return outputs
```
Once defined, BertSeg is used in exactly the same way as BertForSequenceClassification.
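As a rough sketch of one training step (the optimizer and the batch tensors are assumed to come from the SegDataset loader above, with pad index 0; shapes must line up with the attention_mask trimming inside forward):

```python
model.train()
outputs = model(input_ids=batch_data,
                attention_mask=batch_data.gt(0).long(),  # assumes pad index 0
                labels=batch_labels)
loss, logits = outputs[0], outputs[1]
loss.backward()
optimizer.step()
optimizer.zero_grad()
```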
3.2 Decoding
```python
from transformers import BertConfig

bert_config = BertConfig.from_pretrained(DIR + 'config.json', num_labels=NUM_LABELS)
model = BertSeg.from_pretrained(DIR, config=bert_config)
model_file = DIR + 'pytorch_model.bin'
model.load_state_dict(torch.load(model_file), strict=False)

# after training, decode with the CRF; pre_logits are the emission scores from the model.
# crf.decode returns a list of best label sequences (one list of ints per sample)
logits = model.crf.decode(pre_logits, mask=None)

# plain argmax decoding without the CRF
logits = torch.argmax(pre_logits, dim=-1)
```
4. A fully connected layer over sparse-matrix input
4.1 Using torch.sparse
```python
i = torch.LongTensor([[0, 2], [1, 0], [1, 2]])  # these are indices, not list values
v = torch.FloatTensor([3, 4, 5])
res = torch.sparse.FloatTensor(i.t(), v, torch.Size([2, 3]))  # .to_dense()
# (torch.sparse_coo_tensor is the newer equivalent constructor)
print(res.to_dense())  # a regular Tensor
# tensor([[0., 0., 3.],
#         [4., 0., 5.]])

print(res._indices())
# tensor([[0, 1, 1],
#         [2, 0, 2]])
print(res._values())  # tensor([3., 4., 5.])
```
4.2 Rewriting nn.Linear so it can take sparse (index, value) triples as input
Reimplement LinearFunction following the official docs: https://pytorch.org/docs/stable/notes/extending.html
```python
import torch
import torch.nn as nn


class LinearFunction(torch.autograd.Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, idxes, values, weight, bias=None):  # [b, max_len], [b, max_len], [2, vocab_size]
        ctx.save_for_backward(idxes, values, weight, bias)
        output = torch.zeros((idxes.size(0), weight.size(0))).cuda()  # [b, 2]

        for i in range(idxes.size(0)):  # iterate over the batch
            rows = idxes[i]             # indices of all non-zero entries
            x = weight.t()[rows]        # pick the corresponding rows of the weight matrix

            mask = values[i].gt(-1)
            mat = x[mask] * values[i].view(-1, 1)[mask]
            # mat = x * values[i]
            res = mat.sum(dim=0)
            output[i] = res

        # output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # Unpack saved_tensors and initialize all gradients w.r.t. inputs to None.
        # Trailing Nones are ignored, so the return statement stays simple even
        # when the function has optional inputs.
        idxes, values, weight, bias = ctx.saved_tensors
        grad_weight = grad_bias = None

        # The needs_input_grad checks are optional and only improve efficiency;
        # returning gradients for inputs that don't require them is not an error.

        # rebuild idxes/values as a sparse matrix
        b = idxes.size(0)  # [b, max_len]
        new_idxes = []
        for i in range(b):
            mask = values[i].gt(-1)
            new_idxes.extend([[i, val] for val in idxes[i][mask]])
            # new_idxes.extend([[i, val] for val in idxes[i]])
        all_mask = values.gt(-1)
        new_values = torch.FloatTensor(values[all_mask].cpu().numpy())  # a CUDA tensor can't be cast directly
        new_idxes = torch.LongTensor(new_idxes)

        input = torch.sparse.FloatTensor(new_idxes.t(), new_values,
                                         torch.Size([b, VOCAB_SIZE])).to_dense().cuda()
        if ctx.needs_input_grad[2]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[3]:
            grad_bias = grad_output.sum(0)

        return None, None, grad_weight, grad_bias  # idxes and values get no gradient


class CustomizedLinear(nn.Module):
    def __init__(self, input_features, output_features, bias=True):
        super(CustomizedLinear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features

        # nn.Parameter is a special kind of Tensor that is automatically registered
        # as a module parameter once assigned as an attribute. Parameters and buffers
        # must be registered, or they won't appear in .parameters() and won't be moved
        # when e.g. .cuda() is called; use .register_buffer() for buffers.
        # nn.Parameters require gradients by default.
        self.weight = nn.Parameter(torch.empty(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(output_features))
        else:
            # Always register all possible parameters; optional ones can be None.
            self.register_parameter('bias', None)

        # Not a very smart way to initialize weights
        self.weight.data.uniform_(-0.1, 0.1)
        if self.bias is not None:
            self.bias.data.uniform_(-0.1, 0.1)

    def forward(self, idxes, values):
        # See the autograd notes for an explanation of what happens here.
        return LinearFunction.apply(idxes, values, self.weight, self.bias)

    def extra_repr(self):
        # (Optional) extra information about this module; shows up when printing the module.
        return 'input_features={}, output_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )


class FcModel(nn.Module):
    def __init__(self, vocab_size, hidden_dim, output_size):  # vocab_size = vocabulary length
        super(FcModel, self).__init__()
        self.fc = CustomizedLinear(vocab_size, output_size)

    def forward(self, idxes, values):  # [batch, seq_len]
        x = self.fc(idxes, values)
        return x
```
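A hypothetical usage sketch (the concrete sizes and the -1 padding convention are assumptions; a GPU is required because forward hardcodes .cuda(), and the module-level VOCAB_SIZE must be the same constant that backward uses):

```python
VOCAB_SIZE = 10000
model = FcModel(VOCAB_SIZE, hidden_dim=0, output_size=2).cuda()  # hidden_dim is unused by FcModel

# each row of `idxes` holds the non-zero column indices of one bag-of-words vector,
# `values` holds the matching counts; both are padded with -1 so that the gt(-1)
# masks in LinearFunction ignore the padding
idxes = torch.LongTensor([[3, 17, 256, -1], [42, 981, -1, -1]]).cuda()
values = torch.FloatTensor([[1., 2., 1., -1.], [1., 1., -1., -1.]]).cuda()

logits = model(idxes, values)  # shape [2, 2]
```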
5. Converting a tensor to numpy and then to a list

```python
# if logits is a tensor (possibly on the GPU and still attached to the autograd graph)
logits.detach().cpu().numpy().tolist()
```
6. The tensor flip function .flip()

```python
x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
print(x.flip(dims=[0]))  # tensor([[ 6,  7,  8,  9, 10], [ 1,  2,  3,  4,  5]])
print(x.flip(dims=[1]))  # tensor([[ 5,  4,  3,  2,  1], [10,  9,  8,  7,  6]])
```
7. Computing the loss on a batch with padding (the padded positions must be masked)

```python
loss_fct = nn.CrossEntropyLoss()
loss_mask = labels.gt(-1)  # only positions with a valid label (pad value is -1) contribute to the loss

active_loss = loss_mask.view(-1) == 1

# replace padded positions with ignore_index so CrossEntropyLoss skips them
active_labels = torch.where(active_loss, labels.view(-1),
                            torch.tensor(loss_fct.ignore_index).type_as(labels))
active_logits = logits.view(-1, self.num_labels)

loss = loss_fct(active_logits, active_labels)

# or, equivalently, keep only the valid positions explicitly
# active_labels = labels.view(-1)[active_loss]                    # [batch*max_batch_len]
# active_logits = logits.view(-1, self.num_labels)[active_loss]   # [batch*max_batch_len, 2]
# loss = loss_fct(active_logits, active_labels)
```
8. Top-k with the heapq library

```python
import heapq

pre_values = [0.9983519315719604, 0.9670383334159851, 0.5502673387527466, 0.9989173412322998,
              0.9934349060058594, 0.9387616515159607, 0.8802109956741333, 0.8454128503799438,
              0.9943495392799377, 0.9983519315719604, 0.9847546815872192, 0.9931608438491821,
              0.9957965612411499, 0.9994950294494629, 0.724168598651886, 0.6788691878318787,
              0.9875763654708862, 0.9983518123626709, 0.9984715580940247, 0.9971969127655029,
              0.9934577345848083, 0.9863535165786743, 0.995525062084198, 0.6242473125457764,
              0.8157297968864441, 0.9625718593597412, 0.9997809529304504, 0.9988841414451599,
              0.9907870888710022, 0.9848848581314087, 0.9409478902816772, 0.9791840314865112,
              0.9999555349349976, 0.9967131614685059, 0.9983519315719604, 0.9056358933448792,
              0.9983962178230286, 0.657272458076477, 0.9997139573097229, 0.9874300360679626,
              0.9905149936676025, 0.8445992469787598, 0.9971678853034973, 0.9983079433441162,
              0.9997274279594421, 0.9978759288787842, 0.9941542744636536, 0.8210688829421997,
              0.8894577026367188, 0.9996976852416992]

# top-k values
pre = heapq.nlargest(10, pre_values)
# [0.9999555349349976, 0.9997809529304504, 0.9997274279594421, 0.9997139573097229,
#  0.9996976852416992, 0.9994950294494629, 0.9989173412322998, 0.9988841414451599,
#  0.9984715580940247, 0.9983962178230286]

# top-k indices
pre_idxes = heapq.nlargest(10, range(len(pre_values)), pre_values.__getitem__)
# [32, 26, 44, 38, 49, 13, 3, 27, 18, 36]
```
9. Shuffling different arrays in the same order

Use case: the Chinese and English corpora (aligned one-to-one, with identical labels) live in separate files; to shuffle them, both lists must be shuffled in the same order. Setting a random seed with random.seed() once is not enough, because two successive random.shuffle() calls still produce different orders.

```python
import numpy as np

cont_ch, cont_en = np.array(cont_ch), np.array(cont_en)
label = np.array(label)

state = np.random.get_state()
np.random.shuffle(cont_ch)
np.random.set_state(state)
np.random.shuffle(cont_en)
np.random.set_state(state)
np.random.shuffle(label)
```
Capture the RNG state with get_state() first, then restore it with set_state() before each subsequent shuffle, so every shuffle draws the same random sequence. These functions belong to np.random (the standard random module offers the analogous random.getstate()/random.setstate()), so the data is converted to numpy arrays first.
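A shorter alternative, sketched here under the assumption that all arrays have the same length, is to draw one shared permutation and index every array with it:

```python
perm = np.random.permutation(len(cont_ch))  # one shared shuffle order
cont_ch, cont_en, label = cont_ch[perm], cont_en[perm], label[perm]
```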
10. Reading and writing CSV files

```python
import csv


def read(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
        reader = csv.reader(f, quotechar='"')
        lines = []
        for line in reader:
            lines.append(line)  # label --> int(line[0]), text --> line[1:]
    return lines


def write(out_file, content):
    # newline='' prevents the csv module from writing blank lines on Windows
    with open(out_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(content)
    print('Write finished!')
```
If the data is not in CSV format to begin with but comes as content and label lists:

```python
def convert(x, y):  # x is the text list, y is the class list
    rows = []
    for i in range(len(y)):
        dic = {}
        dic['class'] = y[i]
        dic['text'] = x[i][:-1]  # drops the trailing '\n'; not always needed
        rows.append(dic)
    return rows


def get_data(content, all_label, new_path):
    all_dic = convert(content, all_label)
    headers = ['class', 'text']
    with open(new_path, 'w', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        # call f_csv.writeheader() first if a header row is wanted
        f_csv.writerows(all_dic)


get_data(contents, labels, './sst-2/dev.csv')
```
11. Converting a CSV file to TSV

```python
import pandas as pd

pd_all = pd.read_csv("./yelp/unlabel_8+train.csv", sep=',', encoding='utf-8')
# save as a TSV file
pd_all.to_csv("./yelp_tsv/unlabel_8+train.tsv", index=False, sep='\t', encoding='utf-8')
```
12. Calling the Baidu translation API to translate a document sentence by sentence

```python
# Baidu general translation API
# coding=utf-8

import http.client
import hashlib
import urllib.parse
import random
import json

appid = 'your appid here'
secretKey = 'your secret key here'


def baiduAPI(line, fromLang='auto', toLang='zh', myurl='/api/trans/vip/translate'):
    salt = random.randint(32768, 65536)
    q = line
    sign = appid + q + str(salt) + secretKey
    sign = hashlib.md5(sign.encode()).hexdigest()
    myurl = (myurl + '?appid=' + appid + '&q=' + urllib.parse.quote(q)
             + '&from=' + fromLang + '&to=' + toLang
             + '&salt=' + str(salt) + '&sign=' + sign)

    httpClient = http.client.HTTPConnection('api.fanyi.baidu.com')
    httpClient.request('GET', myurl)

    # response is an HTTPResponse object
    response = httpClient.getresponse()
    result_all = response.read().decode("utf-8")
    result = json.loads(result_all)
    httpClient.close()
    # print(result)
    return result['trans_result'][0]['dst']
```
The appid and secret key are obtained by registering on the official website.
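As a rough driver, a sketch that translates an English file into Chinese line by line (the file paths and the one-second throttle are assumptions, not part of the original code):

```python
import time

with open('./data/en.txt', 'r', encoding='utf-8') as fin, \
     open('./data/zh.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        fout.write(baiduAPI(line, fromLang='en', toLang='zh') + '\n')
        time.sleep(1)  # crude throttling; the free tier limits request frequency
```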
13. Plotting performance trends with matplotlib

Reference blog: https://www.cnblogs.com/douzujun/p/14974164.html

```python
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

print(matplotlib.matplotlib_fname())  # path of the active matplotlibrc
print(matplotlib.get_configdir())     # matplotlib config directory

# the 'no-latex' style is provided by the SciencePlots package
with plt.style.context(['no-latex']):
    # x = np.linspace(0.0, 10.0)
    # y = np.sin(x)
    x = np.array([20, 40, 60, 80])
    pred = np.array([[87.55, 87.55, 87.55, 87.55],
                     [84.78, 86.6, 86.89, 86.85],
                     [85.26, 86.2, 86.08, 86.75],
                     [85.55, 86.75, 87.59, 87.8]])
    styles = ['-', ':', '--', '-.']
    labels = ['Approach1', 'Approach2', 'Approach3', 'Approach4']
    colors = ['b', 'y', 'c', 'r']
    for p, style, label, color in zip(pred, styles, labels, colors):
        plt.plot(x, p, label=label, linestyle=style, color=color, linewidth=2.5)

    # plt.grid(True)       # add a grid
    # plt.ylim((83, 90))   # set y-axis limits
    # plt.tight_layout()
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    # plt.title("Cross-Language", fontsize=18)
    plt.xlabel("The number of unlabeled data", fontsize=18)
    plt.ylabel("F1-score", fontsize=16)

    plt.gca().xaxis.set_major_formatter(mticker.FormatStrFormatter('%dk'))
    plt.gca().yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f %%'))
    plt.legend(edgecolor='k', loc='lower right', fontsize=18)  # pin the legend to the lower-right corner
    plt.show()
```
The resulting figure (four F1-score curves plotted against the number of unlabeled samples) is not reproduced here.
14. In an ensemble, constraining each model's weight to [0, 1] with all weights summing to 1

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ensembleCNN(nn.Module):
    def __init__(self):
        super(ensembleCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocabulary), EMBEDDING_SIZE)
        self.model1 = TextCNN(len(vocabulary), EMBEDDING_SIZE, NUM_LABELS, SENTENCE_LIMIT_SIZE)
        self.model2 = TextCNN(len(vocabulary), EMBEDDING_SIZE, NUM_LABELS, SENTENCE_LIMIT_SIZE)
        self.model3 = TextCNN(len(vocabulary), EMBEDDING_SIZE, NUM_LABELS, SENTENCE_LIMIT_SIZE)
        self.weight = nn.Parameter(torch.rand(NUM_MODELS))  # NUM_MODELS=3 random values drawn from [0, 1)

    def forward(self, x1, x2, x3):
        x1 = self.embedding(x1)
        x2 = self.embedding(x2)
        x3 = self.embedding(x3)

        out1 = self.model1(x1)
        out2 = self.model2(x2)
        out3 = self.model3(x3)

        w = F.softmax(self.weight, dim=0)  # softmax keeps the weights in [0, 1] and makes them sum to 1
        pred_final = w[0] * out1 + w[1] * out2 + w[2] * out3
        return out1, out2, out3, pred_final
```
15. Converting a class index to a one-hot encoding (e.g. with 4 classes, class 2 maps to [0, 0, 1, 0])

```python
# inside a Dataset: self.num_classes is the number of classes, label the class index
y = np.zeros(self.num_classes).astype(np.float32)
y[label] = 1.0
```
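For reference, a sketch of the same conversion with numpy/PyTorch built-ins (the class count and labels below are placeholders):

```python
import numpy as np
import torch
import torch.nn.functional as F

num_classes = 4
labels = torch.tensor([2, 0, 3])

one_hot_np = np.eye(num_classes, dtype=np.float32)[labels.numpy()]  # numpy version
one_hot_pt = F.one_hot(labels, num_classes=num_classes).float()     # PyTorch version
# both give [[0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 0, 1]]
```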