NLP (15): Text Similarity with word2vec + ESIM
1. Preparing the dataset
models\esim_model\my_dataset.py
import torch.utils.data as data


class MyDataset(data.Dataset):
    def __init__(self, texta, textb, label):
        self.texta = texta
        self.textb = textb
        self.label = label

    def __getitem__(self, item):
        texta = self.texta[item]
        textb = self.textb[item]
        label = self.label[item]
        return texta, textb, label

    def __len__(self):
        return len(self.texta)
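A minimal usage sketch (the sentence pairs and labels below are made up for illustration): each sample is a raw (sentence_a, sentence_b, label) triple, batching is left to DataLoader, and the text is only converted to vectors later, inside the training loop.

import torch
from torch.utils.data import DataLoader

# toy data, for illustration only
texta = ["今天天气不错", "我想退货"]
textb = ["今天天气很好", "怎么申请退款"]
label = torch.LongTensor([1, 1])

dataset = MyDataset(texta, textb, label)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

for a, b, y in loader:
    # a and b are tuples of raw strings; y is a LongTensor of shape (batch,)
    print(a, b, y)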
2. Replacing the Embedding layer with word2vec
models\esim_model\my_word2vec.py
from gensim.models.fasttext import FastText
import torch
import numpy as np
import os


class WordEmbedding(object):
    def __init__(self):
        parent_path = os.path.split(os.path.realpath(__file__))[0]
        self.root = parent_path[:parent_path.find("models")]  # E:\personas\semantics\
        self.word_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "word_fasttext.model")
        self.char_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "char_fasttext.model")
        self.model = FastText.load(self.char_fasttext)

    def sentenceTupleToEmbedding(self, data1, data2):
        # pad both batches to the length of the longest sentence in either batch
        aCutListMaxLen = max([len(list(str(sentence_a))) for sentence_a in data1])
        bCutListMaxLen = max([len(list(str(sentence_b))) for sentence_b in data2])
        seq_len = max(aCutListMaxLen, bCutListMaxLen)
        a = self.sqence_vec(data1, seq_len)  # batch_size * seq_len * embedding_dim
        b = self.sqence_vec(data2, seq_len)
        return torch.FloatTensor(a), torch.FloatTensor(b)

    def sqence_vec(self, data, seq_len):
        data_a_vec = []
        for sequence_a in data:
            sequence_vec = []  # seq_len * 128
            for word_a in list(str(sequence_a)):
                if word_a in self.model.wv:
                    sequence_vec.append(self.model.wv[word_a])
            sequence_vec = np.array(sequence_vec)
            # zero-pad up to seq_len
            add = np.zeros((seq_len - sequence_vec.shape[0], 128))
            sequenceVec = np.vstack((sequence_vec, add))
            data_a_vec.append(sequenceVec)
        a_vec = np.array(data_a_vec)
        return a_vec
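A small sketch of the padding behaviour, assuming a trained char_fasttext.model with 128-dimensional vectors exists at the path above (the sentences are made up): both returned tensors share the same seq_len, which is the character length of the longest sentence across the two batches.

word = WordEmbedding()  # loads checkpoints/word2vec/char_fasttext.model

data1 = ("今天天气不错", "我想退货")
data2 = ("今天天气很好", "怎么申请退款的流程")

a, b = word.sentenceTupleToEmbedding(data1, data2)
# the longest sentence has 9 characters, so (assuming every character gets a vector)
# both tensors come back as torch.Size([2, 9, 128]); shorter sentences are zero-padded
print(a.shape, b.shape)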
3. The model
models\esim_model\my_esim.py
import torch
import torch.nn as nn
import torch.nn.functional as F


class ESIM(nn.Module):
    def __init__(self):
        super(ESIM, self).__init__()
        self.dropout = 0.5
        self.hidden_size = 128
        self.embeds_dim = 128
        self.bn_embeds = nn.BatchNorm1d(self.embeds_dim)
        self.lstm1 = nn.LSTM(self.embeds_dim, self.hidden_size, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(self.hidden_size * 8, self.hidden_size, batch_first=True, bidirectional=True)

        self.fc = nn.Sequential(
            nn.BatchNorm1d(self.hidden_size * 8),
            nn.Linear(self.hidden_size * 8, 2),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(2),
            nn.Dropout(self.dropout),
            nn.Linear(2, 2),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(2),
            nn.Dropout(self.dropout),
            nn.Linear(2, 1),
        )

    def soft_attention_align(self, x1, x2):
        '''
        x1: batch_size * seq_len * dim
        x2: batch_size * seq_len * dim
        '''
        # attention: batch_size * seq_len * seq_len
        attention = torch.matmul(x1, x2.transpose(1, 2))
        # padding masks from the reference implementation are not used here,
        # since no mask is passed into forward():
        # mask1 = mask1.float().masked_fill_(mask1, float('-inf'))
        # mask2 = mask2.float().masked_fill_(mask2, float('-inf'))

        # weight: batch_size * seq_len * seq_len
        # weight1 = F.softmax(attention + mask2.unsqueeze(1), dim=-1)
        weight1 = F.softmax(attention, dim=-1)
        x1_align = torch.matmul(weight1, x2)
        # weight2 = F.softmax(attention.transpose(1, 2) + mask1.unsqueeze(1), dim=-1)
        weight2 = F.softmax(attention.transpose(1, 2), dim=-1)
        x2_align = torch.matmul(weight2, x1)

        # x_align: batch_size * seq_len * (2 * hidden_size)
        return x1_align, x2_align

    def submul(self, x1, x2):
        mul = x1 * x2
        sub = x1 - x2
        return torch.cat([sub, mul], -1)

    def apply_multiple(self, x):
        # input: batch_size * seq_len * (2 * hidden_size)
        p1 = F.avg_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
        p2 = F.max_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
        # output: batch_size * (4 * hidden_size)
        return torch.cat([p1, p2], 1)

    def forward(self, x1, x2):
        # x1, x2: batch_size * seq_len * embeds_dim
        # the nn.Embedding lookup of the reference implementation is replaced by
        # pre-computed word2vec/FastText vectors, so these lines are not used:
        # x1 = self.bn_embeds(self.embeds(sent1).transpose(1, 2).contiguous()).transpose(1, 2)
        # x2 = self.bn_embeds(self.embeds(sent2).transpose(1, 2).contiguous()).transpose(1, 2)

        # Input encoding
        # batch_size * seq_len * dim => batch_size * seq_len * (2 * hidden_size)
        o1, _ = self.lstm1(x1)
        o2, _ = self.lstm1(x2)

        # Local inference: soft attention
        # batch_size * seq_len * (2 * hidden_size)
        q1_align, q2_align = self.soft_attention_align(o1, o2)

        # Enhancement of local inference information
        # batch_size * seq_len * (8 * hidden_size)
        q1_combined = torch.cat([o1, q1_align, self.submul(o1, q1_align)], -1)
        q2_combined = torch.cat([o2, q2_align, self.submul(o2, q2_align)], -1)

        # Inference composition
        # batch_size * seq_len * (2 * hidden_size)
        q1_compose, _ = self.lstm2(q1_combined)
        q2_compose, _ = self.lstm2(q2_combined)

        # Pooling
        # batch_size * (4 * hidden_size)
        q1_rep = self.apply_multiple(q1_compose)
        q2_rep = self.apply_multiple(q2_compose)

        # Classifier
        x = torch.cat([q1_rep, q2_rep], -1)
        similarity = self.fc(x)
        return similarity
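A quick sanity check of the forward pass with random tensors, shapes chosen to match the 128-dimensional character embeddings above; this is only a sketch to illustrate the expected input/output shapes, not part of the original pipeline.

import torch

model = ESIM()
model.eval()  # put BatchNorm/Dropout into eval mode for a one-off shape check

x1 = torch.randn(4, 20, 128)  # batch_size=4, seq_len=20, embeds_dim=128
x2 = torch.randn(4, 20, 128)

with torch.no_grad():
    out = model(x1, x2)
print(out.shape)  # torch.Size([4, 1]) -- raw logits, fed to BCEWithLogitsLoss during training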
4. Running the model
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from my_dataset import MyDataset
from my_esim import ESIM
from my_word2vec import WordEmbedding


class RunESIM():
    def __init__(self):
        self.learning_rate = 0.001
        self.device = torch.device("cuda")
        parent_path = os.path.split(os.path.realpath(__file__))[0]
        self.root = parent_path[:parent_path.find("models")]  # E:\personas\semantics\
        self.train_path = os.path.join(self.root, "datas", "bert_data", "sim_data", "train.csv")
        self.val_path = os.path.join(self.root, "datas", "bert_data", "sim_data", "val.csv")
        self.test_path = os.path.join(self.root, "datas", "bert_data", "sim_data", "test.csv")
        self.batch_size = 64
        self.epoch = 50
        self.criterion = nn.BCEWithLogitsLoss().to(self.device)
        self.word = WordEmbedding()
        self.check_point = os.path.join(self.root, "checkpoints", "char_bilstm", "char_bilstm.pth")

    def get_loader(self, path):
        data = pd.read_csv(path, sep="\t")
        d1, d2, y = data["s1"], data["s2"], list(data["y"])
        dataset = MyDataset(d1, d2, torch.LongTensor(y))
        data_iter = DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=True)
        return data_iter

    def binary_acc(self, preds, y):
        preds = torch.round(torch.sigmoid(preds))
        correct = torch.eq(preds, y).float()
        acc = correct.sum() / len(correct)
        return acc

    def train(self, mynet, train_iter, optimizer, criterion, epoch, device):
        avg_acc = []
        avg_loss = []
        mynet.train()
        for batch_id, (data1, data2, label) in enumerate(train_iter):
            try:
                a, b = self.word.sentenceTupleToEmbedding(data1, data2)
            except Exception as e:
                # skip the batch if embedding fails, otherwise a and b would be undefined
                print("embedding error, skipping batch:", e)
                continue
            a, b, label = a.to(device), b.to(device), label.to(device)
            distence = mynet(a, b)
            distence = distence.squeeze(1)
            loss = criterion(distence, label.float())
            acc = self.binary_acc(distence, label.float()).item()
            avg_acc.append(acc)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if batch_id % 100 == 0:
                print("epoch:", epoch, "batch:", batch_id, "train loss:", loss.item(), "acc:", acc)
            avg_loss.append(loss.item())
        avg_acc = np.array(avg_acc).mean()
        avg_loss = np.array(avg_loss).mean()
        print('train acc:', avg_acc)
        print("train loss", avg_loss)

    def eval(self, mynet, test_iter, criteon, epoch, device):
        mynet.eval()
        avg_acc = []
        avg_loss = []
        with torch.no_grad():
            for batch_id, (data1, data2, label) in enumerate(test_iter):
                try:
                    a, b = self.word.sentenceTupleToEmbedding(data1, data2)
                except Exception as e:
                    continue
                a, b, label = a.to(device), b.to(device), label.to(device)
                distence = mynet(a, b)
                distence = distence.squeeze(1)
                loss = criteon(distence, label.float())
                acc = self.binary_acc(distence, label.float()).item()
                avg_acc.append(acc)
                avg_loss.append(loss.item())
                if batch_id > 50:
                    break
        avg_acc = np.array(avg_acc).mean()
        avg_loss = np.array(avg_loss).mean()
        print('>>test acc:', avg_acc)
        print(">>test loss:", avg_loss)
        return (avg_acc, avg_loss)

    def run_train(self):
        model = ESIM().to(self.device)
        max_acc = 0
        train_iter = self.get_loader(self.train_path)
        val_iter = self.get_loader(self.val_path)
        optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        for epoch in range(self.epoch):
            self.train(model, train_iter, optimizer, self.criterion, epoch, self.device)
            eval_acc, eval_loss = self.eval(model, val_iter, self.criterion, epoch, self.device)
            # keep only the checkpoint with the best validation accuracy
            if eval_acc > max_acc:
                print("save model")
                torch.save(model.state_dict(), self.check_point)
                max_acc = eval_acc


if __name__ == '__main__':
    RunESIM().run_train()
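Once a checkpoint has been saved, inference follows the same embedding-plus-forward path, with a sigmoid on the logit. A minimal sketch, assuming the saved checkpoint and the char FastText model are reachable from the working directory (the relative path and the sentences below are illustrative only):

import torch
from my_esim import ESIM
from my_word2vec import WordEmbedding

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ESIM().to(device)
# assumed relative location of the checkpoint saved by RunESIM
model.load_state_dict(torch.load("checkpoints/char_bilstm/char_bilstm.pth", map_location=device))
model.eval()

word = WordEmbedding()
a, b = word.sentenceTupleToEmbedding(["我想退货"], ["怎么申请退款"])

with torch.no_grad():
    logit = model(a.to(device), b.to(device))
    prob = torch.sigmoid(logit).item()  # probability that the two sentences are similar
print(prob)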
5. Experimental results
epoch: 30 batch: 0    train loss: 0.30833131074905396  acc: 0.875
epoch: 30 batch: 100  train loss: 0.15703552961349487  acc: 0.953125
epoch: 30 batch: 200  train loss: 0.25020748376846313  acc: 0.90625
epoch: 30 batch: 300  train loss: 0.2216322124004364   acc: 0.90625
epoch: 30 batch: 400  train loss: 0.21571914851665497  acc: 0.921875
epoch: 30 batch: 500  train loss: 0.23061133921146393  acc: 0.890625
epoch: 30 batch: 600  train loss: 0.2357763797044754   acc: 0.90625
epoch: 30 batch: 700  train loss: 0.180502250790596    acc: 0.9375
epoch: 30 batch: 800  train loss: 0.3004327118396759   acc: 0.875
epoch: 30 batch: 900  train loss: 0.22875544428825378  acc: 0.90625
epoch: 30 batch: 1000 train loss: 0.21407470107078552  acc: 0.921875
epoch: 30 batch: 1100 train loss: 0.20641490817070007  acc: 0.921875
epoch: 30 batch: 1200 train loss: 0.2836620509624481   acc: 0.875
train acc: 0.8965875
train loss 0.2476300214469433
>>test acc: 0.9613281264901161
>>test loss: 0.10271382739301771