NLP (3): word2vec + SiameseLSTM
1. Word Embedding
import jieba
from gensim.models import Word2Vec
import torch
import gensim
import numpy as np

# Pretrained 128-dimensional word2vec vectors in binary format.
model = gensim.models.KeyedVectors.load_word2vec_format('model\\word2vec.bin', binary=True)

class WordEmbedding(object):
    def __init__(self):
        pass

    def sentenceTupleToEmbedding(self, data1, data2):
        # Pad both sides of the batch to the longest jieba-segmented sentence.
        aCutListMaxLen = max([len(list(jieba.cut(sentence_a))) for sentence_a in data1])
        bCutListMaxLen = max([len(list(jieba.cut(sentence_b))) for sentence_b in data2])
        maxLen = max(aCutListMaxLen, bCutListMaxLen)
        seq_len = maxLen
        a = self.sqence_vec(data1, seq_len)  # (batch_size, seq_len, embedding_dim)
        b = self.sqence_vec(data2, seq_len)
        return torch.FloatTensor(a), torch.FloatTensor(b)

    def sqence_vec(self, data, seq_len):
        data_a_vec = []
        for sequence_a in data:
            sequence_vec = []  # seq_len * 128
            for word_a in jieba.cut(sequence_a):
                if word_a in model:
                    sequence_vec.append(model[word_a])
            sequence_vec = np.array(sequence_vec)
            if sequence_vec.size == 0:  # every token is out of vocabulary
                sequence_vec = np.zeros((0, 128))
            # Zero-pad the sentence up to seq_len.
            add = np.zeros((seq_len - sequence_vec.shape[0], 128))
            sequenceVec = np.vstack((sequence_vec, add))
            data_a_vec.append(sequenceVec)
        a_vec = np.array(data_a_vec)
        return a_vec
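For reference, a minimal usage sketch, assuming the 128-dimensional model\word2vec.bin above is available; the address pairs are made-up examples and only illustrate the expected tensor shapes. Each batch of raw sentences is segmented by jieba, looked up in word2vec, and zero-padded to the longest sentence of the batch.

# Hypothetical quick check of sentenceTupleToEmbedding (example sentences invented).
from wordEmbedding import WordEmbedding

word = WordEmbedding()
data1 = ("北京市海淀区中关村大街1号", "上海市浦东新区世纪大道100号")
data2 = ("北京海淀中关村大街1号", "上海市黄浦区南京东路100号")
a, b = word.sentenceTupleToEmbedding(data1, data2)
print(a.shape, b.shape)  # torch.Size([2, seq_len, 128]) for both tensors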
2. Dataset Setup
import torch.utils.data as data
import torch

class DatasetIterater(data.Dataset):
    def __init__(self, texta, textb, label):
        self.texta = texta
        self.textb = textb
        self.label = label

    def __getitem__(self, item):
        texta = self.texta[item]
        textb = self.textb[item]
        label = self.label[item]
        return texta, textb, label

    def __len__(self):
        return len(self.texta)
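The dataset deliberately stores raw strings rather than tensors, so segmentation and embedding happen per batch inside the training loop. A small sketch of what one batch looks like (the address pairs below are made-up placeholders):

# Hypothetical data; DataLoader's default collate keeps each string field as a
# list of raw strings and turns the integer labels into a tensor.
from torch.utils.data import DataLoader
from datasetIterater import DatasetIterater

texta = ["北京市海淀区中关村大街1号", "上海市浦东新区世纪大道100号"]
textb = ["北京海淀中关村大街1号", "上海市黄浦区南京东路100号"]
label = [1, 0]

loader = DataLoader(DatasetIterater(texta, textb, label), batch_size=2, shuffle=False)
for a, b, y in loader:
    print(a)  # list of the two raw address strings, embedded later
    print(y)  # tensor([1, 0])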
3. SiameseLSTM
import torch
from torch import nn

class SiameseLSTM(nn.Module):
    def __init__(self, input_size):
        super(SiameseLSTM, self).__init__()
        # The two branches share the same LSTM weights.
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=10, num_layers=4, batch_first=True)
        self.fc = nn.Linear(10, 1)

    def forward(self, data1, data2):
        out1, (h1, c1) = self.lstm(data1)
        out2, (h2, c2) = self.lstm(data2)
        # Take the output at the last time step of each sentence.
        pre1 = out1[:, -1, :]
        pre2 = out2[:, -1, :]
        # Element-wise absolute difference, mapped to a single logit.
        dis = torch.abs(pre1 - pre2)
        out = self.fc(dis)
        return out
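The network returns one raw logit per pair: both sentences pass through the same LSTM, the last time-step outputs are compared with an element-wise absolute difference, and a linear layer maps that 10-dimensional difference to a single score. A minimal shape check with random tensors (the batch size and sequence length below are arbitrary):

# Hypothetical shapes: a batch of 4 pairs, 12 time steps, 128-dim embeddings.
import torch
from siameseLSTM import SiameseLSTM

net = SiameseLSTM(input_size=128)
x1 = torch.randn(4, 12, 128)
x2 = torch.randn(4, 12, 128)
logits = net(x1, x2)
print(logits.shape)           # torch.Size([4, 1]) -- raw logits, not probabilities
print(torch.sigmoid(logits))  # squash to (0, 1) if a similarity probability is needed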
4. mainProcess
import torch
from torch import nn
from torch.utils.data import DataLoader
import pandas as pd
import jieba

from datasetIterater import DatasetIterater
from wordEmbedding import WordEmbedding
from siameseLSTM import SiameseLSTM

learning_rate = 0.001

# Load address pairs and their match labels.
train_df = pd.read_csv("data/POI/negtive.csv")
train_texta = train_df["address_1"]
train_textb = train_df["address_2"]
train_label = train_df["tag"]

train_data = DatasetIterater(train_texta, train_textb, train_label)
train_iter = DataLoader(dataset=train_data, batch_size=32, shuffle=True)

net = SiameseLSTM(128)
criterion = nn.BCEWithLogitsLoss()  # expects raw logits, applies sigmoid internally
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
word = WordEmbedding()

train_loss = []
for epoch in range(10):
    for batch_id, (data1, data2, label) in enumerate(train_iter):
        # Turn this batch of raw sentences into padded embedding tensors.
        a, b = word.sentenceTupleToEmbedding(data1, data2)
        distence = net(a, b)  # raw logits, shape (batch_size, 1)
        loss = criterion(distence, label.float().unsqueeze(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
        if batch_id % 10 == 0:
            print(epoch, batch_id, loss.item())
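Because BCEWithLogitsLoss applies the sigmoid internally, the model outputs raw logits during training; at prediction time you apply torch.sigmoid yourself. A minimal inference sketch continuing from the script above (the address pair is a made-up example, and it assumes tag = 1 marks matching addresses):

# Hypothetical inference step reusing net and word from the training script.
net.eval()
with torch.no_grad():
    a, b = word.sentenceTupleToEmbedding(["北京市海淀区中关村大街1号"],
                                         ["北京海淀中关村大街1号"])
    prob = torch.sigmoid(net(a, b))
    print(prob.item())  # closer to 1 -> the pair is more likely a match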