【Text Classification 07】ELMo
Contents
- Overview
- Dataset
- The pretrained ELMo model
- Main code
1. Overview
This text classification series will run to roughly eight posts. The code can be downloaded directly from github and the training data from Baidu Cloud; import the project into pycharm and it is ready to run. The series covers text classification based on pretrained word2vec vectors as well as classification based on the pretrained models of recent years (ELMo, BERT, etc.).
2. Dataset
The dataset is the IMDB movie review corpus. There are three data files in the /data/rawData directory: unlabeledTrainData.tsv, labeledTrainData.tsv and testData.tsv. Text classification itself needs labeled data (labeledTrainData), but when training the word2vec word-vector model (which is unsupervised) the unlabeled data can be used as well.
Training data download: https://pan.baidu.com/s/1-XEwx1ai8kkGsMagIFKX_g (extraction code: rtz8)
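Once downloaded, a quick look at the labeled file confirms its layout (a minimal sketch; the sentiment and review columns are what the preprocessing code below expects, and quoting=3 is the usual setting for this Kaggle-format tsv):

```python
import pandas as pd

# the file is tab-separated; quoting=3 tells pandas to ignore double quotes
df = pd.read_csv("data/rawData/labeledTrainData.tsv", sep="\t", quoting=3)
print(df.shape)              # 25000 rows in the original Kaggle release
print(df.columns.tolist())   # expected: ['id', 'sentiment', 'review']
print(df["sentiment"].value_counts())
```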
3. The pretrained ELMo model
ELMo uses a BiLM (bidirectional language model) to pretrain word representations, and can generate word vectors dynamically from our training set. The pretrained ELMo model comes from the paper Deep contextualized word representations; a detailed introduction to the ELMo model itself is given in a separate article.
Before using it we also need to download the pretrained model weights. Open https://allennlp.org/elmo; under the "Pre-trained ELMo Models" section there are four model sizes to choose from, and here we pick the Small one. Two files have to be downloaded: an "options" json file that stores the model's configuration, and a "weights" hdf5 file that stores the model weights (you can open it with h5py to take a look).
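For example, a quick peek at the downloaded weights file (a small sketch; it assumes the two files were saved under ../data/elmodata/, matching the configuration below):

```python
import h5py

# list every group/dataset in the weights file together with its shape
with h5py.File("../data/elmodata/elmo_weights.hdf5", "r") as f:
    f.visititems(lambda name, obj: print(name, getattr(obj, "shape", "")))
```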
4. Main code
4.1 Training configuration: parameter_config.py
Here we have to fill in the paths for optionFile, vocabFile, weightFile and tokenEmbeddingFile. One thing to watch out for is that embeddingSize must equal the dimensionality of the ELMo word vectors. We also need to import the functions and classes from the bilm folder.
```python
# Author: yifan
# _*_ coding:utf-8 _*_
# boilerplate header kept for reuse; works as-is after converting to jupyter

# 1. training hyperparameters
class TrainingConfig(object):
    epoches = 5
    evaluateEvery = 100
    checkpointEvery = 100
    learningRate = 0.001

class ModelConfig(object):
    embeddingSize = 256   # must match the output size of the ELMo model
    hiddenSizes = [128]   # number of units in each (Bi-)LSTM layer
    dropoutKeepProb = 0.5
    l2RegLambda = 0.0

class Config(object):
    sequenceLength = 200  # roughly the mean sequence length over the corpus
    batchSize = 128
    dataSource = "../data/preProcess/labeledTrain.csv"
    stopWordSource = "../data/english"
    optionFile = "../data/elmodata/elmo_options.json"
    weightFile = "../data/elmodata/elmo_weights.hdf5"
    vocabFile = "../data/elmodata/vocab.txt"
    tokenEmbeddingFile = '../data/elmodata/elmo_token_embeddings.hdf5'
    numClasses = 2
    rate = 0.8  # proportion of data used for training
    training = TrainingConfig()
    model = ModelConfig()
```
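To guard against a size mismatch, the relationship between embeddingSize and the ELMo output size can be checked programmatically (a sketch under the assumption that the options file follows the bilm-tf layout, where the ELMo vector width is twice the LSTM projection dimension):

```python
import json
import parameter_config

config = parameter_config.Config()
with open(config.optionFile) as f:
    options = json.load(f)

# ELMo concatenates forward and backward states, so its output width is
# 2 * projection_dim (2 * 128 = 256 for the Small model)
elmoSize = 2 * options["lstm"]["projection_dim"]
assert config.model.embeddingSize == elmoSize, (config.model.embeddingSize, elmoSize)
```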
4.2 Building the training data: get_train_data.py
This module does five things:
1) read the data;
2) generate the vocabFile from the training set;
3) call the dump_token_embeddings method from the bilm folder to generate the initial word-vector representations and save them to an hdf5 file, whose key is "embedding";
4) fix the length of every input sequence;
5) split the data into a training set and an eval set.
```python
# _*_ coding:utf-8 _*_
# Author: yifan
import json
from collections import Counter

import gensim
import pandas as pd
import numpy as np

import parameter_config
from bilm import dump_token_embeddings  # needed by _genElmoEmbedding below


# 2. data preprocessing: builds the train and eval sets
class Dataset(object):
    def __init__(self, config):
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource
        self._optionFile = config.optionFile
        self._weightFile = config.weightFile
        self._vocabFile = config.vocabFile
        self._tokenEmbeddingFile = config.tokenEmbeddingFile
        self._sequenceLength = config.sequenceLength  # every input sequence is cut to this fixed length
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate

        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []

    def _readData(self, filePath):
        """Read the dataset from a csv file."""
        df = pd.read_csv(filePath)
        labels = df["sentiment"].tolist()
        review = df["review"].tolist()
        reviews = [line.strip().split() for line in review]
        return reviews, labels

    def _genVocabFile(self, reviews):
        """Build a vocabulary file from our training data, prepending three special tokens."""
        allWords = [word for review in reviews for word in review]
        wordCount = Counter(allWords)  # word frequencies
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        words = [item[0] for item in sortWordCount]  # sortWordCount is already a list of (word, count) pairs
        allTokens = ['<S>', '</S>', '<UNK>'] + words
        with open(self._vocabFile, 'w', encoding='UTF-8') as fout:
            fout.write('\n'.join(allTokens))

    def _fixedSeq(self, reviews):
        """Truncate sequences longer than sequenceLength (200) tokens."""
        return [review[:self._sequenceLength] for review in reviews]

    def _genElmoEmbedding(self):
        """
        Call dump_token_embeddings from the ELMo source code to build token-level
        vectors from the character-based model and save them to an hdf5 file.
        The value under the "embedding" key holds the vector of each word in the
        vocabulary file; these vectors later serve as the initial input to the BiLM.
        """
        dump_token_embeddings(
            self._vocabFile, self._optionFile, self._weightFile, self._tokenEmbeddingFile)

    def _genTrainEvalData(self, x, y, rate):
        """Split the data into a training set and an eval set."""
        y = [[item] for item in y]
        trainIndex = int(len(x) * rate)
        trainReviews = x[:trainIndex]
        trainLabels = y[:trainIndex]
        evalReviews = x[trainIndex:]
        evalLabels = y[trainIndex:]
        return trainReviews, trainLabels, evalReviews, evalLabels

    def dataGen(self):
        """Initialize the train and eval sets."""
        reviews, labels = self._readData(self._dataSource)
        # self._genVocabFile(reviews)  # generate vocabFile (only needed once)
        # self._genElmoEmbedding()     # generate elmo_token_embeddings (only needed once)
        reviews = self._fixedSeq(reviews)
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviews, labels, self._rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels
        self.evalReviews = evalReviews
        self.evalLabels = evalLabels
```
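After _genVocabFile and _genElmoEmbedding have been run once, the generated file can be checked against the vocabulary (a hedged sketch; it assumes the paths from parameter_config.py and that dump_token_embeddings writes one row per vocabulary token under the "embedding" key):

```python
import h5py
import parameter_config

config = parameter_config.Config()

with open(config.vocabFile, encoding="utf-8") as f:
    vocabSize = len(f.read().splitlines())

with h5py.File(config.tokenEmbeddingFile, "r") as f:
    emb = f["embedding"]
    # the row count should line up with the vocabulary (a padding row may add one),
    # and the column count should equal embeddingSize (256)
    print(emb.shape, "vocab tokens:", vocabSize)
```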
4.3 Model construction: mode_structure.py
```python
# Author: yifan
# _*_ coding:utf-8 _*_
import tensorflow as tf
import parameter_config

config = parameter_config.Config()


# 3. the ELMo classification model: Bi-LSTM + Attention on top of ELMo vectors
class ELMo(object):
    def __init__(self, config):
        # model inputs: note that inputX takes precomputed ELMo vectors, not token ids
        self.inputX = tf.placeholder(tf.float32, [None, config.sequenceLength, config.model.embeddingSize],
                                     name="inputX")
        self.inputY = tf.placeholder(tf.float32, [None, 1], name="inputY")
        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")

        # l2 loss accumulator
        l2Loss = tf.constant(0.0)

        with tf.name_scope("embedding"):
            embeddingW = tf.get_variable(
                "embeddingW",
                shape=[config.model.embeddingSize, config.model.embeddingSize],
                initializer=tf.contrib.layers.xavier_initializer())
            reshapeInputX = tf.reshape(self.inputX, shape=[-1, config.model.embeddingSize])
            self.embeddedWords = tf.reshape(tf.matmul(reshapeInputX, embeddingW),
                                            shape=[-1, config.sequenceLength, config.model.embeddingSize])
            self.embeddedWords = tf.nn.dropout(self.embeddedWords, self.dropoutKeepProb)

        # stacked bidirectional LSTM layers
        with tf.name_scope("Bi-LSTM"):
            for idx, hiddenSize in enumerate(config.model.hiddenSizes):
                with tf.name_scope("Bi-LSTM" + str(idx)):
                    # forward LSTM cell
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)
                    # backward LSTM cell
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)

                    # dynamic rnn supports variable sequence lengths; with no lengths given it uses the full length.
                    # outputs_ is a tuple (output_fw, output_bw); each element is [batch_size, max_time, hidden_size],
                    # with the same hidden_size for fw and bw.
                    # self.current_state is the final state, a tuple (state_fw, state_bw);
                    # each is an LSTMStateTuple (c, h) of [batch_size, hidden_size] tensors.
                    outputs_, self.current_state = tf.nn.bidirectional_dynamic_rnn(
                        lstmFwCell, lstmBwCell, self.embeddedWords, dtype=tf.float32,
                        scope="bi-lstm" + str(idx))

                    # concatenate the fw and bw outputs to [batch_size, time_step, hidden_size * 2]
                    # and feed the result into the next Bi-LSTM layer
                    self.embeddedWords = tf.concat(outputs_, 2)

        # split the last Bi-LSTM layer's output back into forward and backward halves
        outputs = tf.split(self.embeddedWords, 2, -1)

        # following the Bi-LSTM + Attention paper, the forward and backward outputs are summed
        with tf.name_scope("Attention"):
            H = outputs[0] + outputs[1]
            # attention-weighted sentence representation
            output = self._attention(H)
            outputSize = config.model.hiddenSizes[-1]

        # fully connected output layer
        with tf.name_scope("output"):
            outputW = tf.get_variable(
                "outputW",
                shape=[outputSize, 1],
                initializer=tf.contrib.layers.xavier_initializer())
            outputB = tf.Variable(tf.constant(0.1, shape=[1]), name="outputB")
            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.predictions = tf.nn.xw_plus_b(output, outputW, outputB, name="predictions")
            self.binaryPreds = tf.cast(tf.greater_equal(self.predictions, 0.0), tf.float32, name="binaryPreds")

        # binary cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.predictions, labels=self.inputY)
            self.loss = tf.reduce_mean(losses) + config.model.l2RegLambda * l2Loss

    def _attention(self, H):
        """Compute a sentence representation from H with an attention mechanism."""
        # number of units in the last LSTM layer
        hiddenSize = config.model.hiddenSizes[-1]

        # trainable attention weight vector
        W = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1))

        # nonlinear transform of the Bi-LSTM output
        M = tf.tanh(H)

        # multiply M (reshaped from [batch_size, time_step, hidden_size] to
        # [batch_size * time_step, hidden_size]) with W, reducing each time step
        # from a vector to a single score: newM = [batch_size * time_step, 1]
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))

        # reshape back to [batch_size, time_step]
        restoreM = tf.reshape(newM, [-1, config.sequenceLength])

        # softmax-normalize into attention weights [batch_size, time_step]
        self.alpha = tf.nn.softmax(restoreM)

        # weighted sum of H using alpha, done with a single matmul
        r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                      tf.reshape(self.alpha, [-1, config.sequenceLength, 1]))

        # squeeze the trailing dimension: squeezeR = [batch_size, hidden_size]
        squeezeR = tf.squeeze(r)

        sentenceRepren = tf.tanh(squeezeR)

        # optional dropout on the attention output
        output = tf.nn.dropout(sentenceRepren, self.dropoutKeepProb)

        return output
```
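Since inputX takes precomputed ELMo vectors rather than token ids, the graph can be smoke-tested with random inputs before the BiLM is wired in (a minimal sketch added here for illustration; it is not part of the original repo):

```python
import numpy as np
import tensorflow as tf

import parameter_config
import mode_structure

config = parameter_config.Config()

with tf.Graph().as_default():
    model = mode_structure.ELMo(config)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # random stand-in for a batch of ELMo vectors: [batch, sequenceLength, embeddingSize]
        fakeX = np.random.randn(2, config.sequenceLength, config.model.embeddingSize).astype("float32")
        preds, binary = sess.run(
            [model.predictions, model.binaryPreds],
            feed_dict={model.inputX: fakeX, model.dropoutKeepProb: 1.0})
        print(preds.shape, binary.shape)  # expected: (2, 1) (2, 1)
```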
4.4 Model training: mode_trainning.py
```python
# Author: yifan
# _*_ coding:utf-8 _*_
import os
import datetime
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

import parameter_config
import get_train_data
import mode_structure
# TokenBatcher cannot be imported from bilm directly: its data.py must first be patched to
# open the vocab file with `with open(filename, encoding="utf8") as f:`; we therefore import
# the patched copy from a local data.py
from data import TokenBatcher
from bilm import BidirectionalLanguageModel, weight_layers, dump_token_embeddings, Batcher

# data from the previous modules
config = parameter_config.Config()
data = get_train_data.Dataset(config)
data.dataGen()


# 4. batch generator
def nextBatch(x, y, batchSize):
    """Yield batches; x and y are shuffled together so they stay aligned."""
    # shuffling x and y with two separate np.random.shuffle calls would
    # break the pairing between reviews and labels, so we zip them first
    midVal = list(zip(x, y))
    np.random.shuffle(midVal)
    x, y = zip(*midVal)
    x = list(x)
    y = list(y)
    numBatches = len(x) // batchSize
    for i in range(numBatches):
        start = i * batchSize
        end = start + batchSize
        batchX = np.array(x[start: end])
        batchY = np.array(y[start: end])
        yield batchX, batchY


# 5. performance metrics
def mean(item):
    return sum(item) / len(item)


def genMetrics(trueY, predY, binaryPredY):
    """Compute accuracy, auc, precision and recall."""
    auc = roc_auc_score(trueY, predY)
    accuracy = accuracy_score(trueY, binaryPredY)
    precision = precision_score(trueY, binaryPredY)
    recall = recall_score(trueY, binaryPredY)
    return round(accuracy, 4), round(auc, 4), round(precision, 4), round(recall, 4)


# 6. train the model
trainReviews = data.trainReviews
trainLabels = data.trainLabels
evalReviews = data.evalReviews
evalLabels = data.evalLabels

# define the computation graph
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9  # cap gpu memory usage at 90%
    sess = tf.Session(config=session_conf)  # the session

    with sess.as_default():
        elmoMode = mode_structure.ELMo(config)

        # Instantiate the BiLM object. This must live at the top level, not inside the elmo()
        # function below, otherwise the tensorflow nodes would be created repeatedly.
        with tf.variable_scope("bilm", reuse=True):
            bilm = BidirectionalLanguageModel(
                config.optionFile,
                config.weightFile,
                use_character_inputs=False,
                embedding_weight_file=config.tokenEmbeddingFile
            )
        inputData = tf.placeholder('int32', shape=(None, None))

        # bilm.__call__ builds the ops that compute the LM representations
        inputEmbeddingsOp = bilm(inputData)

        # weighted sum over the BiLM layers: the ELMo vector
        elmoInput = weight_layers('input', inputEmbeddingsOp, l2_coef=0.0)

        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        # optimizer with the configured learning rate
        optimizer = tf.train.AdamOptimizer(config.training.learningRate)
        # compute gradients, yielding (gradient, variable) pairs
        gradsAndVars = optimizer.compute_gradients(elmoMode.loss)
        # apply the gradients to the variables to build the train op
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        # summaries for tensorBoard
        gradSummaries = []
        for g, v in gradsAndVars:
            if g is not None:
                tf.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

        outDir = os.path.abspath(os.path.join(os.path.curdir, "summarys"))
        print("Writing to {}\n".format(outDir))

        lossSummary = tf.summary.scalar("loss", elmoMode.loss)
        summaryOp = tf.summary.merge_all()

        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.summary.FileWriter(trainSummaryDir, sess.graph)
        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.summary.FileWriter(evalSummaryDir, sess.graph)

        # saver for checkpoint files
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        savedModelPath = "../model/ELMo/savedModel"
        if os.path.exists(savedModelPath):
            os.rmdir(savedModelPath)  # note: rmdir only removes an empty directory
        # one way of saving the model: export as a pb SavedModel
        builder = tf.saved_model.builder.SavedModelBuilder(savedModelPath)

        sess.run(tf.global_variables_initializer())

        def elmo(reviews):
            """Dynamically compute the ELMo vectors for each input batch."""
            # TokenBatcher converts sentences into batches of vocab ids
            batcher = TokenBatcher(config.vocabFile)
            inputDataIndex = batcher.batch_sentences(reviews)
            # run the graph to get the ELMo vectors
            elmoInputVec = sess.run(
                [elmoInput['weighted_op']],
                feed_dict={inputData: inputDataIndex}
            )
            return elmoInputVec

        def trainStep(batchX, batchY):
            """One training step."""
            feed_dict = {
                elmoMode.inputX: elmo(batchX)[0],  # feed the dynamically generated ELMo vectors directly
                elmoMode.inputY: np.array(batchY, dtype="float32"),
                elmoMode.dropoutKeepProb: config.model.dropoutKeepProb
            }
            _, summary, step, loss, predictions, binaryPreds = sess.run(
                [trainOp, summaryOp, globalStep, elmoMode.loss, elmoMode.predictions, elmoMode.binaryPreds],
                feed_dict)
            timeStr = datetime.datetime.now().isoformat()
            acc, auc, precision, recall = genMetrics(batchY, predictions, binaryPreds)
            print("{}, step: {}, loss: {}, acc: {}, auc: {}, precision: {}, recall: {}".format(
                timeStr, step, loss, acc, auc, precision, recall))
            trainSummaryWriter.add_summary(summary, step)

        def devStep(batchX, batchY):
            """One evaluation step."""
            feed_dict = {
                elmoMode.inputX: elmo(batchX)[0],
                elmoMode.inputY: np.array(batchY, dtype="float32"),
                elmoMode.dropoutKeepProb: 1.0
            }
            summary, step, loss, predictions, binaryPreds = sess.run(
                [summaryOp, globalStep, elmoMode.loss, elmoMode.predictions, elmoMode.binaryPreds],
                feed_dict)
            acc, auc, precision, recall = genMetrics(batchY, predictions, binaryPreds)
            evalSummaryWriter.add_summary(summary, step)
            return loss, acc, auc, precision, recall

        for i in range(config.training.epoches):
            # train the model
            print("start training model")
            for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
                trainStep(batchTrain[0], batchTrain[1])
                currentStep = tf.train.global_step(sess, globalStep)
                if currentStep % config.training.evaluateEvery == 0:
                    print("\nEvaluation:")
                    losses = []
                    accs = []
                    aucs = []
                    precisions = []
                    recalls = []
                    for batchEval in nextBatch(evalReviews, evalLabels, config.batchSize):
                        loss, acc, auc, precision, recall = devStep(batchEval[0], batchEval[1])
                        losses.append(loss)
                        accs.append(acc)
                        aucs.append(auc)
                        precisions.append(precision)
                        recalls.append(recall)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}, step: {}, loss: {}, acc: {}, auc: {}, precision: {}, recall: {}".format(
                        time_str, currentStep, mean(losses), mean(accs), mean(aucs),
                        mean(precisions), mean(recalls)))
                if currentStep % config.training.checkpointEvery == 0:
                    # the other way of saving the model: checkpoint files
                    path = saver.save(sess, "../model/ELMo/model/my-model", global_step=currentStep)
                    print("Saved model checkpoint to {}\n".format(path))

        inputs = {"inputX": tf.saved_model.utils.build_tensor_info(elmoMode.inputX),
                  "keepProb": tf.saved_model.utils.build_tensor_info(elmoMode.dropoutKeepProb)}
        outputs = {"binaryPreds": tf.saved_model.utils.build_tensor_info(elmoMode.binaryPreds)}
        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs, outputs=outputs,
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING],
                                             signature_def_map={"predict": prediction_signature},
                                             legacy_init_op=legacy_init_op)
        builder.save()
```
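The exported pb model can later be restored for inference through the "predict" signature defined above (a sketch of my own, assuming the export completed; the caller still has to run the BiLM to turn token ids into ELMo vectors before feeding inputX):

```python
import numpy as np
import tensorflow as tf

savedModelPath = "../model/ELMo/savedModel"

with tf.Session(graph=tf.Graph()) as sess:
    metaGraph = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], savedModelPath)
    signature = metaGraph.signature_def["predict"]

    # look up the tensors recorded in the exported signature
    inputX = sess.graph.get_tensor_by_name(signature.inputs["inputX"].name)
    keepProb = sess.graph.get_tensor_by_name(signature.inputs["keepProb"].name)
    binaryPreds = sess.graph.get_tensor_by_name(signature.outputs["binaryPreds"].name)

    # stand-in batch of ELMo vectors, shape [batch, sequenceLength, embeddingSize]
    elmoVectors = np.zeros((1, 200, 256), dtype="float32")
    print(sess.run(binaryPreds, feed_dict={inputX: elmoVectors, keepProb: 1.0}))
```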
Training results
During training the script prints loss, acc, auc, precision and recall at every step, and averaged evaluation metrics every 100 steps. The full code is available at: https://github.com/yifanhunter/NLP_textClassifier