Multi-Label Text Prediction Based on Transformer
1. A deep model enlarges the parameter space and improves fitting capacity;
2. The attention mechanism captures the interactions between basic features, making feature combinations much more expressive;
3. Multi-label text prediction is relatively difficult; each label set is represented as a 0/1 multi-hot vector (see the sketch below).
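Point 3 refers to multi-hot label encoding: each sample gets a vector of length n_class with 1 at the positions of its labels and 0 elsewhere. The helper transform_multilabel_as_multihot used in the training script further down is not shown; a minimal sketch, assuming labels arrive as lists of label names together with a name-to-index map, could look like this:

import numpy as np

def transform_multilabel_as_multihot(label_lists, label2id, num_classes):
    """Hypothetical sketch: turn lists of label names into 0/1 multi-hot vectors.

    `label_lists` is assumed to be an iterable of label-name lists and
    `label2id` a dict mapping each name to a column index in [0, num_classes).
    """
    multihot = np.zeros((len(label_lists), num_classes), dtype=np.float32)
    for row, labels in enumerate(label_lists):
        for label in labels:
            multihot[row, label2id[label]] = 1.0
    return multihot

The sigmoid plus round step in the model below then produces predictions in the same 0/1 format at inference time.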
The model class is given below for reference; its classification performance is quite good:
import math
import os
import time

import numpy as np
import tensorflow as tf


class BaseClassier(object):
    def __init__(self, config, sess):
        # configuration
        self.max_len = config["max_len"]
        self.position_len = config["position_len"]
        self.sess = sess
        self.num_classes = config["n_class"]
        self.lstm_layers = config["lstm_layers"]
        self.vocab_size = config["vocab_size"]
        self.embedding_size = config["embedding_size"]
        self.hidden_size = config["hidden_size"]
        self.l2_reg_lambda = config["l2_reg_lambda"]
        self.learning_rate = config["learning_rate"]
        self.filter_heights = config["filter_heights"]
        self.filter_num_per_height = config["filter_num_per_height"]
        self.numBlocks = config['numBlocks']
        self.filters = config['filters']
        self.numHeads = config['numHeads']
        self.keepProp = config['keepProp']  # dropout keep prob inside multi-head attention
        self.norm_epsilon = config['norm_epsilon']

        # placeholders
        self.x = tf.compat.v1.placeholder(tf.float32, [None, self.max_len], name="input_x")
        self.label = tf.compat.v1.placeholder(tf.float32, [None, self.num_classes], name="input_y")
        self.trans_keep_prob = tf.compat.v1.placeholder(tf.float32, name="trans_keep_prob")
        self.multi_keep_prob = tf.compat.v1.placeholder(tf.float32, name="multi_keep_prob")
        self.embeddedPosition = tf.compat.v1.placeholder(tf.float32, [None, self.position_len, self.position_len],
                                                         name="embed_position")

    def transformer_layer(self):
        l2Loss = tf.constant(0.0)
        with tf.name_scope("embedding"):
            self.embed_fusion = self.embedding_layer_fusion_v1(self.x)
            self.embeddedWords = tf.concat([self.embed_fusion, self.embeddedPosition], -1)

        with tf.name_scope("transformer"):
            for i in range(self.numBlocks):
                with tf.name_scope("transformer-{}".format(i + 1)):
                    multiHeadAtt = self._multiheadAttention(rawKeys=self.original_feature,
                                                            queries=self.embeddedWords,
                                                            keys=self.embeddedWords)
                    self.embeddedWords = self._feedForward(multiHeadAtt,
                                                           [self.filters, self.embedding_size + self.position_len])
            outputs = tf.reshape(self.embeddedWords,
                                 [-1, self.position_len * (self.embedding_size + self.position_len)])
        outputSize = outputs.get_shape()[-1].value

        with tf.name_scope("dropout"):
            outputs = tf.nn.dropout(outputs, keep_prob=self.trans_keep_prob)

        with tf.name_scope("output"):
            outputW = tf.compat.v1.get_variable(
                "outputW",
                shape=[outputSize, self.num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            outputB = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name="outputB")
            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.logits = tf.compat.v1.nn.xw_plus_b(outputs, outputW, outputB, name="logits")
            self.possibility = tf.nn.sigmoid(self.logits, name="possibility")
            self.prediction = tf.round(self.possibility, name="prediction")

        with tf.name_scope("loss"):
            if self.num_classes == 1:
                pass
            elif self.num_classes > 1:
                # multi-label: independent sigmoid cross-entropy per class, summed over classes
                losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.label)
                sum_losses = tf.reduce_sum(losses, axis=1)
                self.loss = tf.reduce_mean(sum_losses) + self.l2_reg_lambda * l2Loss

    def embedding_layer_fusion_v1(self, input_x, name=None):
        """Look up embeddings for position / profile / query features and
        weight the position embeddings by their accompanying scores.

        :param input_x: raw feature tensor of shape [batch_size, max_len]
        :param name: optional name scope
        :return: fused embedding tensor
        """
        with tf.name_scope('word_embedding' if not name else name), tf.device('/cpu:0'):
            embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                name='embeddings')
            geek_hist = input_x[:, :112]      # interleaved (position id, score) pairs
            geek_profile = input_x[:, 112:115]
            geek_query = input_x[:, 115:]
            pos_list = []
            score_list = []
            for i in range(112):
                if i % 2 == 0:
                    pos_list.append(tf.expand_dims(geek_hist[:, i], -1))
                else:
                    score_list.append(tf.expand_dims(geek_hist[:, i], -1))
            geek_position = tf.cast(tf.concat(pos_list, axis=1), tf.int32)
            geek_profile = tf.cast(geek_profile, tf.int32)
            geek_query = tf.cast(geek_query, tf.int32)
            geek_score = tf.concat(score_list, axis=1)
            expand_score = tf.expand_dims(geek_score, -1)
            geek_score = tf.tile(expand_score, [1, 1, self.embedding_size])
            position_embed = tf.nn.embedding_lookup(embeddings, geek_position, name='position_embed')
            position_embed_with_score = tf.multiply(position_embed, geek_score)
            profile_embed = tf.nn.embedding_lookup(embeddings, geek_profile, name='profile_embed')
            query_embed = tf.nn.embedding_lookup(embeddings, geek_query, name='query_embed')
            self.original_feature = tf.concat([geek_position, geek_profile, geek_query], axis=1)
            embed_fusion = tf.concat([position_embed_with_score, profile_embed, query_embed], axis=1)
            return embed_fusion

    def _layerNormalization(self, inputs, scope="layerNorm"):
        epsilon = self.norm_epsilon
        inputsShape = inputs.get_shape()  # [batch_size, sequence_length, embedding_size]
        paramsShape = inputsShape[-1:]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(paramsShape))
        gamma = tf.Variable(tf.ones(paramsShape))
        normalized = (inputs - mean) / ((variance + epsilon) ** .5)
        outputs = gamma * normalized + beta
        return outputs

    def _multiheadAttention(self, rawKeys, queries, keys, numUnits=None, causality=False,
                            scope="multiheadAttention"):
        numHeads = self.numHeads
        keepProp = self.keepProp

        if numUnits is None:
            numUnits = queries.get_shape().as_list()[-1]

        Q = tf.layers.dense(queries, numUnits, activation=tf.nn.relu)
        K = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)
        V = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)

        Q_ = tf.concat(tf.split(Q, numHeads, axis=-1), axis=0)
        K_ = tf.concat(tf.split(K, numHeads, axis=-1), axis=0)
        V_ = tf.concat(tf.split(V, numHeads, axis=-1), axis=0)

        similary = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        scaledSimilary = similary / (K_.get_shape().as_list()[-1] ** 0.5)

        keyMasks = tf.tile(rawKeys, [numHeads, 1])
        keyMasks = tf.tile(tf.expand_dims(keyMasks, 1), [1, tf.shape(queries)[1], 1])
        paddings = tf.ones_like(scaledSimilary) * (-2 ** (32 + 1))
        maskedSimilary = tf.where(tf.equal(keyMasks, 0), paddings, scaledSimilary)

        if causality:
            diagVals = tf.ones_like(maskedSimilary[0, :, :])  # [queries_len, keys_len]
            tril = tf.contrib.linalg.LinearOperatorTriL(diagVals).to_dense()  # [queries_len, keys_len]
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(maskedSimilary)[0], 1, 1])  # [batch_size * numHeads, queries_len, keys_len]
            paddings = tf.ones_like(masks) * (-2 ** (32 + 1))
            maskedSimilary = tf.where(tf.equal(masks, 0), paddings,
                                      maskedSimilary)  # [batch_size * numHeads, queries_len, keys_len]

        weights = tf.nn.softmax(maskedSimilary)
        outputs = tf.matmul(weights, V_)
        outputs = tf.concat(tf.split(outputs, numHeads, axis=0), axis=2)
        outputs = tf.nn.dropout(outputs, keep_prob=self.multi_keep_prob)
        outputs += queries
        outputs = self._layerNormalization(outputs)
        return outputs

    def _feedForward(self, inputs, filters, scope="multiheadAttention"):
        params = {"inputs": inputs, "filters": filters[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        params = {"inputs": outputs, "filters": filters[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        outputs += inputs
        outputs = self._layerNormalization(outputs)
        return outputs

    def _positionEmbedding(self, scope="positionEmbedding"):
        batchSize = self.config.batchSize
        sequenceLen = self.config.sequenceLength
        embeddingSize = self.config.model.embeddingSize
        positionIndex = tf.tile(tf.expand_dims(tf.range(sequenceLen), 0), [batchSize, 1])
        positionEmbedding = np.array([[pos / np.power(10000, (i - i % 2) / embeddingSize)
                                       for i in range(embeddingSize)]
                                      for pos in range(sequenceLen)])
        positionEmbedding[:, 0::2] = np.sin(positionEmbedding[:, 0::2])
        positionEmbedding[:, 1::2] = np.cos(positionEmbedding[:, 1::2])
        positionEmbedding_ = tf.cast(positionEmbedding, dtype=tf.float32)
        positionEmbedded = tf.nn.embedding_lookup(positionEmbedding_, positionIndex)
        return positionEmbedded

    def build_graph(self):
        print("building graph...")
        with tf.compat.v1.variable_scope("discriminator"):
            self.transformer_layer()

        self.global_step = tf.Variable(0, name="globalStep", trainable=False)
        optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)
        gradsAndVars = optimizer.compute_gradients(self.loss)
        self.train_op = optimizer.apply_gradients(gradsAndVars, global_step=self.global_step)

        gradSummaries = []
        for g, v in gradsAndVars:
            if g is not None:
                tf.compat.v1.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.compat.v1.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

        outDir = r"./summarys"
        print("Writing to {}\n".format(outDir))
        lossSummary = tf.compat.v1.summary.scalar("loss", self.loss)
        summaryOp = tf.compat.v1.summary.merge_all()
        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.compat.v1.summary.FileWriter(trainSummaryDir, self.sess.graph)
        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.compat.v1.summary.FileWriter(evalSummaryDir, self.sess.graph)

        self.saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=3)
        print("graph built successfully!")


if __name__ == '__main__':
    # NOTE: transform_multilabel_as_multihot, data_processing, fixedPositionEmbedding,
    # fill_feed_dict, run_train_step and run_eval_step, as well as the raw
    # features / x_train / y_train / x_test / y_test / position2id / nums_class data,
    # come from the project's data utilities and are not shown here.
    y_train = transform_multilabel_as_multihot(y_train, position2id, nums_class)
    y_test = transform_multilabel_as_multihot(y_test, position2id, nums_class)
    x_train, x_test, vocab_size = data_processing(features, x_train, x_test, max_len=135)
    print("train size: ", len(x_train))
    print("test size: ", len(x_test))
    print("vocab size: ", vocab_size)

    config = {
        "position_len": 79,
        "max_len": 135,
        "vocab_size": vocab_size,
        "embedding_size": 161,
        "learning_rate": 1e-3,
        "l2_reg_lambda": 1e-3,
        "batch_size": 32,
        "n_class": nums_class,
        "hidden_size": 256,
        "lstm_layers": 2,
        "filter_heights": [2, 3, 4, 5],
        "filter_num_per_height": [100, 100, 300, 300],
        "numBlocks": 1,
        "filters": 128,
        "numHeads": 8,
        "keepProp": 0.9,  # dropout keep prob in multi-head attention
        "norm_epsilon": 1e-8,
        "train_epoch": 20,
        "savedModelPath": r'./PBModel',
    }

    embeddedPosition = fixedPositionEmbedding(config["batch_size"], config["position_len"])
    # path of the frozen graph written below; assumed here, since the original snippet
    # uses `output_graph` without defining it
    output_graph = os.path.join(config["savedModelPath"], "model.pb")

    # auto GPU growth, avoid occupying all GPU memory
    tf_config = tf.compat.v1.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=tf_config)

    classifier = BaseClassier(config, sess)
    classifier.build_graph()
    sess.run(tf.compat.v1.global_variables_initializer())

    dev_batch = (x_test, y_test)
    start = time.time()
    best_auc = .0
    for e in range(config["train_epoch"]):
        t0 = time.time()
        print("\nEpoch {} start !".format(e + 1))
        trained_samples = 0
        for batch_idx, (x_batch, y_batch) in enumerate(
                fill_feed_dict(x_train, y_train, config["batch_size"], is_shuffle=False)):
            return_dict = run_train_step(classifier, sess, (x_batch, y_batch))
            trained_samples += len(x_batch)
            progress = math.ceil(batch_idx / (x_train.shape[0] // config["batch_size"]) * 50)
            print('\rTrain epoch: {} {}/{} [{}]{}% '.format(
                e + 1, trained_samples, len(x_train), '-' * progress + '>', progress * 2), end='')
        t1 = time.time()
        print("Train Epoch time: {:.4f} s".format(t1 - t0))

        # evaluate on the dev set; return_dict["loss"] is the loss of the last training batch
        auc, _, _, _ = run_eval_step(classifier, sess, dev_batch)
        print("validation loss:{:.4f}\tauc:{:.4f}".format(return_dict["loss"], auc))

        if auc > best_auc:
            best_auc = auc
            saver = tf.compat.v1.train.Saver()
            saver.save(sess, "Model/model.ckpt")
            # freeze the graph so the prediction node can be served from a single pb file
            output_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
                sess=sess,
                input_graph_def=sess.graph_def,
                output_node_names=['discriminator/output/prediction'])
            with tf.io.gfile.GFile(output_graph, 'wb') as fw:
                fw.write(output_graph_def.SerializeToString())
            print('best model has been saved!')

    print("Training finished, time consumed: {:.2f} s\nTraining over!".format(time.time() - start))
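The training script also relies on several project-specific helpers that are not shown (data_processing, fill_feed_dict, run_train_step, run_eval_step, fixedPositionEmbedding). As a rough sketch only, and assuming that fixedPositionEmbedding builds the one-hot position encoding expected by the [batch, position_len, position_len] embed_position placeholder and that fill_feed_dict simply yields mini-batches, two of them might look like this:

import numpy as np

def fixedPositionEmbedding(batch_size, position_len):
    # assumed: one-hot encoding of each position, tiled over the batch,
    # shape [batch_size, position_len, position_len]
    one_hot = np.eye(position_len, dtype=np.float32)
    return np.tile(one_hot[None, :, :], (batch_size, 1, 1))

def fill_feed_dict(x, y, batch_size, is_shuffle=True):
    # assumed: yield (x_batch, y_batch) mini-batches over numpy arrays, optionally shuffled
    indices = np.arange(len(x))
    if is_shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(x) - batch_size + 1, batch_size):
        batch = indices[start:start + batch_size]
        yield x[batch], y[batch]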
Always remember the kind of person you want to become!