Personalized Search Recall Model Design -- Training
Model structure: a two-tower architecture with a user tower and an item tower.
Key points:
1. The user tower takes basic user attributes such as age, gender, and job expectations, plus behavioral data: the current search query and the historical queries from the previous 7 days.
2. The item tower takes the item data (dictated by the nature of the business): title, skills, and desc, all in plain-text form.
3. All text is encoded with BERT (pruned to 3 layers); the BERT encoder is trained separately using a contrastive-learning approach.
4. Dot-Product Attention between the current query and the historical queries produces a deeper semantic representation vector (a minimal attention sketch follows this list).
5. For feature representation, IDs are produced by hashing instead of the previous dictionary-lookup scheme, which removes the need to store a mapping dictionary and improves the handling of unseen values (a hashing sketch also follows this list).
6. Negatives can come from in-batch sampling or from pre-built negative samples (an in-batch sketch is given after the main code).
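To make point 4 concrete, here is a minimal, self-contained sketch of the Dot-Product Attention step, showing the tensor shapes only; in the full model below the attended vector is further concatenated with the other user features, and the dimension 384 comes from the BERT encoder.

    import numpy as np
    import tensorflow as tf
    from keras.layers import Input, Lambda, Softmax
    from keras.models import Model

    q = Input(shape=(384,), name="query_embedding")               # current query vector
    h = Input(shape=(None, 384), name="query_history_embedding")  # up to N historical query vectors
    q_exp = Lambda(lambda x: tf.expand_dims(x, axis=1))(q)        # (batch, 1, 384)
    scores = Lambda(lambda x: tf.matmul(x[0], x[1], transpose_b=True))([q_exp, h])  # (batch, 1, N)
    weights = Softmax()(scores)                                   # attention weights over history queries
    context = Lambda(lambda x: tf.squeeze(tf.matmul(x[0], x[1]), axis=1))([weights, h])  # (batch, 384)
    demo = Model([q, h], context)
    print(demo.predict([np.random.rand(2, 384), np.random.rand(2, 5, 384)]).shape)  # (2, 384)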
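For point 5, a minimal sketch of hashing string features into IDs instead of maintaining a mapping dictionary; it assumes TF 1.x graph mode as in the training script, and `num_buckets` is an illustrative size (the model below derives it from the observed cardinality plus a margin).

    import tensorflow as tf

    num_buckets = 100  # illustrative size
    ages = tf.constant(["25", "31", "value_never_seen_in_training"])
    ids = tf.strings.to_hash_bucket_fast(ages, num_buckets - 1) + 1  # shift by 1 so bucket 0 stays reserved (e.g. padding)
    with tf.Session() as sess:  # TF 1.x graph mode, matching the script below
        print(sess.run(ids))    # deterministic bucket ids in [1, num_buckets); unseen strings still get a valid id
    # The ids then feed an Embedding(num_buckets, embedding_dim) layer, with no vocabulary file to store.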
The main code is as follows:
""" Function: 向量召回模型训练-训练模型 """ import os import sys os.system('pip install -i https://pypi.tuna.tsinghua.edu.cn/simple keras==2.3.1') import csv import time import math import copy import json import keras import logging import numpy as np import pandas as pd import tensorflow as tf import matplotlib.pyplot as plt from keras.models import Model from keras import layers from keras.layers import Input,Dense,Lambda,concatenate,GlobalAveragePooling1D,Embedding import keras.backend.tensorflow_backend as KTF from bert4keras.tokenizers import Tokenizer from bert4keras.snippets import sequence_padding, DataGenerator from bert4keras.optimizers import Adam, extend_with_weight_decay from bert4keras.models import build_transformer_model from bert4keras.backend import set_gelu, K, sequence_masking, pool1d from sklearn.metrics import log_loss, roc_auc_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, MinMaxScaler from keras.callbacks import ModelCheckpoint,EarlyStopping logger = logging.getLogger("simple_logger") logger.setLevel(logging.DEBUG) handler = logging.StreamHandler(sys.stdout) handler.setLevel(logging.DEBUG) formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) logger.info("info message.") epochs = 5 maxlen = 64 batch_size= 256 config = tf.ConfigProto() config.gpu_options.allow_growth = True session = tf.Session(config=config) KTF.set_session(session) path = 'bert_file/chinese_roformer-sim-char-ft_L-6_H-384_A-6/' config_path = path+'bert_config.json' checkpoint_path = path+'bert_model.ckpt' dict_path = path+'vocab.txt' base_dir = r"/***/embedding_recall" data_dir = r"data/20220208_data" model_path = os.path.join(base_dir, r"model/online_recall_model.h5") geek_encoder_path = os.path.join(base_dir, "model/online_geek_encoder.h5") job_encoder_path = os.path.join(base_dir,"model/online_job_encoder.h5") new_train_path = os.path.join(base_dir, data_dir, "train_data.csv") new_eval_path = os.path.join(base_dir, data_dir, "eval_data.csv") checkpoint_save_dir = os.path.join(base_dir, "ckpt") best_model_name = "best_model.h5" if not os.path.exists(checkpoint_save_dir): os.makedirs(checkpoint_save_dir) geek_names = ["age","gender","degree_code","work_years","work_position_code","expect_position_code","expect_city_code","expect_low_salary", "expect_high_salary","expect_type","expect_position_type","expect_sub_location"] job_names = ["job_position_code","job_type","job_city_code", "job_degree_code","job_low_salary","job_high_salary","job_salary_type","job_salary_month","area_id","area_city_code"] def create_encode(): def bert_dynamic_fusion(model, layer_num): all_encoder_layers = [] for i in range(layer_num - 1): all_encoder_layers.append(model.get_layer('Transformer-' + str(i) + '-FeedForward-Norm').output) layer_logits = [] for i, layer in enumerate(all_encoder_layers): layer_logits.append( keras.layers.Dense( 1, kernel_initializer=keras.initializers.TruncatedNormal(stddev=0.02), name="layer_logit%d" % i )(layer) ) layer_logits = Lambda(concatenate, arguments={'axis': 2})(layer_logits) layer_dist = Lambda(lambda x: tf.nn.softmax(x))(layer_logits) seq_out = Lambda(lambda x: K.concatenate(x, axis=2))( [Lambda(lambda x: K.expand_dims(x, axis=2))(x) for x in all_encoder_layers]) pooled_output = Lambda(lambda x: tf.matmul(tf.expand_dims(x[0], axis=2), x[1]))([layer_dist, seq_out]) pooled_output = Lambda(lambda x: K.squeeze(x, axis=2))(pooled_output) return 
pooled_output bert1 = build_transformer_model( config_path, checkpoint_path, model='roformer', return_keras_model=False, dropout_rate=0.2, ) bert_output = bert_dynamic_fusion(bert1.model, 6) x_avg = GlobalAveragePooling1D()(bert_output) v1 = Lambda(lambda x: K.l2_normalize(x, 1), name="output")(x_avg) encode = keras.models.Model(bert1.model.input, v1) encode.load_weights("/simbert+userfeature/encode.weights") return encode encoder_a = create_encode() logger.info("### bert model loaded!") # 3.抽取query、title编码向量,离线存储 data = pd.read_csv(new_train_path) eval_data = pd.read_csv(new_eval_path) querys = data["query"].unique().tolist() + eval_data["query"].unique().tolist() titles = data["title_skill_keyword"].unique().tolist() + eval_data["title_skill_keyword"].unique().tolist() hitory_querys = data["history_query"].unique().tolist() new_hitory_querys = [] for each in hitory_querys: new_hitory_querys.extend(eval(each)) new_hitory_querys = list(set(new_hitory_querys)) querys = querys + new_hitory_querys query_embedd_dict = {} title_embedd_dict = {} class bert_vector_generator(DataGenerator): def __iter__(self, random=False,batch_size =1): tokenizer = Tokenizer(dict_path, do_lower_case=True) batch_token_ids, batch_segment_ids = [], [] for is_end, text in self.sample(random): token_id, segment_id = tokenizer.encode(text, maxlen=64) batch_token_ids.append(token_id) batch_segment_ids.append(segment_id) if len(batch_token_ids) == self.batch_size or is_end: batch_token_ids = sequence_padding(batch_token_ids) batch_segment_ids = sequence_padding(batch_segment_ids) yield [batch_token_ids, batch_segment_ids] batch_token_ids, batch_segment_ids = [], [] def forpred(self, random=False): while True: for d in self.__iter__(random): yield d def get_text_vecs(tag_data, encoder, batch_size): """ """ res = np.array([]) data_gen = bert_vector_generator(data=tag_data, batch_size=batch_size) vecs = encoder.predict_generator(data_gen.forpred(), steps=len(data_gen), verbose=1) return vecs query_vecs = get_text_vecs(querys,encoder_a,256) title_vecs = get_text_vecs(titles,encoder_a,256) for k,v in zip(querys,query_vecs): query_embedd_dict[k]=v for k,v in zip(titles,title_vecs): title_embedd_dict[k]=v logger.info("### query_embedd_dict:{} title_embedd_dict:{}".format(len(query_embedd_dict), len(title_embedd_dict))) logger.info("data:{}; eval_data{}".format(len(data),len(eval_data))) # 4.数据预处理 geek_features = ["G"+str(i) for i in range(1,len(geek_names)+1)] job_features = ["J"+str(i) for i in range(1,len(job_names)+1)] # 类型转换,hash必须接收字符串 for col in geek_features+job_features: data[col] = data[col].astype(str) eval_data[col] = eval_data[col].astype(str) train_data = [e for e in data.iterrows()] online_eval_data = [e for e in eval_data.iterrows()] train, test = train_test_split(train_data, test_size=0.02, random_state=2022) logger.info("### train:{} test:{} online:{}".format(len(train),len(test),len(eval_data))) class data_generator(DataGenerator): def __iter__(self, random=False,mode=""): if mode=="batch_negative": data_dict = {} label_dict = {} labels = [] query_feats = [] title_feats = [] G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12 = [],[],[],[],[],[],[],[],[],[],[],[] J1,J2,J3,J4,J5,J6,J7,J8,J9,J10 = [],[],[],[],[],[],[],[],[],[] negative_names = ['J1','J2','J3','J4','J5','J6','J7','J8','J9','J10'] for is_end, data in self.sample(random): cur_label = data[1]["label"] if cur_label==1: for _,s in zip(range(2),self.sample(random=True)): if len(labels)<self.batch_size: for name in geek_features: 
exec("{}.append({})".format(name,[data[1][name]])) for each_name in negative_names: exec("{}.append({})".format(each_name, [s[1][1][each_name]])) query_feats.append(query_embedd_dict[data[1]["query"]]) title_feats.append(title_embedd_dict[s[1][1]["title_skill_keyword"]]) labels.append([0]) else: if len(labels)<self.batch_size: for name in geek_features+job_features: exec("{}.append({})".format(name,[data[1][name]])) query_feats.append(query_embedd_dict[data[1]["query"]]) title_feats.append(title_embedd_dict[data[1]["title_skill_keyword"]]) labels.append([data[1]["label"]]) if len(labels) == self.batch_size or is_end: for e in geek_features+job_features: data_dict[e]=np.array(eval(e)) data_dict["query_embedding"] = np.array(query_feats) data_dict["title_embedding"] = np.array(title_feats) label_dict["output"] = np.array(labels) yield(((data_dict, label_dict))) data_dict = {} label_dict = {} query_feats = [] title_feats = [] G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12 = [],[],[],[],[],[],[],[],[],[],[],[] J1,J2,J3,J4,J5,J6,J7,J8,J9,J10 = [],[],[],[],[],[],[],[],[],[] labels = [] else: query_span = 10 data_dict = {} label_dict = {} labels = [] query_feats = [] query_hist_feats = [] title_feats = [] G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12 = [],[],[],[],[],[],[],[],[],[],[],[] J1,J2,J3,J4,J5,J6,J7,J8,J9,J10 = [],[],[],[],[],[],[],[],[],[] negative_names = ['J1','J2','J3','J4','J5','J6','J7','J8','J9','J10'] for is_end, data in self.sample(random): cur_label = data[1]["label"] for name in geek_features+job_features: exec("{}.append({})".format(name,[data[1][name]])) query_feats.append(query_embedd_dict[data[1]["query"]]) title_feats.append(title_embedd_dict[data[1]["title_skill_keyword"]]) query_temp = [] for each_query in eval(data[1]["history_query"]): try: query_temp.append(query_embedd_dict[each_query]) except Exception as e: query_temp.append(np.zeros(shape=(384,))) padding_size = query_span - len(query_temp) for i in range(padding_size): query_temp.append(np.zeros(shape=(384,))) query_hist_feats.append(query_temp) labels.append([data[1]["label"]]) if len(labels) == self.batch_size or is_end: for e in geek_features+job_features: data_dict[e]=np.array(eval(e)) data_dict["query_embedding"] = np.array(query_feats) data_dict["title_embedding"] = np.array(title_feats) data_dict["query_history_embedding"] = np.array(query_hist_feats) label_dict["output"] = np.array(labels) yield((data_dict, label_dict)) data_dict = {} label_dict = {} query_feats = [] title_feats = [] query_hist_feats = [] G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12 = [],[],[],[],[],[],[],[],[],[],[],[] J1,J2,J3,J4,J5,J6,J7,J8,J9,J10 = [],[],[],[],[],[],[],[],[],[] labels = [] def forfit(self, random=False, mode="train"): while True: for d in self.__iter__(random,mode=mode): yield d # 5.主干模型搭建 def make_or_restore_model(): # Either restore the latest model, or create a fresh one # if there is no checkpoint available. 
checkpoints = [checkpoint_dir + "/" + name for name in os.listdir(checkpoint_dir)] if checkpoints: latest_checkpoint = max(checkpoints, key=os.path.getctime) print("Restoring from", latest_checkpoint) return keras.models.load_model(latest_checkpoint) print("Creating a new model") return get_compiled_model() def get_recall_model(geek_features, job_features): embedding_size = 384 bert_encoder_shape = (embedding_size,) vocab_bias = 50 def model_input(shape,name): return Input(shape=shape,name=name,dtype="string") def sparse_feat(feat,vocab_size,embedding_dim): return Embedding(vocab_size, embedding_dim)(feat) def dense_feat(feat): return Lambda(lambda x:tf.expand_dims(x, axis=2))(feat) def embedd_feat(shape,name): return Input(shape=shape, name=name) def hash_bucket(x, vocab_size_max): return Lambda(lambda x: tf.strings.to_hash_bucket_fast(x, vocab_size_max - 1) + 1)(x) geek_feats = [] job_feats = [] for each in geek_features: geek_feats.append(model_input(shape=(None,),name=each)) for each in job_features: job_feats.append(model_input(shape=(None,),name=each)) geek_hash_feats = [hash_bucket(e, len(data[feat_name].value_counts())+vocab_bias) for e,feat_name in zip(geek_feats,geek_features)] job_hash_feats = [hash_bucket(e, len(data[feat_name].value_counts())+vocab_bias) for e,feat_name in zip(job_feats,job_features)] geek_feature_inputs = [sparse_feat(e, len(data[feat_name].value_counts())+vocab_bias, 64) for e,feat_name in zip(geek_hash_feats,geek_features)] geek_feature_columns = [Lambda(lambda x:tf.squeeze(x,[1]))(e) for e in geek_feature_inputs] query_feature_columns = [embedd_feat(shape=bert_encoder_shape,name="query_embedding")] query_history_feature_columns = embedd_feat(shape=(None,embedding_size),name="query_history_embedding") job_feature_inputs = [sparse_feat(e, len(data[feat_name].value_counts())+vocab_bias, 64) for e,feat_name in zip(job_hash_feats,job_features)] job_feature_columns = [Lambda(lambda x:tf.squeeze(x,[1]))(e) for e in job_feature_inputs] title_feature_columns = [embedd_feat(shape=bert_encoder_shape,name="title_embedding")] # query(?,384) with history query(?,?,384) --> Dot-Product Attention query_embeddings = Lambda(lambda x:tf.expand_dims(x,axis=1))(query_feature_columns[0]) # shape:(?,1,384) # layers.Attention()(query_embeddings, query_history_feature_columns) query_history_transpose = Lambda(lambda x:tf.transpose(x,[0,2,1]))(query_history_feature_columns) # shape:(?,384,?) query_matmul = Lambda(lambda x:tf.matmul(x[0],x[1]))([query_embeddings,query_history_transpose]) # shape:(?,1,?) attention_weights = layers.Softmax(name="softmax_layer")(query_matmul) # shape:(?,1,?) 
(?,?,384) query_columns_with_weights = Lambda(lambda x:tf.matmul(x[0],x[1]))([attention_weights,query_history_feature_columns]) # shape:(?,1,384) query_columns_with_weights = Lambda(lambda x:tf.squeeze(x,[1]))(query_columns_with_weights) # geek tower geek_vector_tmp = Lambda(lambda x:K.concatenate(x, axis=-1))(geek_feature_columns+[query_columns_with_weights]) geek_vector = Dense(64, activation="relu")(geek_vector_tmp) geek_vector = Dense(32, activation="relu",kernel_regularizer="l2",name="geek_vector")(geek_vector) # job tower job_vector_tmp = Lambda(lambda x:K.concatenate(x, axis=-1))(job_feature_columns+title_feature_columns) job_vector = Dense(64, activation="relu")(job_vector_tmp) job_vector = Dense(32, activation="relu",kernel_regularizer="l2",name="job_vector")(job_vector) dot_geek_job = Lambda(lambda x:tf.multiply(x[0],x[1]))([geek_vector, job_vector]) dot_geek_job = Lambda(lambda x:tf.reduce_sum(x,axis=1))(dot_geek_job) dot_geek_job = Lambda(lambda x:tf.expand_dims(x,1))(dot_geek_job) output = layers.Dense(1, activation="sigmoid", name="output")(dot_geek_job) model = Model(inputs=geek_feats+job_feats+query_feature_columns+title_feature_columns+[query_history_feature_columns], outputs=output,name="merge") return model def run_training(epochs, geek_features, job_features, train, test,online_eval_data): model = get_recall_model(geek_features, job_features) model.compile("adam","binary_crossentropy",metrics=['binary_crossentropy']) logger.info("### model compiled!") logger.info("### data generating...") train_data_gen = data_generator(data=train, batch_size=batch_size) test_data_gen = data_generator(data=test, batch_size=batch_size) online_data_gen = data_generator(data=online_eval_data, batch_size=batch_size) logger.info("### data generator finished!") early_stop = EarlyStopping(monitor='loss', patience=3, verbose=1) checkpoint = ModelCheckpoint(os.path.join(checkpoint_save_dir, best_model_name), monitor='loss', verbose=1, save_best_only=True, mode='min') now = int(round(time.time()*1000)) TIMESTAMP = time.strftime('%Y-%m-%d_%H_%M_%S.traning',time.localtime(now/1000)) tensorboard_callback = keras.callbacks.TensorBoard(log_dir=os.path.join(base_dir, "logs/"+TIMESTAMP)) callbacks_list = [checkpoint, early_stop, tensorboard_callback] history = model.fit_generator(train_data_gen.forfit(mode="no_batch_negative"), steps_per_epoch=len(train_data_gen), epochs=epochs, callbacks=callbacks_list, verbose=1) model.save(model_path) logger.info("### model saved!") def eval_function(data, data_gen, name): y_true = np.array([[e[1]["label"]] for e in data]) y_pred = model.predict_generator(data_gen.forfit(mode="no_batch_negative"),steps=len(data_gen), verbose=1) logger.info("{} Loss: {}".format(name, round(log_loss(y_true, y_pred), 4))) logger.info("{} AUC : {}".format(name, round(roc_auc_score(y_true, y_pred), 4))) eval_function(test, test_data_gen, "Test_offline") eval_function(online_eval_data, online_data_gen, "Test_online") if __name__ == "__main__": run_training(epochs, geek_features, job_features, train, test, online_eval_data) logger.info("### model train over!")
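Point 6 mentions in-batch sampling as one option for negatives. As a reference only (my own sketch, not part of the training script above), an in-batch variant can treat every other item in the batch as a negative for each user and apply a softmax-style loss over the user-item similarity matrix; `temperature` is an illustrative hyperparameter.

    import tensorflow as tf
    from keras import backend as K

    def in_batch_softmax_loss(user_vecs, item_vecs, temperature=0.05):
        """In-batch negatives: row i of user_vecs forms a positive pair with row i of
        item_vecs; every other row in the batch serves as a negative."""
        user_vecs = K.l2_normalize(user_vecs, axis=-1)
        item_vecs = K.l2_normalize(item_vecs, axis=-1)
        logits = tf.matmul(user_vecs, item_vecs, transpose_b=True) / temperature  # (batch, batch) similarity matrix
        labels = tf.range(tf.shape(logits)[0])                                    # positives sit on the diagonal
        return K.mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))

Such a term could be attached to the two tower outputs (geek_vector / job_vector) via model.add_loss instead of the pointwise sigmoid head used above.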
The code was written in a hurry and is a bit messy; I will clean it up when I have time.
Open issue: the loss needs to be redesigned as a hinge loss over triplet-formatted data (a rough sketch is given below).
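A possible direction for this open issue (my sketch only, not a finalized design): arrange each training example as a triplet (user, positive item, negative item), score both items with the two towers, and apply a margin-based hinge loss; the margin value 0.3 is an illustrative choice.

    from keras import backend as K

    def triplet_hinge_loss(margin=0.3):
        """Hinge loss over packed [positive_score, negative_score] pairs.
        y_true is unused; y_pred has shape (batch, 2)."""
        def loss(y_true, y_pred):
            pos_score = y_pred[:, 0]
            neg_score = y_pred[:, 1]
            return K.mean(K.maximum(0., margin - pos_score + neg_score))
        return loss

    # Usage sketch: build a model that feeds (user, positive item, negative item) through the
    # towers, concatenates the two dot-product scores into a (batch, 2) output, then
    # model.compile(optimizer="adam", loss=triplet_hinge_loss(0.3)).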
Always keep in mind the kind of person you want to become!