Personalized Search Recall Model Design -- Training Part

Model structure: a two-tower architecture, consisting of a user tower and an item tower.

Key points:

1. The user tower covers the user's basic attributes (age, gender, job expectations, etc.) plus behavioral data: the current search query and the history queries from the previous 7 days.

2. The item tower covers the item data (dictated by the characteristics of this industry): title, skills, and description, all in plain-text form.

3. All text is encoded with BERT (pruned to 3 layers); the BERT encoder is trained separately, learned with a contrastive-learning approach.

4. The current query and the history queries go through Dot-Product Attention to obtain a deeper semantic representation vector.

5. For feature representation, IDs are produced by hashing instead of the original dictionary-based ID mapping, which removes the need to store the mapping dictionary and improves the representation of unseen values (see the first sketch after this list).

6. Negative sampling can use in-batch sampling or pre-built negative samples (see the second sketch after this list).
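
To make point 5 concrete, here is a minimal, hedged sketch (not taken from the training script below) of hashing a raw string feature directly into an embedding row. The bucket count and embedding size are illustrative assumptions; the script itself derives the bucket count from value_counts() plus a bias.

import tensorflow as tf
from keras.layers import Input, Embedding, Lambda

num_buckets = 1000   # assumed bucket count; the script uses value_counts() + vocab_bias
city = Input(shape=(1,), name="expect_city_code", dtype="string")
# bucket 0 is reserved by the "+ 1", mirroring the hash_bucket helper in the script
city_id = Lambda(lambda x: tf.strings.to_hash_bucket_fast(x, num_buckets - 1) + 1)(city)
city_emb = Embedding(num_buckets, 16)(city_id)   # unseen strings still land in some bucket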
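
And a hedged sketch of the in-batch option in point 6 (the script below also supports pre-built negatives): every other item in a batch of positive (user, item) pairs serves as a negative, scored with a softmax over the batch-wise similarity matrix. The function name and temperature value are assumptions, not the script's implementation.

import tensorflow as tf

def in_batch_softmax_loss(user_vecs, item_vecs, temperature=0.05):
    """user_vecs, item_vecs: (batch, dim) L2-normalized tower outputs of positive pairs."""
    logits = tf.matmul(user_vecs, item_vecs, transpose_b=True) / temperature  # (batch, batch)
    labels = tf.range(tf.shape(logits)[0])   # diagonal entries are the true pairs
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))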

The main code is as follows:

"""
    Function: embedding-recall model training -- train the model
"""
import os
import sys
os.system('pip install -i https://pypi.tuna.tsinghua.edu.cn/simple keras==2.3.1')
import csv
import time
import math
import copy
import json
import keras
import logging
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from keras.models import Model
from keras import layers
from keras.layers import Input,Dense,Lambda,concatenate,GlobalAveragePooling1D,Embedding
import keras.backend.tensorflow_backend as KTF
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.optimizers import Adam, extend_with_weight_decay
from bert4keras.models import build_transformer_model
from bert4keras.backend import set_gelu, K, sequence_masking, pool1d
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.callbacks import ModelCheckpoint,EarlyStopping

logger = logging.getLogger("simple_logger")
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.info("info message.")

epochs = 5
maxlen = 64
batch_size= 256
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
KTF.set_session(session)

path = 'bert_file/chinese_roformer-sim-char-ft_L-6_H-384_A-6/'
config_path = path+'bert_config.json'
checkpoint_path = path+'bert_model.ckpt'
dict_path = path+'vocab.txt'

base_dir = r"/***/embedding_recall"
data_dir = r"data/20220208_data"

model_path = os.path.join(base_dir, r"model/online_recall_model.h5")
geek_encoder_path = os.path.join(base_dir, "model/online_geek_encoder.h5")
job_encoder_path = os.path.join(base_dir,"model/online_job_encoder.h5")
new_train_path = os.path.join(base_dir, data_dir, "train_data.csv")
new_eval_path = os.path.join(base_dir, data_dir, "eval_data.csv")

checkpoint_save_dir = os.path.join(base_dir, "ckpt")
best_model_name = "best_model.h5"
if not os.path.exists(checkpoint_save_dir):
    os.makedirs(checkpoint_save_dir)

geek_names = ["age","gender","degree_code","work_years","work_position_code","expect_position_code","expect_city_code","expect_low_salary",
                "expect_high_salary","expect_type","expect_position_type","expect_sub_location"]
job_names = ["job_position_code","job_type","job_city_code",
            "job_degree_code","job_low_salary","job_high_salary","job_salary_type","job_salary_month","area_id","area_city_code"]

def create_encode():
    def bert_dynamic_fusion(model, layer_num):
        all_encoder_layers = []
        for i in range(layer_num - 1):
            all_encoder_layers.append(model.get_layer('Transformer-' + str(i) + '-FeedForward-Norm').output)
        layer_logits = []
        for i, layer in enumerate(all_encoder_layers):
            layer_logits.append(
                keras.layers.Dense(
                    1,
                    kernel_initializer=keras.initializers.TruncatedNormal(stddev=0.02),
                    name="layer_logit%d" % i
                )(layer)
            )
        layer_logits = Lambda(concatenate, arguments={'axis': 2})(layer_logits)
        layer_dist = Lambda(lambda x: tf.nn.softmax(x))(layer_logits)
        seq_out = Lambda(lambda x: K.concatenate(x, axis=2))(
            [Lambda(lambda x: K.expand_dims(x, axis=2))(x) for x in all_encoder_layers])
        pooled_output = Lambda(lambda x: tf.matmul(tf.expand_dims(x[0], axis=2), x[1]))([layer_dist, seq_out])
        pooled_output = Lambda(lambda x: K.squeeze(x, axis=2))(pooled_output)
        return pooled_output
    bert1 = build_transformer_model(
        config_path,
        checkpoint_path,
        model='roformer',
        return_keras_model=False,
        dropout_rate=0.2,
    )
    bert_output = bert_dynamic_fusion(bert1.model, 6)
    x_avg = GlobalAveragePooling1D()(bert_output)
    v1 = Lambda(lambda x: K.l2_normalize(x, 1), name="output")(x_avg)
    encode = keras.models.Model(bert1.model.input, v1)
    encode.load_weights("/simbert+userfeature/encode.weights")
    return encode
encoder_a = create_encode()
logger.info("### bert model loaded!")

# 3. Extract the query/title encoding vectors and store them offline
data = pd.read_csv(new_train_path)
eval_data = pd.read_csv(new_eval_path)
querys = data["query"].unique().tolist() + eval_data["query"].unique().tolist()
titles = data["title_skill_keyword"].unique().tolist() + eval_data["title_skill_keyword"].unique().tolist()
history_querys = data["history_query"].unique().tolist()
new_history_querys = []
for each in history_querys:
    new_history_querys.extend(eval(each))
new_history_querys = list(set(new_history_querys))
querys = querys + new_history_querys
query_embedd_dict = {}
title_embedd_dict = {}

class bert_vector_generator(DataGenerator):
    def __iter__(self, random=False):
        tokenizer = Tokenizer(dict_path, do_lower_case=True)
        batch_token_ids, batch_segment_ids = [], []
        for is_end, text in self.sample(random):
            token_id, segment_id = tokenizer.encode(text, maxlen=64)
            batch_token_ids.append(token_id)
            batch_segment_ids.append(segment_id)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids]
                batch_token_ids, batch_segment_ids = [], []
    def forpred(self, random=False):
        while True:
            for d in self.__iter__(random):
                yield d

def get_text_vecs(tag_data, encoder, batch_size):
    """
    """
    res = np.array([])
    data_gen = bert_vector_generator(data=tag_data, batch_size=batch_size)
    vecs = encoder.predict_generator(data_gen.forpred(), steps=len(data_gen), verbose=1)
    return vecs

query_vecs = get_text_vecs(querys,encoder_a,256)
title_vecs = get_text_vecs(titles,encoder_a,256)
for k,v in zip(querys,query_vecs):
    query_embedd_dict[k]=v
for k,v in zip(titles,title_vecs):
    title_embedd_dict[k]=v
logger.info("### query_embedd_dict:{} title_embedd_dict:{}".format(len(query_embedd_dict), len(title_embedd_dict)))
logger.info("data:{}; eval_data{}".format(len(data),len(eval_data)))

# 4. Data preprocessing
geek_features = ["G"+str(i) for i in range(1,len(geek_names)+1)]
job_features = ["J"+str(i) for i in range(1,len(job_names)+1)]
# Type conversion: the hash op only accepts strings
for col in geek_features+job_features:
    data[col] = data[col].astype(str)
    eval_data[col] = eval_data[col].astype(str)
    
train_data = [e for e in data.iterrows()]
online_eval_data = [e for e in eval_data.iterrows()]
train, test = train_test_split(train_data, test_size=0.02, random_state=2022)
logger.info("### train:{} test:{} online:{}".format(len(train),len(test),len(eval_data)))

class data_generator(DataGenerator):
    def __iter__(self, random=False,mode=""):
        if mode=="batch_negative":
            data_dict = {}
            label_dict = {}
            labels = []
            query_feats = []
            title_feats = []
            G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12 = [],[],[],[],[],[],[],[],[],[],[],[]
            J1,J2,J3,J4,J5,J6,J7,J8,J9,J10 = [],[],[],[],[],[],[],[],[],[]
            negative_names = ['J1','J2','J3','J4','J5','J6','J7','J8','J9','J10']
            for is_end, data in self.sample(random):
                cur_label = data[1]["label"]
                if cur_label==1:
                    for _,s in zip(range(2),self.sample(random=True)):
                        if len(labels)<self.batch_size:
                            for name in geek_features:
                                exec("{}.append({})".format(name,[data[1][name]]))
                            for each_name in negative_names:
                                exec("{}.append({})".format(each_name, [s[1][1][each_name]]))
                            query_feats.append(query_embedd_dict[data[1]["query"]])
                            title_feats.append(title_embedd_dict[s[1][1]["title_skill_keyword"]])
                            labels.append([0])
                else:
                    if len(labels)<self.batch_size:
                        for name in geek_features+job_features:
                            exec("{}.append({})".format(name,[data[1][name]]))
                        query_feats.append(query_embedd_dict[data[1]["query"]])
                        title_feats.append(title_embedd_dict[data[1]["title_skill_keyword"]])
                        labels.append([data[1]["label"]])
                if len(labels) == self.batch_size or is_end:
                    for e in geek_features+job_features:
                        data_dict[e]=np.array(eval(e))
                    data_dict["query_embedding"] = np.array(query_feats)
                    data_dict["title_embedding"] = np.array(title_feats)
                    label_dict["output"] = np.array(labels)
                    yield(((data_dict, label_dict)))
                    data_dict = {}
                    label_dict = {}
                    query_feats = []
                    title_feats = []
                    G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12 = [],[],[],[],[],[],[],[],[],[],[],[]
                    J1,J2,J3,J4,J5,J6,J7,J8,J9,J10 = [],[],[],[],[],[],[],[],[],[]
                    labels = []
        else:
            query_span = 10
            data_dict = {}
            label_dict = {}
            labels = []
            query_feats = []
            query_hist_feats = []
            title_feats = []
            G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12 = [],[],[],[],[],[],[],[],[],[],[],[]
            J1,J2,J3,J4,J5,J6,J7,J8,J9,J10 = [],[],[],[],[],[],[],[],[],[]
            negative_names = ['J1','J2','J3','J4','J5','J6','J7','J8','J9','J10']
            for is_end, data in self.sample(random):
                cur_label = data[1]["label"]
                for name in geek_features+job_features:
                    exec("{}.append({})".format(name,[data[1][name]]))
                query_feats.append(query_embedd_dict[data[1]["query"]])
                title_feats.append(title_embedd_dict[data[1]["title_skill_keyword"]])
                query_temp = []
                for each_query in eval(data[1]["history_query"]):
                    try:
                        query_temp.append(query_embedd_dict[each_query])
                    except Exception as e:
                        query_temp.append(np.zeros(shape=(384,)))
                query_temp = query_temp[:query_span]  # keep at most query_span history queries
                padding_size = query_span - len(query_temp)
                for i in range(padding_size):
                    query_temp.append(np.zeros(shape=(384,)))
                query_hist_feats.append(query_temp)
                labels.append([data[1]["label"]])
                if len(labels) == self.batch_size or is_end:
                    for e in geek_features+job_features:
                        data_dict[e]=np.array(eval(e))
                    data_dict["query_embedding"] = np.array(query_feats)
                    data_dict["title_embedding"] = np.array(title_feats)
                    data_dict["query_history_embedding"] = np.array(query_hist_feats)
                    label_dict["output"] = np.array(labels)
                    yield((data_dict, label_dict))
                    data_dict = {}
                    label_dict = {}
                    query_feats = []
                    title_feats = []
                    query_hist_feats = []
                    G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12 = [],[],[],[],[],[],[],[],[],[],[],[]
                    J1,J2,J3,J4,J5,J6,J7,J8,J9,J10 = [],[],[],[],[],[],[],[],[],[]
                    labels = []
            
    def forfit(self, random=False, mode="train"):
        while True:
            for d in self.__iter__(random,mode=mode):
                yield d

# 5. Build the main model
def make_or_restore_model():
    # Either restore the latest checkpointed model, or create a fresh compiled one
    # if there is no checkpoint available.
    checkpoints = [os.path.join(checkpoint_save_dir, name) for name in os.listdir(checkpoint_save_dir)]
    if checkpoints:
        latest_checkpoint = max(checkpoints, key=os.path.getctime)
        logger.info("Restoring from {}".format(latest_checkpoint))
        return keras.models.load_model(latest_checkpoint)
    logger.info("Creating a new model")
    model = get_recall_model(geek_features, job_features)
    model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
    return model

def get_recall_model(geek_features, job_features):
    embedding_size = 384
    bert_encoder_shape = (embedding_size,)
    vocab_bias = 50
    def model_input(shape,name):
        return Input(shape=shape,name=name,dtype="string")
    def sparse_feat(feat,vocab_size,embedding_dim):
        return Embedding(vocab_size, embedding_dim)(feat)
    def dense_feat(feat):
        return Lambda(lambda x:tf.expand_dims(x, axis=2))(feat)
    def embedd_feat(shape,name):
        return Input(shape=shape, name=name)
    def hash_bucket(x, vocab_size_max):
        return Lambda(lambda x: tf.strings.to_hash_bucket_fast(x, vocab_size_max - 1) + 1)(x)

    geek_feats = []
    job_feats = []
    for each in geek_features:
        geek_feats.append(model_input(shape=(None,),name=each))
    for each in job_features:
        job_feats.append(model_input(shape=(None,),name=each))

    geek_hash_feats = [hash_bucket(e, len(data[feat_name].value_counts())+vocab_bias) for e,feat_name in zip(geek_feats,geek_features)]
    job_hash_feats = [hash_bucket(e, len(data[feat_name].value_counts())+vocab_bias) for e,feat_name in zip(job_feats,job_features)]

    geek_feature_inputs = [sparse_feat(e, len(data[feat_name].value_counts())+vocab_bias, 64) for e,feat_name in zip(geek_hash_feats,geek_features)]
    geek_feature_columns = [Lambda(lambda x:tf.squeeze(x,[1]))(e) for e in geek_feature_inputs]
    query_feature_columns = [embedd_feat(shape=bert_encoder_shape,name="query_embedding")]
    query_history_feature_columns = embedd_feat(shape=(None,embedding_size),name="query_history_embedding")
    job_feature_inputs = [sparse_feat(e, len(data[feat_name].value_counts())+vocab_bias, 64) for e,feat_name in zip(job_hash_feats,job_features)]
    job_feature_columns = [Lambda(lambda x:tf.squeeze(x,[1]))(e) for e in job_feature_inputs]
    title_feature_columns = [embedd_feat(shape=bert_encoder_shape,name="title_embedding")]

    # query(?,384) with history query(?,?,384) --> Dot-Product Attention
    query_embeddings =  Lambda(lambda x:tf.expand_dims(x,axis=1))(query_feature_columns[0])   # shape:(?,1,384)
    # layers.Attention()(query_embeddings, query_history_feature_columns)
    query_history_transpose = Lambda(lambda x:tf.transpose(x,[0,2,1]))(query_history_feature_columns)  # shape:(?,384,?)
    query_matmul = Lambda(lambda x:tf.matmul(x[0],x[1]))([query_embeddings,query_history_transpose])   # shape:(?,1,?)
    attention_weights = layers.Softmax(name="softmax_layer")(query_matmul)  # shape:(?,1,?)
    query_columns_with_weights = Lambda(lambda x:tf.matmul(x[0],x[1]))([attention_weights,query_history_feature_columns])   # shape:(?,1,384)
    query_columns_with_weights = Lambda(lambda x:tf.squeeze(x,[1]))(query_columns_with_weights)

    # geek tower
    geek_vector_tmp = Lambda(lambda x:K.concatenate(x, axis=-1))(geek_feature_columns+[query_columns_with_weights])
    geek_vector = Dense(64, activation="relu")(geek_vector_tmp)
    geek_vector = Dense(32, activation="relu",kernel_regularizer="l2",name="geek_vector")(geek_vector)

    # job tower
    job_vector_tmp = Lambda(lambda x:K.concatenate(x, axis=-1))(job_feature_columns+title_feature_columns)
    job_vector = Dense(64, activation="relu")(job_vector_tmp)
    job_vector = Dense(32, activation="relu",kernel_regularizer="l2",name="job_vector")(job_vector)

    dot_geek_job = Lambda(lambda x:tf.multiply(x[0],x[1]))([geek_vector, job_vector])
    dot_geek_job = Lambda(lambda x:tf.reduce_sum(x,axis=1))(dot_geek_job)
    dot_geek_job = Lambda(lambda x:tf.expand_dims(x,1))(dot_geek_job)

    output = layers.Dense(1, activation="sigmoid", name="output")(dot_geek_job)
    model = Model(inputs=geek_feats+job_feats+query_feature_columns+title_feature_columns+[query_history_feature_columns], outputs=output,name="merge")
    return model

    
def run_training(epochs, geek_features, job_features, train, test,online_eval_data):
    model = get_recall_model(geek_features, job_features)
    model.compile("adam","binary_crossentropy",metrics=['binary_crossentropy'])
    logger.info("### model compiled!")
    
    logger.info("### data generating...")
    train_data_gen = data_generator(data=train, batch_size=batch_size)
    test_data_gen = data_generator(data=test, batch_size=batch_size)
    online_data_gen = data_generator(data=online_eval_data, batch_size=batch_size)
    logger.info("### data generator finished!")
    
    early_stop = EarlyStopping(monitor='loss', patience=3, verbose=1)
    checkpoint = ModelCheckpoint(os.path.join(checkpoint_save_dir, best_model_name),
                             monitor='loss', verbose=1, save_best_only=True, mode='min')
    now = int(round(time.time()*1000))
    TIMESTAMP = time.strftime('%Y-%m-%d_%H_%M_%S.training', time.localtime(now/1000))
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=os.path.join(base_dir, "logs/"+TIMESTAMP))
    callbacks_list = [checkpoint, early_stop, tensorboard_callback]
    history = model.fit_generator(train_data_gen.forfit(mode="no_batch_negative"), steps_per_epoch=len(train_data_gen), epochs=epochs, callbacks=callbacks_list, verbose=1)
    model.save(model_path)
    logger.info("### model saved!")

    def eval_function(data, data_gen, name):
        y_true = np.array([[e[1]["label"]] for e in data])
        y_pred = model.predict_generator(data_gen.forfit(mode="no_batch_negative"),steps=len(data_gen), verbose=1)
        logger.info("{} Loss: {}".format(name, round(log_loss(y_true, y_pred), 4)))
        logger.info("{} AUC : {}".format(name, round(roc_auc_score(y_true, y_pred), 4)))
    eval_function(test, test_data_gen, "Test_offline")
    eval_function(online_eval_data, online_data_gen, "Test_online")

if __name__ == "__main__":
    run_training(epochs, geek_features, job_features, train, test, online_eval_data)
    logger.info("### model train over!")

The code was written in a hurry and is a bit messy; I will tidy it up when I have time.

Open issue: the loss needs to be redesigned as a hinge loss over triplet-formatted data; a hedged sketch follows.
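
A minimal sketch of that direction, assuming batches of (user, positive item, negative item) tower outputs; the function name and margin value are placeholders, not the final design.

from keras import backend as K

def triplet_hinge_loss(user_vec, pos_item_vec, neg_item_vec, margin=0.2):
    # All inputs: (batch, dim) tower outputs; a higher dot product means a better match.
    pos_score = K.sum(user_vec * pos_item_vec, axis=1)
    neg_score = K.sum(user_vec * neg_item_vec, axis=1)
    # Penalize whenever the negative comes within `margin` of the positive.
    return K.mean(K.maximum(0.0, margin - pos_score + neg_score))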
