Model-Based Recall with SimCSE
dataset
unsupervised
import math
import random
import time

import numpy as np
import tensorflow as tf


class UnsuperviseData(tf.keras.utils.Sequence):
    def __init__(self, x_set, batch_size):
        self.x = x_set
        self.batch_size = batch_size

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        n = len(batch_x)  # the last batch may be smaller than batch_size
        # duplicate the batch and interleave the copies so every sentence appears
        # twice in a row ([a, b, c] -> [a, a, b, b, c, c]); the two copies get
        # different dropout masks in the encoder and form a positive pair
        batch_x = batch_x + batch_x
        bx = np.array([batch_x[i::n] for i in range(n)]).flatten().tolist()
        return self._tokenizer(bx)

    def _tokenizer(self, x):
        # `tokenizer` is loaded in the model section below
        return tokenizer(x, max_length=50, padding=True, truncation=True, return_tensors="tf")
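A quick sanity check of the batch layout (a sketch with a toy corpus; it assumes the `tokenizer` loaded in the model section below is already available). Each batch should contain every sentence twice in a row, so rows 2k and 2k+1 of the encoder output form a positive pair:

docs = ["手机壳 iphone13", "蓝牙耳机 降噪", "保温杯 500ml"]
ds = UnsuperviseData(docs, batch_size=3)
batch = ds[0]                    # tokenized [a, a, b, b, c, c]
print(batch["input_ids"].shape)  # (6, seq_len): 2 * batch_size rows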
supervised
class SuperviseData(tf.keras.utils.Sequence):
    def __init__(self, query_set, doc_set, corpus, batch_size):
        self.querys = query_set
        self.docs = doc_set
        self.corpus = corpus
        self.batch_size = batch_size
        self.size = len(self.corpus)

    def __len__(self):
        return math.ceil(len(self.querys) / self.batch_size)

    def __getitem__(self, idx):
        batch_query = self.querys[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_doc = self.docs[idx * self.batch_size:(idx + 1) * self.batch_size]
        n = len(batch_query)  # the last batch may be smaller than batch_size
        # naive in-batch negatives: rotate the positive docs by a random nonzero
        # offset, so each query gets some other query's positive doc as its negative
        randix = random.randint(1, n - 1) if n > 1 else 0
        neg_doc = batch_doc[randix:] + batch_doc[:randix]
        # flatten to [query_0, pos_0, neg_0, query_1, pos_1, neg_1, ...]
        bx = np.array([(batch_query[i], batch_doc[i], neg_doc[i]) for i in range(n)]).flatten().tolist()
        return self._tokenizer(bx)

    def _tokenizer(self, inputs):
        return tokenizer(inputs, max_length=50, padding=True, truncation=True, return_tensors="tf")
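Same idea for the supervised data (a sketch with made-up query/doc pairs): every three consecutive rows of a batch are (query, positive doc, in-batch negative), which is exactly the layout supervise_loss below expects:

queries = ["苹果 手机壳", "降噪 耳机"]
pos_docs = ["手机壳 iphone13", "蓝牙耳机 降噪"]
ds = SuperviseData(queries, pos_docs, corpus=pos_docs, batch_size=2)
batch = ds[0]                    # tokenized [q0, d0+, d0-, q1, d1+, d1-]
print(batch["input_ids"].shape)  # (6, seq_len): 3 * batch_size rows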
loss
The core of contrastive learning is really the loss, so here is a record of its TensorFlow implementation.
Assume the embedding dimension is 3:
y_pred = tf.random.uniform((6, 3))

def unsupervise_loss(y_pred, alpha=0.05):
    # rows (2i, 2i+1) encode the same sentence, so each row's label is its neighbour
    idxs = tf.range(y_pred.shape[0])
    y_true = idxs + 1 - idxs % 2 * 2  # [1 0 3 2 5 4]
    y_pred = tf.math.l2_normalize(y_pred, axis=1)
    similarities = tf.matmul(y_pred, y_pred, adjoint_b=True)
    # mask the diagonal so a row is never scored against itself
    similarities = similarities - tf.eye(tf.shape(y_pred)[0]) * 1e12
    similarities = similarities / alpha  # (6, 6)
    # softmax cross-entropy over the other rows, one value per row -> (6,)
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, similarities, from_logits=True)
    return tf.reduce_mean(loss)
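Quick check on the dummy embeddings above: the loss is a scalar, and it should drop when the two rows of each pair really are the same vector, since the positive column then has cosine similarity 1:

print(unsupervise_loss(y_pred))  # scalar loss on random vectors
# identical pairs -> the positive column dominates, so the loss should be lower
paired = tf.repeat(tf.random.uniform((3, 3)), repeats=2, axis=0)
print(unsupervise_loss(paired))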
def supervise_loss(y_pred, alpha=0.05):
    # rows come in triplets (query, positive doc, negative doc)
    row = tf.range(0, y_pred.shape[0], 3)             # query rows: 0 3
    col = tf.range(y_pred.shape[0])
    col = tf.squeeze(tf.where(col % 3 != 0), axis=1)  # doc rows: 1 2 4 5
    # after gathering, query i's positive doc sits in column 2*i
    y_true = tf.range(0, len(col), 2)                 # [0 2]
    y_pred = tf.math.l2_normalize(y_pred, axis=1)
    similarities = tf.matmul(y_pred, y_pred, adjoint_b=True)
    similarities = tf.gather(similarities, row, axis=0)  # keep query rows
    similarities = tf.gather(similarities, col, axis=1)  # keep doc columns
    similarities = similarities / alpha               # (2, 4)
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, similarities, from_logits=True)
    return tf.reduce_mean(loss)
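Same check for the supervised loss: 6 rows are read as 2 (query, pos, neg) triplets, the gathered similarity matrix is (2, 4), and the labels point at columns 0 and 2:

y_pred_triplets = tf.random.uniform((6, 3))  # 2 triplets of 3-dim embeddings
print(supervise_loss(y_pred_triplets))       # scalar loss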
model
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)
# backbone = TFAutoModel.from_pretrained(MODEL_NAME)

class baseModel(tf.keras.Model):
    def __init__(self, MODEL_NAME, finetune=False):
        super().__init__()
        self.backbone = TFAutoModel.from_pretrained(MODEL_NAME)
        if not finetune:
            # freeze the backbone and only train the projection head
            self.backbone.trainable = False
            print("bert close")
        self.drop = tf.keras.layers.Dropout(0.2)
        self.dense_layer = tf.keras.layers.Dense(128)

    def call(self, inputs, training=False):
        x = self.backbone(inputs)[1]  # pooler output ([CLS] representation)
        # x = self.drop(x)
        x = self.dense_layer(x)
        return x

model = baseModel(MODEL_NAME, finetune=False)
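A quick smoke test of the encoder (a sketch; the sentences are made up): tokenize a couple of texts and confirm we get one 128-dim embedding per text:

toks = tokenizer(["蓝牙耳机 降噪", "保温杯 500ml"], max_length=50, padding=True, truncation=True, return_tensors="tf")
emb = model(toks)
print(emb.shape)  # (2, 128)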
train
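Both loops below update the model through an `optimizer` that the snippets never define; a minimal sketch, assuming plain Adam (the learning rate is an illustrative choice, not from the source):

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)  # assumption: optimizer/lr not specified in the original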
unsupervised train
epochs = 5
batch_size = 64
t0 = time.time()
for i in range(epochs):
    ds = UnsuperviseData(doc_df["doc_content"].values.tolist(), batch_size)
    print(f"epoch {i}, training")
    for step, batchx in enumerate(ds):
        with tf.GradientTape() as tape:
            y_pred = model(batchx, training=True)
            loss = unsupervise_loss(y_pred)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        if step % 50 == 0:
            print("Iteration step: {}; Loss: {:.3f}, spend time: {:.3f}".format(step, loss, time.time() - t0))
supervised train
epochs = 5
batch_size = 32
t0 = time.time()
for i in range(epochs):
    ds = SuperviseData(train_data["query_content"].values.tolist(), train_data["doc_content"].values.tolist(), doc_df["doc_content"].values.tolist(), batch_size)
    print(f"epoch {i}, training")
    for step, batchx in enumerate(ds):
        with tf.GradientTape() as tape:
            y_pred = model(batchx, training=True)
            loss = supervise_loss(y_pred)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        if step % 50 == 0:
            print("Iteration step: {}; Loss: {:.3f}, spend time: {:.3f}".format(step, loss, time.time() - t0))
references:
E-commerce search recall. https://github.com/muyuuuu/E-commerce-Search-Recall?spm=5176.21852664.0.0.79006ebf02bd2j
SimCSE pytorch. https://github.com/zhengyanzhao1997/NLP-model/tree/main/model/model/Torch_model/SimCSE-Chinese
A walkthrough of the SimCSE loss implementation. https://zhuanlan.zhihu.com/p/377862950
Introduction to SimCSE and its core code: unsupervised text embeddings. https://zhuanlan.zhihu.com/p/462763973
in-batch negative sampling. https://github.com/facebookresearch/DPR/issues/110
- assume that batch_size=4 and hard_negatives=1