基于macbert的语义模型训练记录
模型训练代码:https://github.com/zhoujx4/NLP-Series-sentence-embeddings
预训练macbert:https://github.com/ymcui/MacBERT
抱抱脸模型下载:https://huggingface.co/hfl/chinese-macbert-base/tree/main
公开数据集:https://github.com/zejunwang1/CSTS/tree/main
tensorflow2版本与torch版本互转:https://github.com/huggingface/transformers/tree/main/src/transformers
基于知识库构建数据集:将不同意图下的相似问两两组合(笛卡尔积)生成句对;需注意清洗脏数据,并对特殊意图做单独处理
pat_pru = re.compile(r'[0-9()【】%*#+-\.\\\/:=:__。\s、;;“”""''’‘??!!<《》>^&{}|=……\s]')
# Example: load the trained sentence-embedding model and encode sentences.
from sentence_transformers import SentenceTransformer, LoggingHandler
# Directory where the fine-tuned model was saved during training.
model_save_path ="./data/output/xxx"
model = SentenceTransformer(model_save_path)
# Encode a batch of sentences into dense embedding vectors
# (one vector per input sentence).
emb=model.encode(["你好","明天"])
#coding=utf-8
"""
Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.
"""
import numpy as np
import tensorflow.compat.v1 as tf
import torch
from transformers import BertModel
import os
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):
    """Convert a Huggingface PyTorch BertModel into a TensorFlow 1.x checkpoint.

    :param model: BertModel PyTorch model instance to be converted
    :param ckpt_dir: TensorFlow model directory (created if missing)
    :param model_name: checkpoint base name; a ".ckpt" suffix is appended
    :return: None (writes the checkpoint files into ``ckpt_dir``)

    Currently supported Huggingface models:

        Y BertModel
        N BertForMaskedLM
        N BertForPreTraining
        N BertForMultipleChoice
        N BertForNextSentencePrediction
        N BertForSequenceClassification
        N BertForQuestionAnswering
    """
    # PyTorch stores Linear weights as (out_features, in_features); the TF
    # checkpoint layout expects the transpose, so any variable whose name
    # contains one of these substrings is transposed before writing.
    tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value")
    # Ordered name rewrites from PyTorch state-dict keys to TF variable names.
    # Order matters: "." -> "/" must run before the LayerNorm/weight rules.
    var_map = (
        ("layer.", "layer_"),
        ("word_embeddings.weight", "word_embeddings"),
        ("position_embeddings.weight", "position_embeddings"),
        ("token_type_embeddings.weight", "token_type_embeddings"),
        (".", "/"),
        ("LayerNorm/weight", "LayerNorm/gamma"),
        ("LayerNorm/bias", "LayerNorm/beta"),
        ("weight", "kernel"),
    )

    # exist_ok avoids the isdir-then-makedirs race of the original check.
    os.makedirs(ckpt_dir, exist_ok=True)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        # Apply every rewrite in var_map order, then prefix with the
        # standard "bert/" scope used by the original TF BERT release.
        for patt, repl in var_map:
            name = name.replace(patt, repl)
        return "bert/{}".format(name)

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        # Create a zero-initialized TF variable matching the numpy tensor's
        # dtype and shape; the real weights are copied in by the caller.
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        # NOTE: the original also ran `session.run(tf_var)` here — a fetch
        # whose result was discarded; removed as a no-op.
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            if any(x in var_name for x in tensors_to_transpose):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            # Read the value back to verify the copy round-trips exactly.
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))
        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_").replace(".ckpt", "") + ".ckpt"))
def convert(pytorch_bin_path: str, pytorch_bin_model: str, tf_ckpt_path: str, tf_ckpt_model: str):
    """Load a PyTorch BERT checkpoint and export it as a TF 1.x checkpoint.

    :param pytorch_bin_path: directory containing the PyTorch model files
    :param pytorch_bin_model: filename of the .bin weights inside that directory
    :param tf_ckpt_path: output directory for the TensorFlow checkpoint
    :param tf_ckpt_model: base name of the TensorFlow checkpoint
    """
    # Force the weights onto CPU so conversion works on GPU-less machines.
    weights = torch.load(os.path.join(pytorch_bin_path, pytorch_bin_model), map_location='cpu')
    model = BertModel.from_pretrained(
        pretrained_model_name_or_path=pytorch_bin_path,
        state_dict=weights,
    )
    convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=tf_ckpt_path, model_name=tf_ckpt_model)
if __name__ == '__main__':
    # Source: directory and filename of the trained PyTorch weights.
    pytorch_dir = './data/output/1000'
    pytorch_file = 'pytorch_model.bin'
    # Target: directory and base name for the TensorFlow checkpoint.
    tf_dir = './bert_v1'
    tf_file = 'bert_model.ckpt'
    convert(pytorch_dir, pytorch_file, tf_dir, tf_file)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人