Exporting BERT to ONNX: An Example
BERT model export
1. Load the PyTorch model via transformers.
2. Create dummy inputs and run a forward inference pass through the model with them, tracing and recording the set of operations along the way.
3. Define dynamic axes on the input and output tensors.
4. Save the graph and the network parameters.
Exporting an NLP model differs little from exporting a CV model; the main thing to watch is that input sequences are not of fixed length, which is handled by the dynamic_axes argument of the export method (see the minimal sketch below).
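To make that concrete, here is a minimal sketch of dynamic_axes on a toy model (the Embedding model, file name, and axis names are illustrative assumptions, not part of the BERT export below): any axis listed in the mapping stays symbolic in the exported graph instead of being frozen to the dummy input's size.

import torch

toy = torch.nn.Embedding(100, 16)      # toy stand-in for a real network
dummy = torch.randint(0, 100, (1, 8))  # traced with batch=1, seq=8
torch.onnx.export(
    toy, (dummy,), "toy.onnx",
    input_names=["input_ids"],
    output_names=["embeddings"],
    # Without dynamic_axes the graph would only accept shape (1, 8);
    # with it, batch size and sequence length are free at runtime.
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "embeddings": {0: "batch", 1: "sequence"},
    },
    opset_version=14,
)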
BERT-Large, Uncased (Whole Word Masking): 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Large, Cased (Whole Word Masking): 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, Uncased: 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Large, Uncased: 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, Cased: 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Large, Cased: 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, Multilingual Cased (New, recommended): 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Base, Multilingual Uncased (Orig, not recommended; use Multilingual Cased instead): 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Base, Chinese: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
The first six are English models, the Multilingual ones are multilingual models, and the last is a Chinese model (character-level).
Uncased means all letters are lowercased before tokenization, while Cased preserves the original casing.
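A quick way to see the Uncased behavior in practice (a small sketch; it just loads the bert-base-uncased tokenizer):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
# The uncased tokenizer lowercases (and strips accents) before wordpiece splitting
print(tok.tokenize("Hello World"))  # ['hello', 'world']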
Loading the model
from transformers import AutoTokenizer, BertConfig, BertModel

# Load the config, the weights (without the pooling layer), and the tokenizer
config = BertConfig.from_pretrained("bert-base-uncased")
print(config)
bert_model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, config=config)
print(bert_model.config)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# BertConfig {
# "_name_or_path": "bert-base-uncased",
# "architectures": [
# "BertForMaskedLM"
# ],
# "attention_probs_dropout_prob": 0.1,
# "classifier_dropout": null,
# "gradient_checkpointing": false,
# "hidden_act": "gelu",
# "hidden_dropout_prob": 0.1,
# "hidden_size": 768,
# "initializer_range": 0.02,
# "intermediate_size": 3072,
# "layer_norm_eps": 1e-12,
# "max_position_embeddings": 512,
# "model_type": "bert",
# "num_attention_heads": 12,
# "num_hidden_layers": 12,
# "pad_token_id": 0,
# "position_embedding_type": "absolute",
# "transformers_version": "4.36.2",
# "type_vocab_size": 2,
# "use_cache": true,
# "vocab_size": 30522
# }
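As a sanity check on the 110M-parameter figure above, the weights can be counted directly (a small sketch; note that with add_pooling_layer=False the pooler weights are excluded, so the count lands slightly below the rounded 110M):

# Count the parameters of the model loaded above
n_params = sum(p.numel() for p in bert_model.parameters())
print(f"{n_params:,}")  # on the order of 109M for bert-base-uncased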
PyTorch model inference
from transformers import AutoModel, AutoConfig, AutoTokenizer
import torch
# Load the model, config, and tokenizer
model = AutoModel.from_pretrained('bert-base-uncased')
config = AutoConfig.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Define a sentence and tokenize it
sentence = 'here is some text to encode'
inputs_pt = tokenizer(sentence, return_tensors='pt')
print(inputs_pt["input_ids"].shape)
outputs = model(**inputs_pt)
print(dir(outputs))
last_hidden_state = outputs.last_hidden_state
pooler_output = outputs.pooler_output
print("Token wise output: {}, Pooled output: {}".format(last_hidden_state.shape, pooler_output.shape))
print(last_hidden_state)
print("---" * 20)
torch.Size([1, 9])
['__annotations__', '__class__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'attentions', 'clear', 'copy', 'cross_attentions', 'fromkeys', 'get', 'hidden_states', 'items', 'keys', 'last_hidden_state', 'move_to_end', 'past_key_values', 'pooler_output', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']
Token wise output: torch.Size([1, 9, 768]), Pooled output: torch.Size([1, 768])
tensor([[[-0.0549, 0.1053, -0.1065, ..., -0.3551, 0.0686, 0.6506],
[-0.5759, -0.3650, -0.1383, ..., -0.6782, 0.2092, -0.1639],
[-0.1641, -0.5597, 0.0150, ..., -0.1603, -0.1346, 0.6216],
...,
[ 0.2448, 0.1254, 0.1587, ..., -0.2749, -0.1163, 0.8809],
[ 0.0481, 0.4950, -0.2827, ..., -0.6097, -0.1212, 0.2527],
[ 0.9046, 0.2137, -0.5897, ..., 0.3040, -0.6172, -0.1950]]],
grad_fn=<NativeLayerNormBackward0>)
------------------------------------------------------------
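The sequence length of 9 comes from the tokenizer, not from the raw word count; a quick check (reusing tokenizer and inputs_pt from the block above):

# The 6-word sentence becomes 9 tokens: [CLS] and [SEP] are added,
# and words missing from the vocabulary (here 'encode') are split into wordpieces
print(tokenizer.convert_ids_to_tokens(inputs_pt["input_ids"][0]))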
Generating the ONNX config
# Use the model config to build an ONNX config
from transformers.onnx.features import FeaturesManager
onnx_config = FeaturesManager._SUPPORTED_MODEL_TYPE['bert']['sequence-classification'](config)
print(onnx_config.inputs.items())
print(onnx_config.outputs.items())
odict_items([('input_ids', {0: 'batch', 1: 'sequence'}), ('attention_mask', {0: 'batch', 1: 'sequence'}), ('token_type_ids', {0: 'batch', 1: 'sequence'})])
odict_items([('logits', {0: 'batch'})])
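The 'sequence-classification' string selects which head (and thus which outputs) the ONNX config describes. Which features exist for a given model type can be looked up as well (a sketch; API as in the transformers 4.x transformers.onnx module used throughout this walkthrough):

from transformers.onnx.features import FeaturesManager

# Maps feature name -> OnnxConfig constructor for that head
print(list(FeaturesManager.get_supported_features_for_model_type('bert')))
# e.g. ['default', 'masked-lm', 'sequence-classification', ...]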
# Generating dummy_inputs requires the tokenizer
dummy_inputs = onnx_config.generate_dummy_inputs(tokenizer, framework='pt')
print(dummy_inputs.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
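The dummy tensors themselves are small; their exact size is chosen by the OnnxConfig defaults (a sketch; in transformers 4.x the fixed fallbacks are batch=2, sequence=8, but treat those numbers as version-dependent):

# Inspect the dummy tensors the tracer will see
for name, tensor in dummy_inputs.items():
    print(name, tuple(tensor.shape), tensor.dtype)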
Exporting the ONNX model
import torch
from itertools import chain
from pathlib import Path
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers.onnx.features import FeaturesManager

# Load the model, config, and tokenizer
model = AutoModel.from_pretrained('bert-base-uncased')
config = AutoConfig.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model.eval()

# Build an ONNX config from the model config; generating dummy_inputs requires the tokenizer
onnx_config = FeaturesManager._SUPPORTED_MODEL_TYPE['bert']['sequence-classification'](config)
dummy_inputs = onnx_config.generate_dummy_inputs(tokenizer, framework='pt')

output_onnx_path = "assets/bert_uncased.onnx"
Path(output_onnx_path).parent.mkdir(parents=True, exist_ok=True)  # make sure the output directory exists

print("onnx input", onnx_config.inputs.items())
print("onnx output", onnx_config.outputs.items())

input_ids = dummy_inputs['input_ids']
attention_mask = dummy_inputs['attention_mask']
token_type_ids = dummy_inputs['token_type_ids']

# Input/output names and dynamic axes all come from onnx_config,
# so they stay consistent with what the runtime will expect
torch.onnx.export(model,
                  (input_ids, attention_mask, token_type_ids),  # or (dummy_inputs,) as a kwargs dict
                  f=output_onnx_path,
                  verbose=True,
                  input_names=list(onnx_config.inputs.keys()),
                  output_names=list(onnx_config.outputs.keys()),
                  dynamic_axes=dict(chain(onnx_config.inputs.items(), onnx_config.outputs.items())),
                  opset_version=onnx_config.default_onnx_opset)
print("Export complete")
onnx input odict_items([('input_ids', {0: 'batch', 1: 'sequence'}), ('attention_mask', {0: 'batch', 1: 'sequence'}), ('token_type_ids', {0: 'batch', 1: 'sequence'})])
onnx output odict_items([('logits', {0: 'batch'})])
Export complete
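Before running inference it is worth validating the exported file with the ONNX checker (a small sketch; assumes the onnx package is installed):

import onnx

onnx_model = onnx.load("assets/bert_uncased.onnx")
onnx.checker.check_model(onnx_model)  # raises if the graph is malformed
print([i.name for i in onnx_model.graph.input])   # ['input_ids', 'attention_mask', 'token_type_ids']
print([o.name for o in onnx_model.graph.output])  # ['logits']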
Loading the ONNX model for testing
import onnxruntime as ort
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Define a sentence
sentence = 'here is some text to encode'

options = ort.SessionOptions()  # initialize session options
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Pass the path of the ONNX model saved in the previous section
session = ort.InferenceSession(
    "assets/bert_uncased.onnx", sess_options=options, providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

# Disable the session.run() fallback mechanism; this prevents a silent reset of the execution provider
session.disable_fallback()

# ONNX Runtime expects numpy arrays, not torch tensors
inputs = tokenizer(sentence, return_tensors='pt')
inputs = {k: v.detach().cpu().numpy() for k, v in inputs.items()}
print(inputs.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

# Run; 'logits' here must match the output_names used at export time
output = session.run(output_names=['logits'], input_feed=inputs)
print(output)
print(output[0].shape)
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[array([[[-0.05490887, 0.10528212, -0.10649522, ..., -0.3550497 ,
0.06862388, 0.650573 ],
[-0.5759427 , -0.36500782, -0.13834022, ..., -0.6781805 ,
0.20923868, -0.16394015],
[-0.16414754, -0.55971897, 0.01500742, ..., -0.16027743,
-0.13455114, 0.62159723],
...,
[ 0.2447815 , 0.125429 , 0.15869957, ..., -0.27489156,
-0.11634777, 0.88089377],
[ 0.0481048 , 0.4950128 , -0.28274378, ..., -0.6097362 ,
-0.12124838, 0.2527281 ],
[ 0.9046008 , 0.21367389, -0.5896968 , ..., 0.30398968,
-0.61721766, -0.19498175]]], dtype=float32)]
(1, 9, 768)
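Note that the ONNX values match the PyTorch last_hidden_state printed earlier: since a bare AutoModel (no classification head) was exported under the sequence-classification ONNX config, the output named 'logits' is really the last hidden state, hence the (1, 9, 768) shape. A final sketch to check numerical parity and the dynamic axes (reuses model, tokenizer, sentence, and session from the blocks above):

import numpy as np
import torch

# 1) Numerical parity between PyTorch and ONNX Runtime on the same sentence
inputs_pt = tokenizer(sentence, return_tensors='pt')
with torch.no_grad():
    ref = model(**inputs_pt).last_hidden_state.numpy()
ort_out = session.run(['logits'], {k: v.numpy() for k, v in inputs_pt.items()})[0]
print("max abs diff:", np.abs(ref - ort_out).max())  # expect ~1e-5 or smaller

# 2) Dynamic axes: a different sequence length works without re-exporting
longer = tokenizer('a noticeably longer sentence to exercise the dynamic sequence axis', return_tensors='np')
print(session.run(['logits'], dict(longer))[0].shape)  # (1, <seq_len>, 768)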