Fine-tuning a classification model with the Hugging Face transformers library
transformers API reference: https://huggingface.co/docs/transformers/v4.21.2/en/training
train.py
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import os
import json
#from datasets import load_metric

os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,5,6,7"

# Load the datasets (training and test data)
dataset = load_dataset("csv", data_files={"train": "./weibo_train.csv", "test": "./weibo_test.csv"}, cache_dir="./cache")
dataset = dataset.class_encode_column("label")  # encode the label column; this collects the label set from the training data

# From the loaded dataset, assign an id to each label and build a label map,
# used for training as well as later inference and accuracy computation
def generate_label_map(dataset):
    labels = dataset['train'].features['label'].names
    label2id = dict()
    for idx, label in enumerate(labels):
        label2id[label] = idx
    return label2id

def save_label_map(dataset, label_map_file):
    # only take the labels of the training data for the label set of the model.
    label2id = generate_label_map(dataset)
    with open(label_map_file, 'w', encoding='utf-8') as fout:
        json.dump(label2id, fout)

# Save the label map
label_map_file = 'label2id.json'
save_label_map(dataset, label_map_file)

# Read the label map back [note: in multi-GPU training, reading it back from the file like this may cause errors]
#label2id = {}
#with open(label_map_file, 'r', encoding='utf-8') as fin:
#    label2id = json.load(fin)
label2id = generate_label_map(dataset)
if not label2id:
    exit()
id2label = {id: label for label, id in label2id.items()}

# Load the tokenizer (downloaded automatically if not already cached)
tokenizer = AutoTokenizer.from_pretrained("./bert-base-chinese")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=45)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
#small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

config = AutoConfig.from_pretrained("./bert-base-chinese", num_labels=len(label2id), id2label=id2label, label2id=label2id)

# Load the pretrained BERT model
model = AutoModelForSequenceClassification.from_pretrained("./bert-base-chinese", config=config)

# Accuracy on the eval set, computed at the end of each epoch (referenced by the Trainer below)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=180,
    per_device_eval_batch_size=128,
    num_train_epochs=20,
    weight_decay=0.01,
    #fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    #train_dataset=small_train_dataset,
    #eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("training sample: ", trainer.train_dataset[1])

trainer.train()
print("finished")
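Data format note: weibo_train.csv and weibo_test.csv must each contain a "text" column and a "label" column, since preprocess_function reads examples["text"] and class_encode_column encodes "label". Below is a minimal sketch of how such a file could be produced; the example rows and label names are made up for illustration and are not the actual Weibo data.

import csv

# Hypothetical example rows; replace with your own data.
rows = [
    {"text": "今天天气真好,心情舒畅!", "label": "positive"},
    {"text": "排了两个小时的队还是没买到票。", "label": "negative"},
]

# Both weibo_train.csv and weibo_test.csv follow this layout:
# a "text,label" header line, then one example per row.
with open("weibo_train.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["text", "label"])
    writer.writeheader()
    writer.writerows(rows)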
2. Inference
Reference: https://huggingface.co/docs/transformers/v4.21.2/en/pipeline_tutorial
infer.py
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset
from sklearn.metrics import accuracy_score

# Load the test data
#dataset = load_dataset("csv", data_files={"train": "", "test": "./weibo_test.csv"}, split='test')
dataset = load_dataset("csv", data_files={"train": "weibo_train.csv", "test": "weibo_test.csv"}, cache_dir="./cache")

# Load the fine-tuned model from a Trainer checkpoint
model_dir = './results/checkpoint-1200'
print('using checkpoint from dir:', model_dir)
pipe = pipeline(task="text-classification", device=0, model=model_dir)

# Run predictions over the test set
preds = []
for out in pipe(KeyDataset(dataset['test'], "text"), batch_size=128, truncation="only_first"):
    print(out)
    #print(out['label'])
    preds.append(out['label'])

'''
with open('pred.txt', 'w', encoding='utf8') as fout:
    for label in preds:
        fout.write(label)
        fout.write('\n')
'''

# Compute accuracy
# y_true holds the raw label values from the CSV; this comparison works as long as the
# label column is string-valued with the same names stored in the model's id2label.
y_true = dataset['test']['label']
preds_labels = preds
acc = accuracy_score(y_true, preds_labels)
print('Acc on test data: {:.4f}'.format(acc))
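If you prefer not to go through the pipeline API, the fine-tuned checkpoint can also be loaded directly with AutoModelForSequenceClassification, since id2label/label2id were written into its config during training. A minimal sketch follows; the checkpoint path mirrors infer.py and the example sentence is made up.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same checkpoint directory as in infer.py; adjust to whichever checkpoint the Trainer saved.
model_dir = "./results/checkpoint-1200"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()

text = "今天心情不错"  # hypothetical example sentence
inputs = tokenizer(text, truncation=True, max_length=45, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
pred_id = logits.argmax(dim=-1).item()
print(model.config.id2label[pred_id])  # id2label was stored in the config during training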