Text Classification
A BERT-Based Text Classification Task
1. Problem Analysis
Text classification is the classic entry-level NLP task, familiar to most practitioners because of its broad range of applications. Taking this year's competition as a running example, this article walks through the implementation of a text classification task step by step. The competition is described below:
Since the competition is a text classification task, the first step is to pin down its inputs and outputs. From the data description, each input consists of three text fields of a patent, namely its title, assignee, and abstract, and the corresponding label is the patent's classification category. We therefore treat this as a standard text classification problem and solve it with BERT.
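To make the input and output concrete, a single training record looks roughly like the JSON line below. The field names title, assignee, abstract and label_id match the loading code in Section 2; the values are invented purely for illustration:

{"title": "一种锂电池电极材料的制备方法", "assignee": "某科技有限公司", "abstract": "本发明公开了一种电极材料及其制备方法……", "label_id": "3"}

Each line of the training file is one such JSON object; the three text fields are later concatenated into a single input string, and label_id is the integer class to predict.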
2. Code Implementation
# Install the required libraries
!pip install transformers
# Data loading
import torch
from torch.utils.data import Dataset


class CLSDataset(Dataset):
    def __init__(self, data_list, label_list, tokenizer, max_len):
        """
        :param data_list: list of input texts
        :param label_list: list of integer class labels, aligned with data_list
        :param tokenizer: a HuggingFace tokenizer
        :param max_len: maximum sequence length; longer inputs are truncated
        """
        self.data_list = data_list
        self.label_list = label_list
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, item):
        data_item = self.data_list[item]
        label_item = self.label_list[item]
        inputs = self.tokenizer.encode_plus(
            data_item,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",  # pad_to_max_length=True is deprecated
            truncation=True,       # abstracts can exceed max_len
            return_token_type_ids=True,
        )
        return {
            "input_ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "labels": torch.tensor(label_item),
        }

    def __len__(self):
        return len(self.data_list)
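As a quick sanity check of the dataset class, it can be instantiated with a single toy example (the checkpoint name and sample text below are placeholders; any Chinese BERT tokenizer would do):

from transformers import BertTokenizer

# Hypothetical smoke test for CLSDataset
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
dataset = CLSDataset(["一种锂电池电极材料的制备方法"], [0], tokenizer, max_len=32)
sample = dataset[0]
print(sample["input_ids"].shape)  # torch.Size([32]) -- padded to max_len
print(sample["labels"])           # tensor(0)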
# Model training
from tqdm import tqdm


def cls_train(train_loader, model, loss_fn, optimizer, device):
    """
    :param train_loader: DataLoader over the training set
    :param model: a BertForSequenceClassification instance
    :param loss_fn: classification criterion, e.g. nn.CrossEntropyLoss()
    :param optimizer: optimizer over model.parameters()
    :param device: "cuda" or "cpu"
    """
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        targets = batch["labels"].to(device)
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
        # The model is called without labels=..., so it does not compute a
        # loss itself; apply the criterion to the logits explicitly.
        batch_loss = loss_fn(outputs.logits, targets)
        batch_loss.backward()
        optimizer.step()
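BERT fine-tuning is commonly paired with a learning-rate schedule on top of AdamW. A minimal sketch using transformers' built-in linear warmup helper (the 10% warmup ratio is an assumption, and train_loader / EPOCHS are defined in the main script further below):

from transformers import get_linear_schedule_with_warmup

# Warm up over the first 10% of updates, then decay linearly to zero.
num_training_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps,
)
# Inside cls_train, call scheduler.step() right after optimizer.step().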
# Model evaluation
from sklearn import metrics


def cls_test_macro(test_dataloader, model, device):
    """
    :param test_dataloader: DataLoader over the validation set
    :param model: the fine-tuned model
    :param device: "cuda" or "cpu"
    :return: macro-averaged F1 score
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for idx, batch in enumerate(test_dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
            fin_outputs.extend(outputs.logits.cpu().numpy().argmax(1).tolist())
            fin_targets.extend(labels.cpu().numpy().tolist())
    f1_macro = metrics.f1_score(fin_targets, fin_outputs, average="macro")
    return f1_macro
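Once the best checkpoint has been saved, it can be reloaded to predict labels for new, unlabeled texts. A minimal sketch (cls_predict is a helper name introduced here for illustration; it is not part of the original code):

def cls_predict(texts, model, tokenizer, device, max_len=512):
    """Predict a class id for each raw text string."""
    model.eval()
    preds = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, max_length=max_len, truncation=True,
                               padding="max_length", return_tensors="pt").to(device)
            logits = model(**inputs).logits
            preds.append(int(logits.argmax(dim=-1)))
    return preds

# Usage: restore the best checkpoint first
# model.load_state_dict(torch.load(SAVE_PATH))
# print(cls_predict(["……一段专利文本……"], model, tokenizer, device))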
# Main script: data preparation, training loop, and checkpointing
import json
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, set_seed, BertForSequenceClassification
from torch.optim import AdamW  # transformers.AdamW is deprecated
from torch.utils.data import DataLoader
import torch.nn as nn

# Fix the random seed for reproducibility
set_seed(42)

# Data and model paths
train_path = "./data/train_zl.json"
model_path = "hfl/chinese-roberta-wwm-ext-large"
# Where the best checkpoint is saved
SAVE_PATH = "./save_model/best.pth"

# Build the tokenizer
tokenizer = BertTokenizer.from_pretrained(model_path)

# Hyperparameters
MAX_LEN = 512
train_batch_size = 4
dev_batch_size = 4
LEARNING_RATE = 2e-5
label_num = 36
device = "cuda"
EPOCHS = 20

# Build the dataset: concatenate title, assignee and abstract into one text
data_list = []
label_list = []
with open(train_path, encoding="utf-8") as file_read:
    for line in file_read.readlines():
        line_dict = json.loads(line)
        data_list.append(
            f"这份专利的标题为:《{line_dict['title']}》,由“{line_dict['assignee']}”公司申请,详细说明如下:{line_dict['abstract']}")
        label_list.append(int(line_dict['label_id']))

# Hold out part of the data for validation
X_train, X_test, y_train, y_test = train_test_split(data_list, label_list)
train_list, train_label_ = X_train, y_train
dev_text, dev_label = X_test, y_test

model = BertForSequenceClassification.from_pretrained(model_path, num_labels=label_num)
model.to(device)

train_dataset = CLSDataset(train_list, train_label_, tokenizer, MAX_LEN)
dev_dataset = CLSDataset(dev_text, dev_label, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_loader = DataLoader(dev_dataset, batch_size=dev_batch_size, shuffle=False)

# Loss and optimizer
loss = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Train, evaluate after every epoch, and keep the checkpoint with the best macro F1
MAX_F1 = 0
for epoch in range(EPOCHS):
    cls_train(train_loader, model, loss, optimizer, device)
    f1 = cls_test_macro(test_loader, model, device)
    if f1 > MAX_F1:
        MAX_F1 = f1
        print(MAX_F1)
        torch.save(model.state_dict(), SAVE_PATH)
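The script above evaluates on a single random split, which gives a somewhat noisy estimate of macro F1. A common alternative is 5-fold cross-validation; the sketch below reuses the components and hyperparameters defined above (the fold aggregation at the end is an assumption):

from sklearn.model_selection import KFold
import numpy as np

# 5-fold cross-validation: each fold trains a fresh model and the
# per-fold macro F1 scores are averaged at the end.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
data_arr = np.array(data_list, dtype=object)
label_arr = np.array(label_list)
fold_scores = []
for fold, (train_idx, dev_idx) in enumerate(kf.split(data_arr)):
    model = BertForSequenceClassification.from_pretrained(
        model_path, num_labels=label_num).to(device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    train_ds = CLSDataset(data_arr[train_idx].tolist(), label_arr[train_idx].tolist(), tokenizer, MAX_LEN)
    dev_ds = CLSDataset(data_arr[dev_idx].tolist(), label_arr[dev_idx].tolist(), tokenizer, MAX_LEN)
    train_dl = DataLoader(train_ds, batch_size=train_batch_size, shuffle=True)
    dev_dl = DataLoader(dev_ds, batch_size=dev_batch_size, shuffle=False)
    for epoch in range(EPOCHS):
        cls_train(train_dl, model, loss, optimizer, device)
    fold_scores.append(cls_test_macro(dev_dl, model, device))
    print(f"fold {fold}: macro-F1 = {fold_scores[-1]:.4f}")
print(f"mean macro-F1 = {sum(fold_scores) / len(fold_scores):.4f}")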