BERT-文本分类demo
title: Bert文本分类
一.模型准备
首先在 Hugging Face 上下载对应的模型,也可以通过安装 transformers 库,将 TensorFlow 版模型转换为 PyTorch 版。
最后得到:config.json、pytorch_model.bin 和 vocab.txt。
-
1.config.json:顾名思义,该文件就是 BERT 模型的配置文件,里面记录了所有用于训练的参数设置。
-
2.pytorch_model.bin:模型文件本身。
-
3.vocab.txt:BERT 分词器认识的词表。当要添加新词时,可以用新词替换其中的 [unused] 占位符,防止新词被拆分。
以下记录关键步骤
二
# coding: UTF-8
import time
import torch
import numpy as np
from train_eval import train, init_network
from importlib import import_module
import argparse
from utils import build_dataset, build_iterator, get_time_dif
# Command-line interface: ``--model`` names the module under ``models``
# that supplies the Config and Model classes.
parser = argparse.ArgumentParser(description='Chinese Text Classification')
parser.add_argument('--model', type=str, required=True, help='choose a model: Bert, ERNIE')
args = parser.parse_args()

if __name__ == '__main__':
    dataset = 'THUCNews'  # dataset directory
    model_name = args.model  # e.g. bert
    x = import_module(f'models.{model_name}')
    # The config carries the settings used below (data paths, device, ...).
    config = x.Config(dataset)

    # Seed every random number generator so each run gives the same result.
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True

    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)
    train_iter, dev_iter, test_iter = [
        build_iterator(split, config)
        for split in (train_data, dev_data, test_data)
    ]
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)
3 Config与model如下
# coding: UTF-8
import torch
import torch.nn as nn
from .modeling import BertModel
from .tokenization import BertTokenizer
class Config(object):
    """Configuration parameters for the BERT classifier.

    Args:
        dataset: Root directory of the dataset. The data files are read
            from ``<dataset>/data`` and the checkpoint path points into
            ``<dataset>/saved_dict``.
    """

    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'  # training set
        self.dev_path = dataset + '/data/dev.txt'  # validation set
        self.test_path = dataset + '/data/test.txt'  # test set
        # Class names, one per line. The ``with`` block closes the file
        # handle, and the encoding is stated explicitly instead of relying
        # on the platform default.
        with open(dataset + '/data/class.txt', encoding='utf-8') as f:
            self.class_list = [line.strip() for line in f]
        # Where the trained model is saved.
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
        # Use the GPU when one is available, otherwise the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Stop training early if there is no improvement for more than 1000 batches.
        self.require_improvement = 1000
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3  # number of epochs
        self.batch_size = 128  # mini-batch size
        # Length every sentence is brought to (short ones padded, long ones cut).
        self.pad_size = 32
        self.learning_rate = 5e-5  # learning rate
        self.bert_path = './bert_pretrain'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768
class Model(nn.Module):
    """BERT encoder followed by a single linear classification layer."""

    def __init__(self, config):
        super().__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        # Mark every BERT parameter as trainable.
        for weight in self.bert.parameters():
            weight.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        """Return the classifier output for one batch.

        ``x[0]`` holds the input sentences and ``x[2]`` the padding mask.
        The mask has the same size as the sentences and marks padded
        positions with 0, e.g. ``[1, 1, 1, 1, 0, 0]``.
        """
        sentences, padding_mask = x[0], x[2]
        # Only the second value returned by BERT feeds the classifier.
        _, pooled = self.bert(
            sentences,
            attention_mask=padding_mask,
            output_all_encoded_layers=False,
        )
        return self.fc(pooled)
四 数据集加载
def build_dataset(config):
    """Load the train, dev and test splits as lists of encoded samples.

    Args:
        config: Object providing ``tokenizer``, ``train_path``,
            ``dev_path``, ``test_path`` and ``pad_size``.

    Returns:
        A ``(train, dev, test)`` tuple. Each element is a list of
        ``(token_ids, label, seq_len, mask)`` tuples.
    """

    def load_dataset(path, pad_size=32):
        """Encode every non-empty ``text<TAB>label`` line of ``path``."""
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                # Split on the last tab only, so text that itself contains
                # a tab no longer breaks the two-value unpacking.
                content, label = lin.rsplit('\t', 1)
                token = [CLS] + config.tokenizer.tokenize(content)
                seq_len = len(token)
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                # Start from an all-ones mask so that it is still valid when
                # padding is disabled (it used to stay empty in that case).
                mask = [1] * len(token_ids)
                if pad_size:
                    if len(token) < pad_size:
                        n_pad = pad_size - len(token)
                        mask += [0] * n_pad
                        token_ids += [0] * n_pad
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents

    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test
class DatasetIterater(object):
    """Iterate over a list of encoded samples in mini-batches.

    Each sample is a ``(token_ids, label, seq_len, mask)`` tuple. Iteration
    yields ``((x, seq_len, mask), y)`` tuples of ``LongTensor`` objects
    moved to ``device``. When the number of samples is not a multiple of
    ``batch_size`` a final, smaller batch is yielded. The position is reset
    when the iterator is exhausted, so the same object can be looped over
    again.

    Args:
        batches: List of samples.
        batch_size: Number of samples per batch.
        device: Device the tensors are moved to.
    """

    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        # True when a partial batch is left after the full ones. The
        # remainder must be taken modulo ``batch_size``; taking it modulo
        # ``n_batches`` could drop the leftover samples and raised
        # ZeroDivisionError when there were fewer samples than one batch.
        self.residue = len(batches) % batch_size != 0
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        """Stack the fields of ``datas`` into tensors on ``self.device``."""
        x = torch.LongTensor([sample[0] for sample in datas]).to(self.device)
        y = torch.LongTensor([sample[1] for sample in datas]).to(self.device)
        # Length before padding (capped at pad_size for longer sentences).
        seq_len = torch.LongTensor([sample[2] for sample in datas]).to(self.device)
        mask = torch.LongTensor([sample[3] for sample in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.index >= len(self):
            # Exhausted: rewind so the next loop starts from the beginning.
            self.index = 0
            raise StopIteration
        start = self.index * self.batch_size
        # Slicing past the end simply yields the shorter final batch.
        chunk = self.batches[start:start + self.batch_size]
        self.index += 1
        return self._to_tensor(chunk)

    def __iter__(self):
        return self

    def __len__(self):
        """Return the number of batches, counting a partial final batch."""
        if self.residue:
            return self.n_batches + 1
        return self.n_batches
def build_iterator(dataset, config):
    """Wrap ``dataset`` in a ``DatasetIterater`` using the batch size and device from ``config``."""
    return DatasetIterater(dataset, config.batch_size, config.device)
def get_time_dif(start_time):
    """Return the time elapsed since ``start_time``, rounded to whole seconds.

    ``start_time`` is a timestamp as returned by ``time.time()``; the result
    is a ``timedelta``.
    """
    elapsed = time.time() - start_time
    whole_seconds = int(round(elapsed))
    return timedelta(seconds=whole_seconds)
1.load_dataset中先构建一个token和其token_ids。
2.`config.tokenizer.convert_tokens_to_ids(token)` 是使用 Hugging Face Transformers 库进行文本处理时,将 token 转换成相应 id 的函数。具体来说,它会从 Tokenizer 对象中获取当前模型所采用的词汇表(vocab),并将输入的 token 转换成对应的整数 id。
例如:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
token = 'apple'
# Convert the token to its id in the tokenizer's vocabulary.
# (Named ``token_id`` so the built-in ``id`` is not shadowed.)
token_id = tokenizer.convert_tokens_to_ids(token)
print(token_id)
在上述代码片段中,我们首先实例化了一个 BertTokenizer 对象,并指定使用预训练的 'bert-base-uncased' 模型。接着,我们传入一个字符串 token 'apple',使用 convert_tokens_to_ids() 函数将其转换成一个整数 id。最后,我们打印输出这个 id。
需要注意的是,不同的模型可能采用不同的词汇表,因此在进行 token 到 id 的转换时,需要使用当前模型所对应的 Tokenizer 对象。另外,[CLS]、[SEP] 等特殊标记同样收录在词汇表中,也会被转换成各自唯一的整数 id,以便在模型中正确处理。
3.接下来根据padding_size 生成对应的mask。
4.接着根据能不能完整划分batch做处理。
5. Train and evaluation
def train(config, model, train_iter, dev_iter, test_iter):
    """Train ``model``, save it whenever the dev loss improves, then run the test pass.

    Args:
        config: Supplies ``learning_rate``, ``num_epochs``, ``save_path``
            and ``require_improvement``.
        model: The network to train.
        train_iter: Iterable of ``(inputs, labels)`` training batches.
        dev_iter: Validation batches, handed to ``evaluate``.
        test_iter: Test batches, handed to ``test`` once training ends.
    """
    start_time = time.time()
    model.train()

    # Split the parameters into two groups: names containing one of the
    # ``no_decay`` markers get no weight decay, all others get 0.01.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            undecayed.append(param)
        else:
            decayed.append(param)
    optimizer = BertAdam(
        [
            {'params': decayed, 'weight_decay': 0.01},
            {'params': undecayed, 'weight_decay': 0.0},
        ],
        lr=config.learning_rate,
        warmup=0.05,
        t_total=len(train_iter) * config.num_epochs,
    )

    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last dev-loss improvement
    stop_early = False  # set once there has been no improvement for too long
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for trains, labels in train_iter:
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            if total_batch % 100 == 0:
                # Every 100 batches, report results on the current training
                # batch and on the validation set.
                gold = labels.data.cpu()
                predicted = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(gold, predicted)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                improved = dev_loss < dev_best_loss
                if improved:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    last_improve = total_batch
                marker = '*' if improved else ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, marker))
                # Switch back to training mode after the evaluation pass.
                model.train()

            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # The dev loss has not dropped for more than
                # ``require_improvement`` batches: stop training.
                print("No optimization for a long time, auto-stopping...")
                stop_early = True
                break
        if stop_early:
            break
    test(config, model, test_iter)
- model.train() 开启训练模式,与 dropout、batch normalization 等在训练和推理阶段行为不同的层有关(LayerNorm 在两种模式下行为一致);相对应的还有 model.eval()。
- model.named_parameters() 对应参数的名称和值。(weight bias)
- 其中,optimizer_grouped_parameters 是一个包含两个字典的列表,每个字典对应一组参数。在每个字典中,params 表示该参数组的张量列表,weight_decay 表示该组参数使用的权重衰减系数。在这里,我们为需要进行权重衰减的参数组设置一个较小的权重衰减系数 0.01,而不需要进行权重衰减的参数组则设置为 0.0,以便更好地调节模型。
4.列表推导式重写:
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# Loop equivalent of
#   [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)]
# It collects the parameters whose names contain none of the ``no_decay``
# markers, i.e. the ones that DO receive weight decay, so the list is named
# ``params_decay`` (it was previously misnamed ``params_no_decay``).
params_decay = []
for name, param in param_optimizer:
    need_decay = True
    for nd in no_decay:
        if nd in name:
            need_decay = False
            break
    if need_decay:
        params_decay.append(param)
metrics.accuracy_score:sklearn 中的方法,用于计算准确率(accuracy)。
补充 forward()
在使用 PyTorch 训练模型时,不需要显式调用 forward 函数,只需要向实例化后的模型对象传入对应的参数,就会自动调用 forward 函数。
class Module(nn.Module):
    """Skeleton module illustrating how ``forward`` gets invoked."""

    def __init__(self):
        super().__init__()
        # ...

    def forward(self, x):
        # ...
        return x


# Input data. ``...`` is a placeholder; the original ``......`` is not
# valid Python.
data = ...
# Instantiate an object.
model = Module()
# Forward pass: calling the instance runs ``forward``.
model(data)
# Rather than calling it explicitly:
# model.forward(data)
链接:
https://github.com/649453932/Bert-Chinese-Text-Classification-Pytorch
https://github.com/rsanshierli/EasyBert 包含各种任务各种bert
Bert-CNN Bert-RNN
class Model(nn.Module):
    """BERT encoder feeding parallel convolutions, max-pooling and a linear classifier."""

    def __init__(self, config):
        super().__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        # Mark every BERT parameter as trainable.
        for weight in self.bert.parameters():
            weight.requires_grad = True
        # One convolution per entry of ``filter_sizes``; every kernel is as
        # wide as the hidden size.
        conv_layers = [
            nn.Conv2d(1, config.num_filters, (height, config.hidden_size))
            for height in config.filter_sizes
        ]
        self.convs = nn.ModuleList(conv_layers)
        self.dropout = nn.Dropout(config.dropout)
        n_features = config.num_filters * len(config.filter_sizes)
        self.fc_cnn = nn.Linear(n_features, config.num_classes)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` and ReLU to ``x``, then max-pool over the full remaining length."""
        activated = F.relu(conv(x)).squeeze(3)
        pooled = F.max_pool1d(activated, activated.size(2))
        return pooled.squeeze(2)

    def forward(self, x):
        """Return the classifier output for one batch.

        ``x[0]`` holds the input sentences and ``x[2]`` the padding mask.
        The mask has the same size as the sentences and marks padded
        positions with 0, e.g. ``[1, 1, 1, 1, 0, 0]``.
        """
        sentences, padding_mask = x[0], x[2]
        encoder_out, _ = self.bert(
            sentences,
            attention_mask=padding_mask,
            output_all_encoded_layers=False,
        )
        # Add a dimension at index 1 so the output matches the
        # single-input-channel convolutions.
        features = encoder_out.unsqueeze(1)
        pooled = [self.conv_and_pool(features, conv) for conv in self.convs]
        out = torch.cat(pooled, 1)
        out = self.dropout(out)
        return self.fc_cnn(out)
RNN
class Model(nn.Module):
    """BERT encoder feeding a bidirectional LSTM and a linear classifier."""

    def __init__(self, config):
        super().__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        # Mark every BERT parameter as trainable.
        for weight in self.bert.parameters():
            weight.requires_grad = True
        self.lstm = nn.LSTM(
            config.hidden_size,
            config.rnn_hidden,
            config.num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=config.dropout,
        )
        self.dropout = nn.Dropout(config.dropout)
        # The LSTM is bidirectional, so its output is twice ``rnn_hidden`` wide.
        self.fc_rnn = nn.Linear(config.rnn_hidden * 2, config.num_classes)

    def forward(self, x):
        """Return the classifier output for one batch.

        ``x[0]`` holds the input sentences and ``x[2]`` the padding mask.
        The mask has the same size as the sentences and marks padded
        positions with 0, e.g. ``[1, 1, 1, 1, 0, 0]``.
        """
        sentences, padding_mask = x[0], x[2]
        encoder_out, _ = self.bert(
            sentences,
            attention_mask=padding_mask,
            output_all_encoded_layers=False,
        )
        lstm_out, _ = self.lstm(encoder_out)
        lstm_out = self.dropout(lstm_out)
        # Classify from the hidden state at the last time step of the sentence.
        last_step = lstm_out[:, -1, :]
        return self.fc_rnn(last_step)