BERT-文本分类demo

title: Bert文本分类

一.模型准备

首先在 Hugging Face 上下载对应的模型,也可以通过安装 transformers 库,将 TensorFlow 版模型转换为 PyTorch 版。

最后得到:config.json、pytorch_model.bin 和 vocab.txt。

  • 1.config.json:顾名思义,该文件就是 BERT 模型的配置文件,里面记录了所有用于训练的参数设置。

  • 2.pytorch_model.bin:模型文件本身(即模型权重)。

  • 3.vocab.txt:BERT 分词器的词表,即分词器认识的词。当要添加新词时,可以用新词替换词表中的 [unused] 占位符,防止该词被拆分成子词。

以下记录关键步骤

# coding: UTF-8
import time
import torch
import numpy as np
from train_eval import train, init_network
from importlib import import_module
import argparse
from utils import build_dataset, build_iterator, get_time_dif

# Command-line interface: the only option is which model module to load.
parser = argparse.ArgumentParser(description='Chinese Text Classification')
parser.add_argument('--model', type=str, required=True, help='choose a model: Bert, ERNIE')
args = parser.parse_args()


if __name__ == '__main__':
    dataset = 'THUCNews'  # dataset directory

    # Load models/<name>.py dynamically; it must expose Config and Model.
    model_name = args.model  # e.g. bert
    model_module = import_module(f'models.{model_name}')

    # Config (shown below) sets up the data paths and hyper-parameters.
    config = model_module.Config(dataset)

    # Fix every random seed so that repeated runs give the same results.
    seed = 1
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

    load_started = time.time()
    print("Loading data...")

    train_data, dev_data, test_data = build_dataset(config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    print("Time usage:", get_time_dif(load_started))

    # Build the model on the configured device and train it.
    model = model_module.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)

3 Config与model如下

# coding: UTF-8
import torch
import torch.nn as nn
from .modeling import BertModel
from .tokenization import BertTokenizer
class Config(object):
    """Configuration for the BERT classifier: data paths and hyper-parameters."""

    def __init__(self, dataset):
        """Build the configuration for one dataset.

        Args:
            dataset: Path prefix of the dataset directory; the ``/data/...``
                and ``/saved_dict/...`` sub-paths are appended to it.

        Raises:
            OSError: If ``<dataset>/data/class.txt`` cannot be opened.
        """
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'  # training set
        self.dev_path = dataset + '/data/dev.txt'  # validation set
        self.test_path = dataset + '/data/test.txt'  # test set
        # Class names, one per line. The file is closed as soon as it has been
        # read, and decoded as UTF-8 like the train/dev/test files.
        with open(dataset + '/data/class.txt', encoding='UTF-8') as f:
            self.class_list = [x.strip() for x in f]
        # Where the trained weights are saved.
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
        # Use the GPU when one is available, otherwise the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Stop early when there is no improvement for more than 1000 batches.
        self.require_improvement = 1000
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3  # number of epochs
        self.batch_size = 128  # mini-batch size
        # Length every sentence is brought to (shorter padded, longer cut).
        self.pad_size = 32
        self.learning_rate = 5e-5  # learning rate
        self.bert_path = './bert_pretrain'  # directory of the pretrained model
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        # Input width of the classification layer; presumably the hidden size
        # of the pretrained BERT checkpoint - confirm against config.json.
        self.hidden_size = 768


class Model(nn.Module):
    """BERT encoder followed by one linear classification layer."""

    def __init__(self, config):
        """Load the pretrained encoder from ``config.bert_path`` and add the classifier head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        # Every encoder weight stays trainable (full fine-tuning).
        for weight in self.bert.parameters():
            weight.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        """Return the class scores for a batch.

        ``x`` is indexed as ``x[0]`` (input sentences as token ids) and
        ``x[2]`` (padding mask, same size as the sentence, 0 on padding,
        e.g. ``[1, 1, 1, 1, 0, 0]``).
        """
        token_ids = x[0]
        padding_mask = x[2]
        # The second output is fed to the classifier - presumably the pooled
        # [CLS] representation; confirm against the BertModel implementation.
        _, pooled_output = self.bert(
            token_ids,
            attention_mask=padding_mask,
            output_all_encoded_layers=False,
        )
        return self.fc(pooled_output)

四 数据集加载

def build_dataset(config):
    """Load the train, dev and test files into lists of encoded samples.

    Args:
        config: Object providing ``tokenizer``, ``pad_size`` and the
            ``train_path`` / ``dev_path`` / ``test_path`` file locations.

    Returns:
        A ``(train, dev, test)`` tuple. Each element is a list of
        ``(token_ids, label, seq_len, mask)`` tuples.
    """

    def load_dataset(path, pad_size=32):
        """Encode every non-empty ``content<TAB>label`` line of ``path``."""
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                # Prepend the classification token before converting to ids.
                token = [CLS] + config.tokenizer.tokenize(content)
                seq_len = len(token)
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                # Start from an all-ones mask so it matches token_ids even
                # when pad_size is 0/None (it used to stay empty then).
                mask = [1] * len(token_ids)

                if pad_size:
                    if len(token) < pad_size:
                        # Pad ids and mask with zeros up to pad_size
                        # (assumes id 0 is the padding token - confirm in vocab).
                        pad_len = pad_size - len(token)
                        mask = mask + [0] * pad_len
                        token_ids += [0] * pad_len
                    else:
                        # Truncate to pad_size; every kept position is real.
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents

    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test


class DatasetIterater(object):
    """Iterate over a list of samples in mini-batches of tensors.

    Each sample is indexed as ``(token_ids, label, seq_len, mask)``. Every
    batch is returned as ``((x, seq_len, mask), y)`` with all four parts
    converted to ``LongTensor`` on ``device``. A trailing batch smaller than
    ``batch_size`` is yielded as well, so no sample is dropped. The iterator
    resets itself when exhausted and can be looped over again.
    """

    def __init__(self, batches, batch_size, device):
        """Store the samples and work out how many batches they form.

        Args:
            batches: List of samples.
            batch_size: Number of samples per full batch.
            device: Device the tensors are moved to.
        """
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size  # number of full batches
        # True when the samples do not split evenly into full batches. The
        # remainder must be taken against batch_size: taking it against
        # n_batches mis-detects the leftover batch and divides by zero when
        # there are fewer samples than batch_size.
        self.residue = len(batches) % batch_size != 0
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        """Convert a list of samples into ``((x, seq_len, mask), y)`` tensors."""
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

        # Length before padding (capped at pad_size for longer sentences).
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        """Return the next batch, or reset and raise ``StopIteration`` at the end."""
        if self.residue and self.index == self.n_batches:
            # Final, smaller batch holding the leftover samples.
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            # Exhausted: rewind so the iterator can be used for the next epoch.
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        """Return the number of batches, counting the trailing partial one."""
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    """Wrap ``dataset`` in a DatasetIterater using ``config.batch_size`` and ``config.device``."""
    return DatasetIterater(dataset, config.batch_size, config.device)


def get_time_dif(start_time):
    """Return the time elapsed since ``start_time``, rounded to whole seconds, as a timedelta."""
    elapsed_seconds = time.time() - start_time
    whole_seconds = int(round(elapsed_seconds))
    return timedelta(seconds=whole_seconds)

1.load_dataset中先构建一个token和其token_ids。

2.`config.tokenizer.convert_tokens_to_ids(token)` 是使用Hugging Face Transformers库进行文本处理时,将token转换成相应id的函数。具体来说,它会从Tokenizer对象中获取当前模型所采用的词汇表(vocab),并将输入的token转换成对应的整数id。

例如:

from transformers import BertTokenizer

# Load the tokenizer (and its vocabulary) of the pretrained model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
token = 'apple'

# Convert the token to its integer id in the vocabulary.
token_id = tokenizer.convert_tokens_to_ids(token)
print(token_id)

在上述代码片段中,我们首先实例化了一个BertTokenizer对象,并指定了使用预训练的'bert-base-uncased'模型。接着,我们传入一个字符串token 'apple',使用convert_tokens_to_ids()函数将其转换成一个整数id。最后,我们打印输出这个id。

需要注意的是,不同的模型可能采用不同的词汇表,因此在进行token到id的转换时,需要使用当前模型所采用的Tokenizer对象。另外,一些特殊的标记(如[CLS]、[SEP]等)虽然不对应原始文本中的词,但同样收录在词汇表中,因此也会被转换成唯一的整数id,以便在模型中正确处理。

3.接下来根据pad_size生成对应的mask:真实token的位置为1,padding的位置为0。

4.接着根据能不能完整划分batch做处理。

5. Train and evaluation

def train(config, model, train_iter, dev_iter, test_iter):
    """Fine-tune ``model`` on ``train_iter`` with periodic validation and early stopping.

    Every 100 batches the accuracy on the current training batch is computed
    and the model is evaluated on ``dev_iter``; whenever the validation loss
    improves, the weights are saved to ``config.save_path``. Training stops
    early once more than ``config.require_improvement`` batches have passed
    without an improvement. ``test`` is called on ``test_iter`` at the end.

    Args:
        config: Provides ``learning_rate``, ``num_epochs``, ``save_path`` and
            ``require_improvement``.
        model: The network to train.
        train_iter: Iterable yielding ``(trains, labels)`` batches; must support ``len``.
        dev_iter: Validation batches, passed to ``evaluate``.
        test_iter: Test batches, passed to ``test``.
    """
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these keys get no weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    # t_total is the total number of optimizer steps over all epochs; warmup
    # is presumably the fraction of those steps used for learning-rate
    # warm-up - confirm against the BertAdam implementation.
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last validation-loss improvement
    flag = False  # set when there has been no improvement for a long time
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            # Gradients are cleared after the forward pass, before backward.
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # Every 100 batches, report results on the training batch and the validation set.
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    # New best validation loss: checkpoint the weights.
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                # Back to training mode; evaluate presumably switches the model to eval mode - confirm.
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # Validation loss has not dropped for more than require_improvement batches: stop training.
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)

  1. model.train() 开启训练模式,会影响 Dropout、BatchNorm 等层的行为(LayerNorm 在训练和推理时行为一致);与之对应的是 model.eval()。
  2. model.named_parameters() 对应参数的名称和值。(weight bias)
  3. 其中,optimizer_grouped_parameters是一个包含两个字典的列表,每个字典用于对应一组参数。在每个字典中,params表示对应参数组的张量列表,weight_decay表示该组参数需要进行的权重衰减系数。在这里,我们将需要进行权重衰减的参数组设置一个较小的权重衰减系数0.01,而不需要进行权重衰减的参数组则设置为0.0,以便更好地调节模型。

4.列表推导式重写:

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# Loop form of the first comprehension over param_optimizer: collect the
# parameters whose name contains none of the no_decay keys, i.e. the
# parameters that DO receive weight decay.
params_with_decay = []
for name, param in param_optimizer:
    need_decay = True
    for nd in no_decay:
        if nd in name:
            # Name matches a no-decay key: skip this parameter.
            need_decay = False
            break
    if need_decay:
        params_with_decay.append(param)
  1. metrics.accuracy_score sklearn中方法,计算acc

补充forward()

我们在使用PyTorch训练模型时,不需要显式调用forward这个函数,只需像调用函数一样向实例化后的模型对象传入对应的参数,就会自动调用 forward 函数。

class Module(nn.Module):
    """Minimal module showing how ``forward`` gets invoked."""

    def __init__(self):
        super().__init__()
        # ... define the layers here

    def forward(self, x):
        # ... compute the output from x here
        return x


data = ...  # placeholder for the input data
# Instantiate the model
model = Module()

# Forward pass: calling the instance invokes forward() for us
model(data)
# rather than calling forward directly:
# model.forward(data)

链接:

https://github.com/649453932/Bert-Chinese-Text-Classification-Pytorch

https://github.com/rsanshierli/EasyBert 包含各种任务各种bert

Bert-CNN Bert-RNN

class Model(nn.Module):
    """BERT encoder followed by parallel convolutions, max-pooling and a linear classifier."""

    def __init__(self, config):
        """Load the pretrained encoder and build one convolution per filter size."""
        super().__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        # Every encoder weight stays trainable (full fine-tuning).
        for weight in self.bert.parameters():
            weight.requires_grad = True
        # Each kernel spans k positions and the full hidden width.
        conv_layers = [
            nn.Conv2d(1, config.num_filters, (k, config.hidden_size))
            for k in config.filter_sizes
        ]
        self.convs = nn.ModuleList(conv_layers)
        self.dropout = nn.Dropout(config.dropout)

        pooled_width = config.num_filters * len(config.filter_sizes)
        self.fc_cnn = nn.Linear(pooled_width, config.num_classes)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU, drop dim 3, then max-pool over the whole of dim 2."""
        activated = F.relu(conv(x))
        feature_map = activated.squeeze(3)
        pooled = F.max_pool1d(feature_map, feature_map.size(2))
        return pooled.squeeze(2)

    def forward(self, x):
        """Return the class scores for a batch.

        ``x`` is indexed as ``x[0]`` (input sentences as token ids) and
        ``x[2]`` (padding mask, same size as the sentence, 0 on padding,
        e.g. ``[1, 1, 1, 1, 0, 0]``).
        """
        context, mask = x[0], x[2]
        encoder_out, text_cls = self.bert(
            context,
            attention_mask=mask,
            output_all_encoded_layers=False,
        )
        # Add a channel dimension so the 2-D convolutions can be applied.
        conv_input = encoder_out.unsqueeze(1)
        pooled_features = [self.conv_and_pool(conv_input, conv) for conv in self.convs]
        features = torch.cat(pooled_features, 1)
        features = self.dropout(features)
        return self.fc_cnn(features)

RNN

class Model(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and a linear classifier."""

    def __init__(self, config):
        """Load the pretrained encoder and build the LSTM, dropout and classifier head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        # Every encoder weight stays trainable (full fine-tuning).
        for weight in self.bert.parameters():
            weight.requires_grad = True
        self.lstm = nn.LSTM(
            config.hidden_size,
            config.rnn_hidden,
            config.num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=config.dropout,
        )
        self.dropout = nn.Dropout(config.dropout)
        # Bidirectional LSTM: forward and backward states are concatenated.
        self.fc_rnn = nn.Linear(config.rnn_hidden * 2, config.num_classes)

    def forward(self, x):
        """Return the class scores for a batch.

        ``x`` is indexed as ``x[0]`` (input sentences as token ids) and
        ``x[2]`` (padding mask, same size as the sentence, 0 on padding,
        e.g. ``[1, 1, 1, 1, 0, 0]``).
        """
        context, mask = x[0], x[2]
        encoder_out, text_cls = self.bert(
            context,
            attention_mask=mask,
            output_all_encoded_layers=False,
        )
        lstm_out, _ = self.lstm(encoder_out)
        dropped = self.dropout(lstm_out)
        # Classify from the hidden state at the last time step of the sentence.
        return self.fc_rnn(dropped[:, -1, :])

posted @ 2023-04-06 18:50  ZZX11  阅读(219)  评论(0编辑  收藏  举报