NLP Text Classification Study Notes 6: Text Classification Based on the Transformer
transformer
References:
https://www.cnblogs.com/erable/p/15072941.html
https://www.cnblogs.com/xiximayou/p/13978859.html
The Transformer model was proposed in the paper Attention Is All You Need; its overall architecture is shown in the figure below.
Let's go through it step by step, from the overall structure down to the details. The Transformer consists of two parts, an encoder and a decoder.
encoder
The encoder consists of multiple blocks stacked together.
Before the input enters the blocks, positional encoding is applied; it supplies each token's position in the sequence (the attention mechanism itself carries no information about the order of the input, so position information has to be added at the input).
There are many ways to add position information. The paper uses the following scheme: for each position pos of the input sequence, a sine value is added at the even dimensions and a cosine value at the odd dimensions, where d_model is the word-vector dimension:
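$$PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$
Here pos is the position in the sequence and i indexes the dimension pairs, so each dimension uses a sinusoid of a different wavelength.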
Each block uses the structure shown below, which can be divided into two parts.
multi-head attention
In the first part, the input is processed by multi-head attention and the result is added back to the original input; this is called a residual connection. The sum then goes through Layer Norm, which normalizes each vector by subtracting the mean of its elements and dividing by their standard deviation.
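In symbols (a simplified form that omits LayerNorm's learnable scale and shift parameters):
$$\text{out} = \mathrm{LayerNorm}\big(x + \mathrm{MultiHead}(x)\big), \qquad \mathrm{LayerNorm}(x)_j = \frac{x_j - \mu}{\sigma}$$
where $\mu$ and $\sigma$ are the mean and standard deviation of the elements of $x$.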
Multi-head attention simply runs several groups of the self-attention mechanism (see NLP Text Classification Study Notes 5) and concatenates the results of all the groups.
Each attention group works as follows (a minimal sketch follows this list):
- Multiply Q by the transpose of K.
- Apply a scaling step: divide by $\sqrt{d_k}$, where $d_k$ is the per-head dimension of Q, K and V, equal to $d_{model}$ (the word-vector dimension) divided by the number of attention heads.
- Mask: to give every sample the same length, the text is padded (and the decoder additionally masks future positions); these padded positions would hurt the model's accuracy, so their attention scores are set to a very large negative number.
- After a softmax normalization, multiply by V to obtain the output of this group.
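A minimal sketch of one attention group following these steps (the function name, the tensor shapes and the -1e9 fill value are illustrative assumptions, not the exact code used further below):

import torch
import torch.nn.functional as F

def scaled_dot_product_attention(Q, K, V, mask=None):
    # Q, K, V: (batch, seq_len, d_k); mask: (batch, seq_len, seq_len), 0 marks masked positions
    d_k = Q.size(-1)
    scores = Q @ K.transpose(-2, -1) / d_k ** 0.5     # multiply Q by K^T, then scale by sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)  # padded / future positions get a large negative score
    weights = F.softmax(scores, dim=-1)               # softmax normalization
    return weights @ V                                # weighted sum of V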
Feed-Forward Networks
The second part is a feed-forward (fully connected) layer, followed again by a residual connection and normalization.
For the feed-forward layer, the paper's formula is given below: a linear layer, a ReLU activation, and another linear layer:
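$$\mathrm{FFN}(x) = \max(0,\ xW_1 + b_1)\,W_2 + b_2$$
In the code below, $W_1$ maps dim_model to hidden and $W_2$ maps hidden back to dim_model.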
decoder
Compared with the encoder, the differences are:
- An extra masked multi-head attention block. It differs from ordinary multi-head attention in that, at step t, the decoder must not see the inputs from step t+1 onwards, so those later positions are masked out (using the mask operation described earlier, i.e. setting the corresponding scores to a very large negative number; a small example follows this list).
- The other difference is the input of the (second) multi-head attention block: its K and V come from the encoder's output, while its Q comes from the decoder's own masked multi-head attention block, as shown in the figure from the paper.
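Regarding the masking in the first point, here is a minimal sketch of a look-ahead (causal) mask; the sequence length and the -1e9 fill value are illustrative assumptions:

import torch

seq_len = 5
# lower-triangular mask: row t may only attend to columns <= t (1 = visible, 0 = future position)
mask = torch.tril(torch.ones(seq_len, seq_len))
# before the softmax, the attention scores are filled with a large negative value where mask == 0,
# e.g. scores.masked_fill(mask == 0, -1e9)
print(mask)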
Transformer-based text classification in PyTorch
To use the Transformer for text classification, only the encoder is needed; a classification layer is added on top of its output.
This implementation does not use the parameter settings from the paper; on a 10-class task it reaches 86.58% accuracy on the test set.
A side note: in my first version of the model I did not add dropout, and the final accuracy was only around ten percent. So when a reproduction falls short of the authors' results, the problem may not be the model structure; it may be small details, such as parameter settings, that the authors did not mention but that really matter.
For a more detailed explanation of the code, see NLP Text Classification Study Notes 0.
import copy
import json
import pickle
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
class Config(object):
def __init__(self, embedding_pre):
self.embedding_path = 'data/embedding.npz'
self.embedding_model_path = "mymodel/word2vec.model"
        self.train_path = 'data/train.df'    # training set
        self.dev_path = 'data/valid.df'      # validation set
        self.test_path = 'data/test.df'      # test set
        self.class_path = 'data/class.json'  # class list
        self.vocab_path = 'data/vocab.pkl'   # vocabulary
        self.save_path = 'mymodel/transformer.pth'  # saved model weights
        self.embedding_pretrained = torch.tensor(np.load(self.embedding_path, allow_pickle=True)["embeddings"].astype(
            'float32')) if embedding_pre else None  # pre-trained word vectors
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device
        self.dropout = 0.5  # dropout rate
        self.num_classes = len(json.load(open(self.class_path, encoding='utf-8')))  # number of classes
        self.n_vocab = 0  # vocabulary size, set at runtime
        self.epochs = 10  # number of epochs
        self.batch_size = 128  # mini-batch size
        self.maxlen = 32  # sequence length (pad short sentences, truncate long ones)
        self.learning_rate = 5e-4  # learning rate
        self.embed_size = self.embedding_pretrained.size(1) \
            if self.embedding_pretrained is not None else 200  # word-vector dimension
        self.dim_model = 200  # equal to the word-vector dimension
        self.hidden = 800  # hidden size of the feed-forward layer
        self.num_head = 5  # must evenly divide the word-vector dimension
        self.num_encoder = 2  # number of encoder blocks
class Pos_encoding(nn.Module):
def __init__(self, d_model,maxlen,dropout):
super(Pos_encoding, self).__init__()
        # PE[pos, 2i] = sin(pos / 10000^(2i / d_model)), PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
        self.PE = torch.tensor([[pos / 10000 ** (i // 2 * 2 / d_model) for i in range(d_model)]
                                for pos in range(maxlen)])
        self.PE[:, 0::2] = torch.sin(self.PE[:, 0::2])
        self.PE[:, 1::2] = torch.cos(self.PE[:, 1::2])
self.dropout = nn.Dropout(dropout)
def forward(self,input):
        # PE has shape (maxlen, d_model) and broadcasts over the batch dimension
        out = self.dropout(input + self.PE.to(input.device))
return out
class Multi_head_attention(nn.Module):
def __init__(self, d_model,num_head,dropout):
super(Multi_head_attention, self).__init__()
self.num_head=num_head
self.d_model=d_model
        # one projection each for Q, K, V; the d_model outputs are later split into num_head groups
        self.WQ = nn.Linear(d_model, d_model, bias=False)
        self.WK = nn.Linear(d_model, d_model, bias=False)
        self.WV = nn.Linear(d_model, d_model, bias=False)
        self.scale = (d_model / num_head) ** 0.5  # sqrt(d_k), with d_k = d_model / num_head
        self.fc = nn.Linear(d_model, d_model)
self.norm=nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, inputs,mask=None):
batch_size=inputs.shape[0]
Q = self.WQ(inputs)
        K = self.WK(inputs)
        V = self.WV(inputs)
        # split into num_head attention groups: [batch, seq_len, num_head, head_dim] -> [batch, num_head, seq_len, head_dim]
Q = Q.view(batch_size , -1,self.num_head, self.d_model//self.num_head).permute(0,2,1,3)
K = K.view(batch_size , -1,self.num_head, self.d_model//self.num_head).permute(0,2,1,3)
V = V.view(batch_size , -1,self.num_head, self.d_model//self.num_head).permute(0,2,1,3)
        # Q @ K^T, then scale by sqrt(d_k)
a=Q @ K.permute(0, 1, 3, 2) / self.scale
        # optional padding mask (not used in this classification setup):
        # if mask is not None:
        #     a = a.masked_fill(mask == 0, -1e9)
a = F.softmax(a, dim=-1)
o = a @ V
        # move the head dimension back and concatenate the groups: [batch, seq_len, num_head * head_dim] = [batch, seq_len, d_model]
        o = o.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
o=self.fc(o)
o = self.dropout(o)
        # residual connection + layer normalization
o=self.norm(o+inputs)
return o
class Feed_forward(nn.Module):
def __init__(self, dim_model,hidden,dropout):
super(Feed_forward, self).__init__()
self.fc1=nn.Linear(dim_model,hidden)
self.fc2 = nn.Linear( hidden,dim_model)
self.norm=nn.LayerNorm(dim_model)
self.dropout = nn.Dropout(dropout)
def forward(self,input):
out=self.fc1(input)
out=F.relu(out)
out=self.fc2(out)
out = self.dropout(out)
out=self.norm(out+input)
return out
class Encoder(nn.Module):
def __init__(self,dim_model,num_head,hidden,dropout):
super(Encoder, self).__init__()
self.multi_head_attention=Multi_head_attention(dim_model,num_head,dropout)
self.feed_forward=Feed_forward(dim_model,hidden,dropout)
def forward(self,input):
out=self.multi_head_attention(input)
out=self.feed_forward(out)
return out
class Model(nn.Module):
def __init__(self, config):
super(Model, self).__init__()
if config.embedding_pretrained is not None:
self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
else:
vocab = pickle.load(open(config.vocab_path, 'rb'))
config.n_vocab=len(vocab.dict)
self.embedding = nn.Embedding(config.n_vocab, config.embed_size, padding_idx=config.n_vocab - 1)
        self.postion = Pos_encoding(config.embed_size, config.maxlen, config.dropout)
self.encoder=Encoder(config.dim_model,config.num_head,config.hidden,config.dropout)
self.encoders=nn.ModuleList([copy.deepcopy(self.encoder) for _ in range(config.num_encoder)])
self.fc=nn.Linear(config.dim_model*config.maxlen,config.num_classes)
def forward(self,input):
out = self.embedding(input)
out=self.postion(out)
for encoder in self.encoders:
out=encoder(out)
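        # flatten all token representations: (batch, maxlen, dim_model) -> (batch, maxlen * dim_model)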
out=out.view(out.shape[0],-1)
out=self.fc(out)
return out
nn.Transformer
PyTorch already provides an implementation of the Transformer. Since classification only needs the encoder, nn.TransformerEncoder is used for the encoding; the accuracy is 86.58%. The code is as follows:
import json
import pickle
import torch
import torch.nn as nn
import numpy as np
class Config(object):
def __init__(self, embedding_pre):
self.embedding_path = 'data/embedding.npz'
self.embedding_model_path = "mymodel/word2vec.model"
        self.train_path = 'data/train.df'    # training set
        self.dev_path = 'data/valid.df'      # validation set
        self.test_path = 'data/test.df'      # test set
        self.class_path = 'data/class.json'  # class list
        self.vocab_path = 'data/vocab.pkl'   # vocabulary
        self.save_path = 'mymodel/transformer.pth'  # saved model weights
        self.embedding_pretrained = torch.tensor(np.load(self.embedding_path, allow_pickle=True)["embeddings"].astype(
            'float32')) if embedding_pre else None  # pre-trained word vectors
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device
        self.dropout = 0.5  # dropout rate
        self.num_classes = len(json.load(open(self.class_path, encoding='utf-8')))  # number of classes
        self.n_vocab = 0  # vocabulary size, set at runtime
        self.epochs = 10  # number of epochs
        self.batch_size = 128  # mini-batch size
        self.maxlen = 32  # sequence length (pad short sentences, truncate long ones)
        self.learning_rate = 5e-4  # learning rate
        self.embed_size = self.embedding_pretrained.size(1) \
            if self.embedding_pretrained is not None else 200  # word-vector dimension
        self.dim_model = 200  # equal to the word-vector dimension
        self.num_head = 5  # must evenly divide the word-vector dimension
        self.num_encoder = 2  # number of encoder blocks
class Pos_encoding(nn.Module):
def __init__(self, d_model,maxlen,dropout):
super(Pos_encoding, self).__init__()
        # PE[pos, 2i] = sin(pos / 10000^(2i / d_model)), PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
        self.PE = torch.tensor([[pos / 10000 ** (i // 2 * 2 / d_model) for i in range(d_model)]
                                for pos in range(maxlen)])
        self.PE[:, 0::2] = torch.sin(self.PE[:, 0::2])
        self.PE[:, 1::2] = torch.cos(self.PE[:, 1::2])
self.dropout = nn.Dropout(dropout)
def forward(self,input):
        # PE has shape (maxlen, d_model) and broadcasts over the batch dimension
        out = self.dropout(input + self.PE.to(input.device))
return out
class Model(nn.Module):
def __init__(self, config):
super(Model, self).__init__()
if config.embedding_pretrained is not None:
self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
else:
vocab = pickle.load(open(config.vocab_path, 'rb'))
config.n_vocab=len(vocab.dict)
self.embedding = nn.Embedding(config.n_vocab, config.embed_size, padding_idx=config.n_vocab - 1)
        self.postion = Pos_encoding(config.embed_size, config.maxlen, config.dropout)
        # batch_first=True makes the layer accept (batch, seq_len, d_model) input (requires PyTorch >= 1.9)
        self.transformer_layer = nn.TransformerEncoderLayer(config.dim_model, config.num_head, batch_first=True)
self.transformer_encoder=nn.TransformerEncoder(self.transformer_layer,config.num_encoder)
self.fc=nn.Linear(config.dim_model*config.maxlen,config.num_classes)
def forward(self,input):
out = self.embedding(input)
out=self.postion(out)
out=self.transformer_encoder(out)
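        # flatten all token representations: (batch, maxlen, dim_model) -> (batch, maxlen * dim_model)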
out=out.view(out.shape[0],-1)
out=self.fc(out)
return out
In addition, there are pre-trained models that can be called directly; see https://pytorch.org/hub/