NLP Text Classification Learning Notes 0: Data Preprocessing and Training
This series of posts is based on the GitHub project Chinese-Text-Classification-Pytorch.
Dataset and Splits
The Chinese dataset online_shopping_10_cats from GitHub is used. It contains over 60,000 review entries in 10 categories, with roughly 30,000 positive and 30,000 negative reviews. The 10 categories are: 书籍 (books), 平板 (tablets), 手机 (mobile phones), 水果 (fruit), 洗发水 (shampoo), 热水器 (water heaters), 蒙牛 (Mengniu dairy), 衣服 (clothes), 计算机 (computers), and 酒店 (hotels). The dataset is a CSV file with the following structure:

cat | label | review |
---|---|---|
one of the 10 categories | sentiment polarity, 0 or 1 | review text |
The data is imbalanced across the 10 categories:

cat | count |
---|---|
平板 | 10000 |
水果 | 10000 |
洗发水 | 10000 |
衣服 | 10000 |
酒店 | 10000 |
计算机 | 3992 |
书籍 | 3851 |
手机 | 2323 |
蒙牛 | 2033 |
热水器 | 575 |
64% of the data is used for training, 20% for testing, and 16% for validation. This is obtained by applying the split function below twice: first 20% of the full set is held out as the test set, then 20% of the remaining 80% (i.e. 16% of the total) is held out as the validation set.
```python
# Split a CSV file into two parts
# Reference: https://blog.csdn.net/weixin_38008864/article/details/99915084
def split_csv(infile, trainfile, valtestfile, seed=999, ratio=0.2):
    df = pd.read_csv(infile)
    idxs = np.arange(df.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idxs)
    val_size = int(len(idxs) * ratio)
    df.iloc[idxs[:val_size], :].to_csv(valtestfile, index=False)
    df.iloc[idxs[val_size:], :].to_csv(trainfile, index=False)
```
Word Embeddings
The pre-trained word vectors are trained directly with the gensim word2vec implementation on the Microsoft Chinese corpus msr_training, with the vector dimension set to 200 and the training window size set to 5.
```python
# Train the pre-trained word vector model (200 dimensions)
dataset = word2vec.Text8Corpus(corpus)
model = word2vec.Word2Vec(sentences=dataset, vector_size=200, window=5, min_count=1, workers=4)
model.save(embedding_model_path)
```
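As a quick check that the embeddings were learned (not part of the original pipeline), the saved model can be reloaded and queried. The word "手机" below is only an illustrative lookup and assumes it occurs in the msr_training corpus:

```python
from gensim.models import word2vec

# Reload the saved word2vec model and inspect one vector
model = word2vec.Word2Vec.load("mymodel/word2vec.model")
print(model.wv["手机"].shape)                 # (200,)
print(model.wv.most_similar("手机", topn=5))  # nearest neighbours in the embedding space
```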
The model trained with gensim is not convenient to use directly, so after the vocabulary of the dataset has been built later on, it is converted into the form {word id: pre-trained embedding vector}.
```python
# Build the pre-trained embedding matrix
def build_embed_pretrained(vocab_path, embedding_model_path, embedding_path):
    # Load the trained word2vec model
    model = word2vec.Word2Vec.load(embedding_model_path)
    # Load the vocabulary
    vocab = pickle.load(open(vocab_path, 'rb'))
    embeddings = [0] * len(vocab.dict)
    # Look up the vector of every word in the vocabulary
    for word, id in vocab.dict.items():
        if word in model.wv:
            embeddings[id] = model.wv[word]
        else:
            # Words missing from the word2vec model are replaced with zeros
            embeddings[id] = np.zeros((200,))
    # Save the matrix in an .npz file
    np.savez_compressed(embedding_path, embeddings=embeddings)
```
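When embedding_pre=True in the models of later posts, this file supplies the initial weights of the embedding layer. A minimal sketch of how it could be loaded into torch.nn.Embedding, assuming embedding_path='data/embedding' as in preTreatment.py below (np.savez_compressed appends the .npz suffix); freeze=False, i.e. fine-tuning the vectors, is my assumption rather than something fixed by this post:

```python
import numpy as np
import torch
from torch import nn

# Load the matrix saved by build_embed_pretrained
weights = np.load('data/embedding.npz', allow_pickle=True)['embeddings']
weights = torch.tensor(np.asarray(weights, dtype=np.float32))

# Initialise an embedding layer with the pre-trained vectors; PAD has id 1 in Word2Num
embedding = nn.Embedding.from_pretrained(weights, freeze=False, padding_idx=1)
```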
Text Processing
Label Category Processing
Read the cat column of the data, map each category to a numeric id as shown below, and finally save the mapping as a JSON file.
```json
{"书籍": 0, "平板": 1, "手机": 2, "水果": 3, "洗发水": 4, "热水器": 5, "蒙牛": 6, "衣服": 7, "计算机": 8, "酒店": 9}
```
```python
df = pd.read_csv(path)
# Build the label -> numeric id mapping
with open(class_path, 'w', encoding='utf-8') as f:
    class_dict = {}
    for i in df['cat'].unique():
        class_dict[i] = len(class_dict)
    json.dump(class_dict, f, ensure_ascii=False)
```
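At prediction time the mapping can simply be inverted to turn a predicted id back into a category name; a small sketch (id2cat is an illustrative name):

```python
import json

# Load the saved {category: id} mapping and invert it
with open('data/class.json', 'r', encoding='utf-8') as f:
    class_dict = json.load(f)
id2cat = {v: k for k, v in class_dict.items()}
print(id2cat[0])  # "书籍"
```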
Building the Vocabulary
For the text, after cleaning each review (removing digits, Latin characters, and punctuation) and segmenting it with jieba, a helper class counts the words and, subject to limits on word frequency and vocabulary size, stores them as a dictionary of the form {word: numeric id}; the class instance is then saved with pickle.
```python
vocab = mydataset.Word2Num()
for i in df['review']:
    if pd.notnull(i):
        i = remove(i)
        sentence = jieba.lcut(i)
        vocab.fit(sentence)
vocab.build_vocab(min=MIN_FREQ, max_features=MAX_VOCAB_SIZE)
pickle.dump(vocab, open(vocab_path, 'wb'))
```
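To illustrate what the saved vocabulary does (the example sentence is made up, and the resulting ids depend on the corpus): transform maps words to ids, pads short sentences with PAD (id 1), truncates long ones to max_len, and maps out-of-vocabulary words to UNK (id 0).

```python
import pickle
import jieba

vocab = pickle.load(open('data/vocab.pkl', 'rb'))

sentence = jieba.lcut('这个平板的屏幕很不错')  # illustrative review fragment
ids = vocab.transform(sentence, max_len=32)    # list of 32 ids, padded with PAD (1)
print(ids)
print(vocab.inverse_transform(ids))            # map the ids back to words
```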
Data Serialization
Load the vocabulary class, the label category file, and the data. For each row, convert the cat value to its numeric id, clean and segment the review text, and turn it into a sequence of word ids via the vocabulary class. The processed data is finally saved in the form dict = {'label': label, 'text': text}, where label holds the category ids and text holds the id sequences.
```python
# Serialize the data
def build_dataset(path, class_path, vocab_path, save_path):
    w2 = pickle.load(open(vocab_path, 'rb'))
    label_dict = json.load(open(class_path, 'r', encoding='utf-8'))
    df = pd.read_csv(path)
    label = []
    text = []
    for index, i in df.iterrows():
        if pd.notnull(i['review']):
            label.append(label_dict[i['cat']])
            sentence = jieba.lcut(remove(i['review']))
            text.append(w2.transform(sentence, max_len=32))
    # Collect labels and id sequences into a DataFrame
    dict = {'label': label, 'text': text}
    df = pd.DataFrame(dict)
    # Save the DataFrame
    df.to_pickle(save_path)
```
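One way to sanity-check the serialized file (pd.read_pickle is the counterpart of to_pickle; data/train.df is the train_file path used in preTreatment.py below):

```python
import pandas as pd

# Inspect the serialized training data
df = pd.read_pickle('data/train.df')
print(df.shape)                  # (number of non-empty reviews, 2)
print(df['label'].iloc[0])       # numeric category id
print(len(df['text'].iloc[0]))   # 32: every review is padded/truncated to max_len
```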
Building the DataLoader
To make batching easier later on, a GetLoader class is defined that inherits from torch.utils.data.Dataset and overrides the __getitem__() and __len__() methods (see mydataset.py in the full code below).
Project Layout
The dataset is processed once in preTreatment.py, and run.py is then used to train the chosen model on the processed data; the resulting layout is sketched below.
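A rough sketch of the directory layout, inferred from the paths used in the code (the mymodel package with myMLP/myCNN/myRNN is introduced in the other posts of this series):

```
.
├── preTreatment.py   # data splitting, vocabulary, serialization, pre-trained embeddings
├── mydataset.py      # GetLoader and Word2Num
├── run.py            # training / validation / testing
├── mymodel/          # myMLP, myCNN, myRNN and word2vec.model
└── data/             # class.json, vocab.pkl, csv splits, *.df files, embedding.npz
```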
Training
Once the data has been processed, you only need to import the corresponding model structure and configuration in run.py; the models themselves are described in the other posts of this series.
Full Code
mydataset.py
```python
import torch

# GetLoader inherits from torch.utils.data.Dataset and overrides __getitem__() and __len__()
class GetLoader(torch.utils.data.Dataset):
    # Store the data and the corresponding labels
    def __init__(self, data_root, data_label):
        self.data = data_root
        self.label = data_label

    # index is supplied by the DataLoader when it assembles a batch;
    # return the data item together with its label
    def __getitem__(self, index):
        data = self.data[index]
        labels = self.label[index]
        return data, labels

    # Return the dataset size so the DataLoader knows how to split it into batches
    def __len__(self):
        return len(self.data)

# Text serialization: map words to numeric ids and back
class Word2Num():
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.count = {}

    # Update the word counts with one segmented sentence
    def fit(self, sentence):
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=5, max=None, max_features=None):
        # Filter words by frequency
        if min is not None:
            self.count = {word: value for word, value in self.count.items() if value > min}
        if max is not None:
            self.count = {word: value for word, value in self.count.items() if value < max}
        # Limit the number of words kept
        if max_features is not None:
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features]
            self.count = dict(temp)
        # Build the word -> id dictionary
        for word in self.count:
            self.dict[word] = len(self.dict)
        # Build the reverse id -> word dictionary
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    # Convert a sentence of words into word ids
    def transform(self, sentence, max_len=None):
        # Pad or truncate the sentence to max_len
        if max_len is not None:
            if max_len > len(sentence):
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
            if max_len < len(sentence):
                sentence = sentence[:max_len]
        return [self.dict.get(word, self.UNK) for word in sentence]

    # Convert word ids back to words
    def inverse_transform(self, indices):
        return [self.inverse_dict.get(idx) for idx in indices]
```
preTreatment.py
```python
import json
import pickle
import jieba
import pandas as pd
import numpy as np
from gensim.models import word2vec
import re
import mydataset

# Split a CSV file into two parts
# Reference: https://blog.csdn.net/weixin_38008864/article/details/99915084
def split_csv(infile, trainfile, valtestfile, seed=999, ratio=0.2):
    df = pd.read_csv(infile)
    idxs = np.arange(df.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idxs)
    val_size = int(len(idxs) * ratio)
    df.iloc[idxs[:val_size], :].to_csv(valtestfile, index=False)
    df.iloc[idxs[val_size:], :].to_csv(trainfile, index=False)

# Remove digits, Latin characters, and punctuation
def remove(text):
    remove_chars = '[0-9a-zA-Z’!"#$%&\'()*+,-./:;<=>?@,,。。 ?★、…【】《》?“”,‘’。’![\\]^_`{|}~]+'
    return re.sub(remove_chars, '', text).strip()

# Build the vocabulary and convert the labels to numeric ids
def build_vocab_label(MAX_VOCAB_SIZE, MIN_FREQ, path, class_path, vocab_path):
    df = pd.read_csv(path)
    # Build the label -> numeric id mapping
    with open(class_path, 'w', encoding='utf-8') as f:
        class_dict = {}
        for i in df['cat'].unique():
            class_dict[i] = len(class_dict)
        json.dump(class_dict, f, ensure_ascii=False)
    # Build the vocabulary
    vocab = mydataset.Word2Num()
    for i in df['review']:
        if pd.notnull(i):
            i = remove(i)
            sentence = jieba.lcut(i)
            vocab.fit(sentence)
    vocab.build_vocab(min=MIN_FREQ, max_features=MAX_VOCAB_SIZE)
    pickle.dump(vocab, open(vocab_path, 'wb'))

# Serialize the data
def build_dataset(path, class_path, vocab_path, save_path):
    w2 = pickle.load(open(vocab_path, 'rb'))
    label_dict = json.load(open(class_path, 'r', encoding='utf-8'))
    df = pd.read_csv(path)
    label = []
    text = []
    for index, i in df.iterrows():
        if pd.notnull(i['review']):
            label.append(label_dict[i['cat']])
            sentence = jieba.lcut(remove(i['review']))
            text.append(w2.transform(sentence, max_len=32))
    # Collect labels and id sequences into a DataFrame
    dict = {'label': label, 'text': text}
    df = pd.DataFrame(dict)
    # Save the DataFrame
    df.to_pickle(save_path)

# Build the pre-trained embedding matrix
def build_embed_pretrained(vocab_path, embedding_model_path, embedding_path):
    # Load the trained word2vec model
    model = word2vec.Word2Vec.load(embedding_model_path)
    # Load the vocabulary
    vocab = pickle.load(open(vocab_path, 'rb'))
    embeddings = [0] * len(vocab.dict)
    # Look up the vector of every word in the vocabulary
    for word, id in vocab.dict.items():
        if word in model.wv:
            embeddings[id] = model.wv[word]
        else:
            # Words missing from the word2vec model are replaced with zeros
            embeddings[id] = np.zeros((200,))
    # Save the matrix in an .npz file
    np.savez_compressed(embedding_path, embeddings=embeddings)

if __name__ == '__main__':
    # Maximum vocabulary size
    MAX_VOCAB_SIZE = 100000
    # Word frequency threshold (words with count <= MIN_FREQ are dropped)
    MIN_FREQ = 1
    corpus = 'C:/Users/DELL/Downloads/icwb2-data/icwb2-data/training/msr_training.utf8'
    embedding_path = 'data/embedding'
    embedding_model_path = "mymodel/word2vec.model"
    dataset_path = r'C:\Users\DELL\Desktop\mydata\online_shopping_10_cats.csv'
    class_path = 'data/class.json'
    vocab_path = 'data/vocab.pkl'
    trainfiles = 'data/train.csv'
    testfile = 'data/test.csv'
    trainfile = 'data/dataset_train.csv'
    vaildfile = 'data/dataset_valid.csv'
    train_file = 'data/train.df'
    test_file = 'data/test.df'
    vaild_file = 'data/valid.df'
    # Train the 200-dimensional word2vec model; skip this once the model file exists
    dataset = word2vec.Text8Corpus(corpus)
    model = word2vec.Word2Vec(sentences=dataset, vector_size=200, window=5, min_count=1, workers=4)
    model.save(embedding_model_path)
    build_vocab_label(MAX_VOCAB_SIZE, MIN_FREQ, dataset_path, class_path, vocab_path)
    # First split online_shopping_10_cats.csv into train.csv and test.csv; skip once the splits exist
    split_csv(infile=dataset_path, trainfile=trainfiles, valtestfile=testfile, seed=999, ratio=0.2)
    # Then split train.csv into dataset_train.csv and dataset_valid.csv
    split_csv(infile=trainfiles, trainfile=trainfile, valtestfile=vaildfile, seed=999, ratio=0.2)
    build_dataset(trainfile, class_path, vocab_path, train_file)
    build_dataset(vaildfile, class_path, vocab_path, vaild_file)
    build_dataset(testfile, class_path, vocab_path, test_file)
    build_embed_pretrained(vocab_path, embedding_model_path, embedding_path)
```
run.py
```python
from mymodel import myMLP, myCNN, myRNN
import mydataset
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader

# Load the configuration of the chosen model; embedding_pre=True uses the pre-trained word vectors
# config = myMLP.Config(embedding_pre=True)
config = myCNN.Config(embedding_pre=True)
# config = myRNN.Config(embedding_pre=True)

# Collate function for the DataLoader: stack texts and labels into LongTensors
def collate_fn(batch):
    text, label = list(zip(*batch))
    text = torch.LongTensor(text)
    label = torch.LongTensor(label)
    return text, label

# Load the training, validation, and test sets
vectorized_data = np.load(config.train_path, allow_pickle=True)
train_ds = mydataset.GetLoader(vectorized_data['text'], vectorized_data['label'])
train_dl = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
vectorized_data = np.load(config.dev_path, allow_pickle=True)
valid_ds = mydataset.GetLoader(vectorized_data['text'], vectorized_data['label'])
valid_dl = DataLoader(valid_ds, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
vectorized_data = np.load(config.test_path, allow_pickle=True)
test_ds = mydataset.GetLoader(vectorized_data['text'], vectorized_data['label'])
test_dl = DataLoader(test_ds, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

# Accuracy helper: return the number of correct predictions and the batch size
def accuracys(pre, label):
    pre = torch.max(pre.data, 1)[1]
    accuracy = pre.eq(label.data.view_as(pre)).sum()
    return accuracy, len(label)

# Build the network
# model = myMLP.MLP(config).to(config.device)
model = myCNN.Model(config).to(config.device)
# model = myRNN.Model(config).to(config.device)

# Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
best_loss = float('inf')
for epoch in range(config.epochs):
    train_acc = []
    for batch_idx, (data, target) in enumerate(train_dl):
        model.train()
        out = model(data)
        loss = criterion(out, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_acc.append(accuracys(out, target))
        train_r = (sum(tup[0] for tup in train_acc), sum(tup[1] for tup in train_acc))
        print('epoch: {}\t[{}/{}] {:.0f}%\tloss: {:.6f}\ttrain acc: {:.2f}%'.format(
            epoch, batch_idx, len(train_dl), 100. * batch_idx / len(train_dl), loss.data,
            100. * train_r[0].numpy() / train_r[1]
        ))
        # Validate every 100 batches
        if batch_idx % 100 == 0 and batch_idx != 0:
            model.eval()
            val_acc = []
            loss_total = 0
            with torch.no_grad():
                for (data, target) in valid_dl:
                    out = model(data)
                    loss_total = criterion(out, target).data + loss_total
                    val_acc.append(accuracys(out, target))
            val_r = (sum(tup[0] for tup in val_acc), sum(tup[1] for tup in val_acc))
            print('val loss: {:.6f}\tval acc: {:.2f}%'.format(loss_total / len(valid_dl), 100. * val_r[0].numpy() / val_r[1]))
            # Save the model whenever the validation loss improves
            if loss_total < best_loss:
                best_loss = loss_total
                torch.save(model.state_dict(), config.save_path)

# Testing
model.load_state_dict(torch.load(config.save_path))
model.eval()
test_acc = []
with torch.no_grad():
    for (data, target) in test_dl:
        out = model(data)
        test_acc.append(accuracys(out, target))
test_r = (sum(tup[0] for tup in test_acc), sum(tup[1] for tup in test_acc))
print('test acc: {:.2f}%'.format(100. * test_r[0].numpy() / test_r[1]))
```