NLP Text Classification Notes 0: Data Preprocessing and Training Setup

This series of articles references the GitHub project Chinese-Text-Classification-Pytorch.

Dataset and Splits

The data come from the Chinese online_shopping_10_cats dataset on GitHub. It contains over 60,000 reviews in 10 categories, with roughly 30,000 positive and 30,000 negative reviews. The 10 categories are: 书籍 (books), 平板 (tablets), 手机 (mobile phones), 水果 (fruit), 洗发水 (shampoo), 热水器 (water heaters), 蒙牛 (Mengniu dairy), 衣服 (clothes), 计算机 (computers) and 酒店 (hotels). The dataset is a csv file with the following structure:

cat: one of the 10 categories
label: sentiment polarity (positive/negative), with value 0 or 1
review: the review text

The data are imbalanced across the 10 categories:

cat count
平板 10000
水果 10000
洗发水 10000
衣服 10000
酒店 10000
计算机 3992
书籍 3851
手机 2323
蒙牛 2033
热水器 575
64% of the data is used for training, 20% for testing and 16% for validation. This is achieved with two successive 80/20 splits: 20% of the full set is first held out as test data, then 20% of the remaining 80% (0.8 × 0.2 = 16% of the total) becomes validation data, leaving 0.8 × 0.8 = 64% for training.

# Split a csv file into two parts (shuffled by row)
# Adapted from https://blog.csdn.net/weixin_38008864/article/details/99915084
def split_csv(infile, trainfile, valtestfile, seed=999, ratio=0.2):
    df = pd.read_csv(infile)
    idxs = np.arange(df.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idxs)
    val_size = int(len(idxs) * ratio)
    df.iloc[idxs[:val_size], :].to_csv(valtestfile, index=False)
    df.iloc[idxs[val_size:], :].to_csv(trainfile, index=False)

Word Embeddings

The pretrained word vectors are trained directly with the gensim library on the Microsoft Research Chinese corpus msr_training using word2vec; the embedding dimension is set to 200 and the context window size to 5.

# Train the word2vec embedding model, 200 dimensions
dataset=word2vec.Text8Corpus(corpus)
model = word2vec.Word2Vec(sentences=dataset, vector_size=200, window=5, min_count=1, workers=4)
model.save(embedding_model_path)
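
Once training has finished, the saved model can be sanity-checked directly through gensim's wv interface. A quick sketch, assuming the model was saved to mymodel/word2vec.model as above and that the word '手机' occurs in the training corpus:

from gensim.models import word2vec

model = word2vec.Word2Vec.load("mymodel/word2vec.model")
print(model.wv["手机"].shape)                 # (200,): one 200-dimensional vector per word
print(model.wv.most_similar("手机", topn=5))  # nearest neighbours by cosine similarity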

The model trained by gensim is not convenient to use directly, so after the dataset vocabulary has been built it is converted into the form {word id: pretrained embedding vector for that word}, stored as an array indexed by word id.

# Build the pretrained embedding matrix
def build_embed_pretrained(vocab_path,embedding_model_path,embedding_path):
    # load the trained word2vec model
    model = word2vec.Word2Vec.load(embedding_model_path)
    # load the vocabulary
    vocab=pickle.load(open(vocab_path,'rb'))
    embeddings = [0] *len(vocab.dict)
    # look up the pretrained vector for every word in the vocabulary
    for word,idx in vocab.dict.items():
        if word in model.wv:
            embeddings[idx]=model.wv[word]
        # words missing from the word2vec model get a zero vector
        else:
            embeddings[idx]=np.zeros((200,))
    # save the matrix in a compressed .npz file
    np.savez_compressed(embedding_path, embeddings=embeddings)
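
The saved matrix is later used to initialize the embedding layer of the models. A minimal sketch of that loading step, assuming the matrix was written to data/embedding.npz and the model side uses PyTorch's nn.Embedding (the model definitions themselves are covered in later articles):

import numpy as np
import torch
from torch import nn

# np.savez_compressed appends the .npz extension to 'data/embedding'
pretrained = np.load("data/embedding.npz")["embeddings"].astype("float32")
# freeze=False keeps the embeddings trainable during fine-tuning
embedding = nn.Embedding.from_pretrained(torch.from_numpy(pretrained), freeze=False)
print(embedding.weight.shape)  # (vocabulary size, 200)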

Text Processing

Category Labels

Read the cat column of the data and build a mapping from each category to a numeric id, as shown below; the mapping is finally saved as a json file.
{"书籍": 0, "平板": 1, "手机": 2, "水果": 3, "洗发水": 4, "热水器": 5, "蒙牛": 6, "衣服": 7, "计算机": 8, "酒店": 9}

df = pd.read_csv(path)
# build the mapping from category name to numeric id
with open(class_path, 'w', encoding='utf-8') as f:
    class_dict = {}
    for i in df['cat'].unique():
        class_dict[i] = len(class_dict)
    json.dump(class_dict, f, ensure_ascii=False)
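
At prediction time this mapping needs to be inverted so that a predicted id can be turned back into a category name; a small sketch of that lookup (the id2class name is just illustrative):

import json

with open('data/class.json', 'r', encoding='utf-8') as f:
    class_dict = json.load(f)
id2class = {v: k for k, v in class_dict.items()}
print(id2class[0])  # 书籍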

Building the Vocabulary

For the review text, after cleaning it (the remove() function strips digits, Latin letters and punctuation) and segmenting it with jieba, a Word2Num class counts the words and, subject to a word-frequency threshold and a maximum vocabulary size, builds a dictionary of the form {word: numeric id}; the class instance is then pickled.

# count words over all reviews, then build the vocabulary
vocab=mydataset.Word2Num()
for i in df['review']:
    if pd.notnull(i):
        i = remove(i)
        sentence=jieba.lcut(i)
        vocab.fit(sentence)
vocab.build_vocab(min=MIN_FREQ,max_features=MAX_VOCAB_SIZE)
pickle.dump(vocab, open(vocab_path, 'wb'))
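
A toy sketch of how the Word2Num class behaves (its full definition is in mydataset.py below); the example sentences are made up:

import mydataset

vocab = mydataset.Word2Num()
vocab.fit(['今天', '天气', '很', '好'])
vocab.build_vocab(min=0)  # keep every word whose frequency is > 0
print(vocab.dict)
# {'UNK': 0, 'PAD': 1, '今天': 2, '天气': 3, '很': 4, '好': 5}
print(vocab.transform(['今天', '很', '冷'], max_len=5))
# [2, 4, 0, 1, 1]: the unseen word maps to UNK and the sentence is padded with PAD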

Serializing the Data

Load the saved vocabulary class, the category file and the data. For each row, the cat value is converted to its numeric id, and the review is cleaned and segmented and then turned into a sequence of word ids by the vocabulary class; the processed data are finally saved as a DataFrame built from {'label': label, 'text': text}, where label holds the category ids and text the id sequences.

# Serialize the data: map labels and texts to numeric ids
def build_dataset(path,class_path,vocab_path,save_path):
    w2=pickle.load(open(vocab_path, 'rb'))
    label_dict=json.load(open(class_path,'r',encoding='utf-8'))
    df = pd.read_csv(path)
    label=[]
    text=[]
    for index,i in df.iterrows():
        if pd.notnull(i['review']):
            label.append(label_dict[i['cat']])
            sentence=jieba.lcut(remove(i['review']))
            text.append(w2.transform(sentence,max_len=32))
    # collect into a DataFrame
    data = {'label':label,'text':text}
    df = pd.DataFrame(data)
    # save the DataFrame as a pickle
    df.to_pickle(save_path)
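
The saved file is just a pickled DataFrame, so it can be inspected directly with pandas (run.py later reads the same files with np.load(..., allow_pickle=True), which falls back to pickle for files that are not in .npy/.npz format and therefore returns the same DataFrame). A quick check, assuming the training split was written to data/train.df:

import pandas as pd

df = pd.read_pickle('data/train.df')
print(df.shape)                                 # (number of reviews, 2)
print(df.iloc[0]['label'], df.iloc[0]['text'])  # a category id and a length-32 list of word ids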

Building the DataLoader

To make later batch processing convenient, a GetLoader class is defined that inherits from torch's Dataset class and overrides the __getitem__() and __len__() methods.

Project Structure

All dataset preprocessing is done in preTreatment.py; run.py then loads the corresponding model together with the processed data for training.

Training

Once the data have been processed, just import the corresponding model structure and parameters in run.py; the models themselves are introduced in the other articles of this series.

Complete Code

mydataset.py

import torch

# GetLoader inherits from Dataset and overrides __getitem__() and __len__()
class GetLoader(torch.utils.data.Dataset):
    # store the data and the labels
    def __init__(self, data_root, data_label):
        self.data = data_root
        self.label = data_label
    # index is the sample index produced by the DataLoader; return the sample together with its label
    def __getitem__(self, index):
        data = self.data[index]
        labels = self.label[index]
        return data, labels
    # return the dataset size so that DataLoader knows how to split it into batches
    def __len__(self):
        return len(self.data)

# Text-to-id vocabulary
class Word2Num():
    UNK_TAG="UNK"
    PAD_TAG="PAD"
    UNK=0
    PAD=1
    def __init__(self):
        self.dict={
            self.UNK_TAG:self.UNK,
            self.PAD_TAG:self.PAD
        }
        self.count={}

    # count the words of a single (segmented) sentence
    def fit(self,sentence):
        for word in sentence:
            self.count[word]=self.count.get(word,0)+1

    def build_vocab(self,min=5,max=None,max_features=None):
        # frequency filters: keep words with frequency > min and, if set, < max
        if min is not None:
            self.count={word:value for word,value in self.count.items() if value>min}
        if max is not None:
            self.count={word:value for word,value in self.count.items() if value<max}

        # keep at most max_features of the most frequent words
        if max_features is not None:
            temp=sorted(self.count.items(),key=lambda x:x[-1],reverse=True)[:max_features]
            self.count=dict(temp)
        # assign an id to every remaining word
        for word in self.count:
            self.dict[word]=len(self.dict)
        # build the reverse dictionary (id -> word)
        self.inverse_dict=dict(zip(self.dict.values(),self.dict.keys()))

    # map words to their ids
    def transform(self,sentence,max_len=None):
        # pad or truncate the sentence to max_len
        if max_len is not None:
            if max_len>len(sentence):
                sentence=sentence+[self.PAD_TAG]*(max_len-len(sentence))
            if max_len<len(sentence):
                sentence=sentence[:max_len]
        return [self.dict.get(word,self.UNK) for word in sentence]

    # map ids back to words
    def inverse_transform(self,indices):
        return [self.inverse_dict.get(idx) for idx in indices]

preTreatment.py

import json
import pickle
import jieba
import pandas as pd
import numpy as np
from gensim.models import word2vec
import re
import mydataset

# Split a csv file into two parts (shuffled by row)
# Adapted from https://blog.csdn.net/weixin_38008864/article/details/99915084
def split_csv(infile, trainfile, valtestfile, seed=999, ratio=0.2):
    df = pd.read_csv(infile)
    idxs = np.arange(df.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idxs)
    val_size = int(len(idxs) * ratio)
    df.iloc[idxs[:val_size], :].to_csv(valtestfile, index=False)
    df.iloc[idxs[val_size:], :].to_csv(trainfile, index=False)

# Strip digits, Latin letters and punctuation from the text
def remove(text):
    remove_chars = '[0-9a-zA-Z’!"#$%&\'()*+,-./:;<=>?@,,。。 ?★、…【】《》?“”,‘’。’![\\]^_`{|}~]+'
    return re.sub(remove_chars, '', text).strip()

# Build the vocabulary and the category-to-id mapping, converting labels to numeric form
def build_vocab_label(MAX_VOCAB_SIZE,MIN_FREQ,path,class_path,vocab_path):
    df = pd.read_csv(path)
    # build the mapping from category name to numeric id
    with open(class_path, 'w', encoding='utf-8') as f:
        class_dict = {}
        for i in df['cat'].unique():
            class_dict[i] = len(class_dict)
        json.dump(class_dict, f, ensure_ascii=False)
    # build the vocabulary
    vocab=mydataset.Word2Num()
    for i in df['review']:
        if pd.notnull(i):
            i = remove(i)
            sentence=jieba.lcut(i)
            vocab.fit(sentence)
    vocab.build_vocab(min=MIN_FREQ,max_features=MAX_VOCAB_SIZE)
    pickle.dump(vocab, open(vocab_path, 'wb'))

# Serialize the data: map labels and texts to numeric ids
def build_dataset(path,class_path,vocab_path,save_path):
    w2=pickle.load(open(vocab_path, 'rb'))
    label_dict=json.load(open(class_path,'r',encoding='utf-8'))
    df = pd.read_csv(path)
    label=[]
    text=[]
    for index,i in df.iterrows():
        if pd.notnull(i['review']):
            label.append(label_dict[i['cat']])
            sentence=jieba.lcut(remove(i['review']))
            text.append(w2.transform(sentence,max_len=32))
    # collect into a DataFrame
    data = {'label':label,'text':text}
    df = pd.DataFrame(data)
    # save the DataFrame as a pickle
    df.to_pickle(save_path)


# Build the pretrained embedding matrix
def build_embed_pretrained(vocab_path,embedding_model_path,embedding_path):
    # load the trained word2vec model
    model = word2vec.Word2Vec.load(embedding_model_path)
    # load the vocabulary
    vocab=pickle.load(open(vocab_path,'rb'))
    embeddings = [0] *len(vocab.dict)
    # look up the pretrained vector for every word in the vocabulary
    for word,idx in vocab.dict.items():
        if word in model.wv:
            embeddings[idx]=model.wv[word]
        # words missing from the word2vec model get a zero vector
        else:
            embeddings[idx]=np.zeros((200,))
    # save the matrix in a compressed .npz file
    np.savez_compressed(embedding_path, embeddings=embeddings)

if __name__ == '__main__':
    # maximum vocabulary size
    MAX_VOCAB_SIZE = 100000
    # word-frequency threshold: words with frequency <= MIN_FREQ are dropped
    MIN_FREQ = 1
    corpus='C:/Users/DELL/Downloads/icwb2-data/icwb2-data/training/msr_training.utf8'
    embedding_path = 'data/embedding'
    embedding_model_path = "mymodel/word2vec.model"

    dataset_path = r'C:\Users\DELL\Desktop\mydata\online_shopping_10_cats.csv'
    class_path='data/class.json'
    vocab_path='data/vocab.pkl'
    trainfiles='data/train.csv'
    testfile='data/test.csv'
    trainfile='data/dataset_train.csv'
    vaildfile='data/dataset_valid.csv'
    train_file='data/train.df'
    test_file='data/test.df'
    vaild_file='data/valid.df'

    # train the 200-dimensional word2vec model; once the model exists this step can be skipped
    dataset=word2vec.Text8Corpus(corpus)
    model = word2vec.Word2Vec(sentences=dataset, vector_size=200, window=5, min_count=1, workers=4)
    model.save(embedding_model_path)

    build_vocab_label(MAX_VOCAB_SIZE,MIN_FREQ,dataset_path,class_path,vocab_path)

    # first split online_shopping_10_cats.csv into train.csv and test.csv; once the splits exist this can be skipped
    split_csv(infile=dataset_path,trainfile=trainfiles,valtestfile=testfile,seed=999,ratio=0.2)
    # then split train.csv into dataset_train.csv and dataset_valid.csv
    split_csv(infile=trainfiles,trainfile=trainfile,valtestfile=vaildfile,seed=999,ratio=0.2)

    build_dataset(trainfile,class_path,vocab_path,train_file)
    build_dataset(vaildfile, class_path, vocab_path, vaild_file)
    build_dataset(testfile, class_path, vocab_path, test_file)

    build_embed_pretrained(vocab_path,embedding_model_path,embedding_path)

run.py

from mymodel import myMLP,myCNN,myRNN
import mydataset
import numpy as np
import torch
from torch import nn,optim
from torch.utils.data import DataLoader

# import the corresponding config; embedding_pre=True means the pretrained word vectors are used
#config=myMLP.Config(embedding_pre=True)
config=myCNN.Config(embedding_pre=True)
# config=myRNN.Config(embedding_pre=True)

# collate function for the dataloaders: stack a batch of (text, label) pairs into LongTensors
def collate_fn(batch):
    text,label=list(zip(*batch))
    text=torch.LongTensor(text)
    label = torch.LongTensor(label)
    return text,label

# load the training, validation and test sets
vectorized_data=np.load(config.train_path,allow_pickle=True)
train_ds=mydataset.GetLoader(vectorized_data['text'],vectorized_data['label'])
train_dl=DataLoader(train_ds,batch_size=config.batch_size,shuffle=True,collate_fn=collate_fn)
vectorized_data=np.load(config.dev_path,allow_pickle=True)
valid_ds=mydataset.GetLoader(vectorized_data['text'],vectorized_data['label'])
valid_dl=DataLoader(valid_ds,batch_size=config.batch_size,shuffle=True,collate_fn=collate_fn)
vectorized_data=np.load(config.test_path,allow_pickle=True)
test_ds=mydataset.GetLoader(vectorized_data['text'],vectorized_data['label'])
test_dl=DataLoader(test_ds,batch_size=config.batch_size,shuffle=True,collate_fn=collate_fn)

# accuracy helper: returns (number of correct predictions, batch size)
def accuracys(pre,label):
    pre=torch.max(pre.data,1)[1]
    accuracy=pre.eq(label.data.view_as(pre)).sum()
    return accuracy,len(label)

# build the network
#model=myMLP.MLP(config).to(config.device)
model=myCNN.Model(config).to(config.device)
# model=myRNN.Model(config).to(config.device)

# training
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=config.learning_rate)
best_loss=float('inf')
for epoch in range(config.epochs):
    train_acc = []
    for batch_idx,(data,target)in enumerate(train_dl):
        model.train()
        out=model(data)
        loss=criterion(out,target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_acc.append(accuracys(out,target))
        train_r = (sum(tup[0] for tup in train_acc), sum(tup[1] for tup in train_acc))
        print('epoch: {}\t[{}/{}] {:.0f}%\tloss: {:.6f}\ttrain accuracy: {:.2f}%\t'.format(
            epoch, batch_idx, len(train_dl), 100. * batch_idx / len(train_dl), loss.data,
                   100. * train_r[0].numpy() / train_r[1]
        ))
        # run validation every 100 batches
        if batch_idx%100==0 and batch_idx!=0:
            model.eval()
            val_acc=[]
            loss_total=0
            with torch.no_grad():
                for (data,target) in valid_dl:
                    out=model(data)
                    loss_total = criterion(out, target).data+loss_total
                    val_acc.append(accuracys(out,target))
            val_r = (sum(tup[0] for tup in val_acc), sum(tup[1] for tup in val_acc))
            print('validation loss: {:.6f}\tvalidation accuracy: {:.2f}%\t'.format(loss_total/len(valid_dl),100. * val_r[0].numpy() / val_r[1]))
            # save the model whenever the validation loss improves
            if loss_total < best_loss:
                best_loss = loss_total
                torch.save(model.state_dict(), config.save_path)


# testing
model.load_state_dict(torch.load(config.save_path))
model.eval()
test_acc=[]
with torch.no_grad():
    for (data, target) in test_dl:
        out = model(data)
        test_acc.append(accuracys(out, target))
test_r = (sum(tup[0] for tup in test_acc), sum(tup[1] for tup in test_acc))
print('test accuracy: {:.2f}%\t'.format(100. * test_r[0].numpy() / test_r[1]))
