基础实践1

1. IMDB影评数据集

2. THUCNews数据集

本文采用了清华NLP组提供的THUCNews新闻文本分类数据集的一个子集(原始的数据集大约74万篇文档,训练起来需要花较长的时间)。
本次训练使用了其中的10个分类,每个分类6500条,总共65000条新闻数据。
类别:体育, 财经, 房产, 家居, 教育, 科技, 时尚, 时政, 游戏, 娱乐
数据集划分如下:
    训练集: 5000*10
    验证集: 500*10
    测试集: 1000*10
用pytorch实现分类代码:

import random
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.nn.init as init
import os

class Alphabet():
    """Bidirectional string<->id vocabulary.

    `id2string` lists entries in insertion order (index == id) and
    `string2id` maps each entry back to its index.  Both structures are
    maintained together by the alphabet-building helpers below.
    """
    def __init__(self):
        self.id2string = []   # id -> string
        self.string2id = {}   # string -> id

def read_corpus(path, name="train"):
    """Read a TSV corpus file: one example per line, ``label<TAB>text``.

    Args:
        path: path to a UTF-8 encoded corpus file.
        name: split name ("train"/"dev"/"test"); kept for interface
            compatibility, currently unused.

    Returns:
        (data, labels) where data is a list of character lists (each text
        split into individual characters) and labels is a list of label
        strings.
    """
    data = []
    labels = []
    with open(path, 'r', encoding='utf-8') as fin:
        for line in fin:
            # split only on the FIRST tab: text containing further tabs
            # would otherwise raise ValueError on unpacking
            label, text = line.split('\t', 1)
            labels.append(label)
            data.append(list(text.strip()))
    return data, labels

def createAlphabet(train_data, train_label):
    """Build word and label vocabularies from the training split.

    Id 0 in both vocabularies is reserved for the unknown token '-unk';
    words and labels get ids in first-occurrence order.

    Args:
        train_data: list of character lists, one per sentence.
        train_label: list of label strings.

    Returns:
        (word2id, id2word, id2label, label2id) — note the mixed ordering,
        kept for compatibility with existing callers.
    """
    word2id = {'-unk': 0}
    id2word = ['-unk']
    for sentence in train_data:
        for w in sentence:
            # dict membership is O(1); the old `w in id2string` list scan
            # made vocabulary construction quadratic
            if w not in word2id:
                word2id[w] = len(id2word)
                id2word.append(w)

    label2id = {'-unk': 0}
    id2label = ['-unk']
    # a single pass suffices: the old nested loop re-scanned the whole
    # label list len(train_label) times with identical results
    for lab in train_label:
        if lab not in label2id:
            label2id[lab] = len(id2label)
            id2label.append(lab)
    return word2id, id2word, id2label, label2id


def addAlphabet(data, label, word2id, id2word, id2label, label2id):
    """Extend existing vocabularies in place with unseen words/labels.

    Args:
        data: list of character lists.
        label: list of labels (stringified before insertion).
        word2id/id2word: word vocabulary (mutated in place).
        id2label/label2id: label vocabulary (mutated in place).

    Returns:
        (word2id, id2word, id2label, label2id), the same (mutated) objects.
    """
    for sentence in data:
        for w in sentence:
            # dict lookup instead of the original O(n) list scan
            if w not in word2id:
                word2id[w] = len(id2word)
                id2word.append(w)
    for lab in label:
        lab = str(lab)
        if lab not in label2id:
            label2id[lab] = len(id2label)
            id2label.append(lab)
    # bug fix: the original `return` sat inside the label loop, so an
    # empty `label` list made the function fall through and return None
    return word2id, id2word, id2label, label2id


def create_data_paded(data, string2id, id2string):
    """Map each sentence's characters to ids and pad to equal length.

    Unknown characters map to the '-unk' id.  Delegates the padding (and
    possible registration of '<pad>' in the vocabulary) to `pad`.

    Args:
        data: list of character lists.
        string2id/id2string: word vocabulary.

    Returns:
        (data_index, string2id, id2string) with all rows of equal length.
    """
    unk = string2id['-unk']
    # dict .get() replaces the original O(vocab) `w not in id2string`
    # list scan per character
    data_index = [[string2id.get(w, unk) for w in sentence]
                  for sentence in data]
    data_index, string2id, id2string = pad(data_index, string2id, id2string)
    return data_index, string2id, id2string

def create_label_paded(data, string2id, id2string):
    """Map label strings to ids; unknown labels map to '-unk' (id 0).

    Args:
        data: list of label strings.
        string2id/id2string: label vocabulary (returned unchanged).

    Returns:
        (labels_index, string2id, id2string).
    """
    unk = string2id['-unk']
    # dict .get() replaces the original O(n) `not in id2string` list scan
    labels_index = [string2id.get(lab, unk) for lab in data]
    return labels_index, string2id, id2string

def seek_max_len(data_index):
    """Return the length of the longest sentence (0 for empty input)."""
    return max((len(sen) for sen in data_index), default=0)

def pad(data_index, string2id, id2string, pad_token='<pad>'):
    """Right-pad every sentence in place to the longest sentence's length.

    `pad_token` is registered in the vocabulary only if padding is
    actually needed, matching the original behaviour.

    Args:
        data_index: list of id lists (mutated in place).
        string2id/id2string: vocabulary (possibly extended with pad_token).
        pad_token: token used for padding.

    Returns:
        (data_index, string2id, id2string).
    """
    max_len = seek_max_len(data_index)
    if any(len(sen) < max_len for sen in data_index):
        # register the pad token once, up front, instead of re-checking
        # inside the loop with an O(vocab) list scan
        if pad_token not in string2id:
            string2id[pad_token] = len(id2string)
            id2string.append(pad_token)
        pad_id = string2id[pad_token]
        for sen in data_index:
            sen.extend([pad_id] * (max_len - len(sen)))
    return data_index, string2id, id2string


def create_batches(data_var, label_var, batch_size):
    """Split data/label tensors into mini-batches.

    Bug fix: the original computed ``batch_num = n // batch_size + 1``,
    which produced a trailing *empty* batch whenever n was an exact
    multiple of batch_size (the empty tensor would then crash the model);
    ceiling division yields exactly the needed number of batches.

    Args:
        data_var: 2-D LongTensor/Variable (num_examples, seq_len).
        label_var: 1-D LongTensor/Variable (num_examples,).
        batch_size: examples per batch; the last batch may be smaller.

    Returns:
        (data_iter, label_iter): parallel lists of LongTensor Variables.
    """
    data_iter = []
    label_iter = []
    n = data_var.size(0)
    batch_num = (n + batch_size - 1) // batch_size
    for i in range(batch_num):
        rows = []
        for j in range(batch_size):
            pos = i * batch_size + j
            if pos < n:
                rows.append(data_var.data[pos].tolist())
        data_iter.append(Variable(torch.LongTensor(rows)))

    for i in range(batch_num):
        rows = []
        for j in range(batch_size):
            pos = i * batch_size + j
            if pos < len(label_var):
                # int() keeps this robust whether indexing yields a Python
                # scalar (old torch) or a 0-d tensor (modern torch)
                rows.append(int(label_var.data[pos]))
        label_iter.append(Variable(torch.LongTensor(rows)))
    return data_iter, label_iter


class HyperParams:
    """Bag of configuration values for training the CNN classifier."""

    def __init__(self):
        # optimisation
        self.lr = 0.001
        self.batch_size = 64
        self.epochs = 100

        # vocabulary sizes, filled in after the alphabets are built
        self.word_num = 0
        self.label_num = 0

        # model architecture
        self.embed_dim = 100
        self.static = False
        self.Ci = 1                   # input channels of the first conv
        self.dropout = 0.5
        self.kernel_num = 100         # feature maps per kernel size
        self.kernel_sizes = '3,4,5'   # parsed into [3, 4, 5] in main
        self.hidden_size = 200
        self.cuda = True

        # vocabularies (word- and label-level)
        self.wordAlpha = Alphabet()
        self.labelAlpha = Alphabet()

        # dataset locations (label<TAB>text format)
        self.train_path = "../input/cnews.train.edit.txt"
        self.dev_path = "../input/cnews.val.edit.txt"
        self.test_path = "../input/cnews.test.edit.txt"


class CNN(nn.Module):
    """TextCNN classifier: embedding -> parallel Conv2d branches (one per
    kernel size) -> BatchNorm + ReLU -> max-over-time pooling -> dropout
    -> linear output layer.

    Expects `params` to provide: word_num, label_num, embed_dim, Ci,
    kernel_num, kernel_sizes (list of ints), dropout, cuda.
    """

    def __init__(self, params):
        super(CNN, self).__init__()
        self.params = params
        self.word_num = params.word_num
        self.label_num = params.label_num
        self.embed_dim = params.embed_dim
        self.embedding = nn.Embedding(self.word_num, self.embed_dim)
        # bug fix: the convs were kept in a plain Python list, so their
        # parameters were never registered with the module —
        # model.parameters() (and hence the optimizer) never saw them.
        # nn.ModuleList registers them, and model.cuda() in main now also
        # moves them to the GPU automatically.
        self.convs1 = nn.ModuleList(
            nn.Conv2d(params.Ci, params.kernel_num, (K, params.embed_dim))
            for K in params.kernel_sizes)
        for conv in self.convs1:
            # xavier_normal_ is the in-place, non-deprecated spelling
            init.xavier_normal_(conv.weight, gain=np.sqrt(2.0))
        self.dropout = nn.Dropout(params.dropout)
        self.fc1 = nn.Linear(len(params.kernel_sizes) * params.kernel_num, params.label_num)
        self.bn = nn.BatchNorm2d(1)

    def forward(self, x):
        """x: LongTensor (batch, seq_len) -> logits (batch, label_num)."""
        x = self.embedding(x)      # (N, L, D)
        x = x.unsqueeze(1)         # (N, 1, L, D) — single input channel
        pooled = []
        for conv in self.convs1:
            xx = conv(x)           # (N, Co, L-K+1, 1)
            # bug fix: the original wrapped `.data` in a fresh Variable
            # here, which detached the autograd graph and stopped
            # gradients from ever reaching the conv weights; transpose
            # the tensor directly instead
            xx = xx.transpose(2, 3).transpose(1, 2)   # (N, 1, Co, L-K+1)
            xx = self.bn(xx)
            xx = F.relu(xx)
            xx = xx.squeeze(1)     # (N, Co, L-K+1)
            # max-over-time pooling down to one value per feature map
            pooled.append(F.max_pool1d(xx, xx.size(2)).squeeze(2))  # (N, Co)
        x = torch.cat(pooled, 1)   # (N, len(kernel_sizes) * Co)
        x = self.dropout(x)
        logit = self.fc1(x)
        return logit



def argmax_index(var):
    """Return the row-wise argmax of a 2-D tensor/Variable as a list.

    For ties the index of the first maximal value is returned, matching
    the original strict `<` comparison loop.

    Args:
        var: 2-D tensor/Variable of scores (batch, num_classes).

    Returns:
        list of ints, one column index per row.
    """
    # tensor.max(dim) does the whole argmax in C instead of a nested
    # Python loop over every element
    return var.data.max(1)[1].tolist()

def train(train_iter, dev_iter, test_iter, model, params):
    """Run the training loop.

    After every epoch the model is evaluated on the dev set; whenever the
    dev accuracy improves, the test set is evaluated and the result kept.

    Args:
        train_iter/dev_iter/test_iter: (data_batches, label_batches)
            pairs as produced by create_batches.
        model: the CNN classifier.
        params: HyperParams instance (uses .lr, .epochs, .cuda).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    best_dev_res = 0.0
    test_res = 0.0
    for epoch in range(1, params.epochs + 1):
        # bug fix: explicitly enter training mode every epoch — a prior
        # eval() call could otherwise leave dropout/batch-norm in
        # inference mode during training
        model.train()
        data_iter, label_iter = train_iter
        correct = 0.0
        gold_t = 0.0
        for index in range(len(data_iter)):
            data_cur = data_iter[index]
            label_cur = label_iter[index]
            if params.cuda:
                data_cur = data_cur.cuda()
                label_cur = label_cur.cuda()
            optimizer.zero_grad()
            logit = model(data_cur)
            loss = F.cross_entropy(logit, label_cur)
            loss.backward()
            optimizer.step()

            # running training accuracy
            gold = argmax_index(logit)
            gold_t += len(gold)
            label_list = label_cur.data.tolist()  # hoisted: was re-built per element
            for i in range(len(gold)):
                if gold[i] == label_list[i]:
                    correct += 1
            if index % 100 == 0:
                print('\rbatch ' + str(index) + ' is finished!')

        acc = correct / gold_t * 100
        print('train step: ' + str(epoch) + ' , acc: ' + str(acc) + '%')
        dev_res = eval(dev_iter, model, params)
        print('dev datasets:' + ' acc ' + str(dev_res) + '%')
        if dev_res > best_dev_res:
            best_dev_res = dev_res
            print('\nTesting...')
            test_res = eval(test_iter, model, params)
            print('test datasets:' + ' acc ' + str(test_res) + '%')
        print('now, last test datasets:' + ' acc ' + str(test_res) + '%\n')

def eval(iter, model, params):
    """Compute the accuracy (%) of `model` over a batched dataset.

    NOTE: the name shadows the builtin `eval` (and `iter`); both are kept
    for compatibility with existing callers.

    Args:
        iter: (data_batches, label_batches) pair from create_batches.
        model: the classifier.
        params: HyperParams instance (uses .cuda).

    Returns:
        float accuracy percentage.
    """
    data_iter, label_iter = iter
    # bug fix: switch to inference mode so dropout is disabled and
    # batch-norm uses its running statistics during evaluation
    model.eval()
    num = 0
    correct = 0.0
    for index in range(len(data_iter)):
        data_cur = data_iter[index]
        label_cur = label_iter[index]
        if params.cuda:
            data_cur = data_cur.cuda()
            label_cur = label_cur.cuda()
        logit = model(data_cur)
        gold = argmax_index(logit)
        label_list = label_cur.data.tolist()  # hoisted out of the inner loop
        for i in range(len(gold)):
            if gold[i] == label_list[i]:
                correct += 1
            num += 1
    model.train()  # restore training mode for the caller
    avg_acc = correct / num * 100
    return avg_acc


# Entry point: build vocabularies, vectorise the datasets, then train.
if __name__ == "__main__":
    params = HyperParams()
    # fix RNG seeds for reproducibility
    torch.manual_seed(233)
    random.seed(233)
    # each split: (list of per-sentence character lists, list of label strings)
    train_data, train_label = read_corpus(params.train_path, "train")
    dev_data, dev_label = read_corpus(params.dev_path, "dev")
    test_data, test_label = read_corpus(params.test_path, "test")

    # vocabularies are built from the training split only
    word2id, id2word, id2label, label2id = createAlphabet(train_data, train_label)

    # map characters/labels to ids and pad each split to its own max length
    train_data_index, word2id, id2word = create_data_paded(train_data, word2id, id2word)
    train_data_var = Variable(torch.LongTensor(train_data_index))
    train_label_index, label2id, id2label = create_label_paded(train_label, label2id, id2label)
    train_label_var = Variable(torch.LongTensor(train_label_index))

    # vocabulary sizes feed the embedding and output layer dimensions
    params.word_num = len(id2word)
    params.label_num = len(id2label)

    # NOTE(review): word_num is fixed above, but pad() may still append
    # '<pad>' to the vocabulary while processing dev/test below if no train
    # sentence needed padding — that id would fall outside the embedding
    # table. Confirm the train split always triggers padding.
    dev_data_index, word2id, id2word = create_data_paded(dev_data, word2id, id2word)
    dev_data_var = Variable(torch.LongTensor(dev_data_index))
    dev_label_index, label2id, id2label = create_label_paded(dev_label, label2id, id2label)
    dev_label_var = Variable(torch.LongTensor(dev_label_index))

    test_data_index, word2id, id2word = create_data_paded(test_data, word2id, id2word)
    test_data_var = Variable(torch.LongTensor(test_data_index))
    test_label_index, label2id, id2label = create_label_paded(test_label, label2id, id2label)
    test_label_var = Variable(torch.LongTensor(test_label_index))

    # group the tensors into fixed-size mini-batches
    train_iter = create_batches(train_data_var, train_label_var, params.batch_size)
    dev_iter = create_batches(dev_data_var, dev_label_var, params.batch_size)
    test_iter = create_batches(test_data_var, test_label_var, params.batch_size)

    # parse '3,4,5' into [3, 4, 5] before the CNN builds its conv layers
    params.kernel_sizes = [int(k) for k in params.kernel_sizes.split(',')]
    cnn_model = CNN(params)

    if params.cuda:
        cnn_model = cnn_model.cuda()
    train(train_iter, dev_iter, test_iter, cnn_model, params)

3. 学习召回率、准确率、ROC曲线、AUC、PR曲线这些基本概念

1. ROC和PR曲线
在机器学习中,ROC(Receiver Operator Characteristic)曲线被广泛应用于二分类问题中来评估分类器的可信度,但是当处理一些高度不均衡的数据集时,PR曲线能表现出更多的信息,发现更多的问题。
1.ROC曲线和PR曲线是如何画出来的?
在二分类问题中,分类器将一个实例的分类标记为是或否,这可以用一个混淆矩阵来表示。
其中,列对应于实例实际所属的类别,行表示分类预测的类别。
  • TP(True Positive):指正确分类的正样本数,即预测为正样本,实际也是正样本。
  • FP(False Positive):指被错误的标记为正样本的负样本数,即实际为负样本而被预测为正样本,所以是False。
  • TN(True Negative):指正确分类的负样本数,即预测为负样本,实际也是负样本。
  • FN(False Negative):指被错误的标记为负样本的正样本数,即实际为正样本而被预测为负样本,所以是False。
  • TP+FP+TN+FN:样本总数。
  • TP+FN:实际正样本数。
  • TP+FP:预测结果为正样本的总数,包括预测正确的和错误的。
  • FP+TN:实际负样本数。
  • TN+FN:预测结果为负样本的总数,包括预测正确的和错误的。
在ROC曲线中,以FPR为x轴,TPR为y轴。FPR指实际负样本中被错误预测为正样本的概率。TPR指实际正样本中被预测正确的概率。
 在PR曲线中,以Recall(翻译为召回率或查全率)为x轴,Precision为y轴。Recall与TPR的意思相同,而Precision指被预测为正样本的样本中实际为正样本的比例,即TP/(TP+FP)。
 绘制ROC曲线和PR曲线都是选定不同阈值,从而得到不同的x轴和y轴的值,画出曲线。
2.ROC曲线和PR曲线的关系
在ROC空间,ROC曲线越凸向左上方向效果越好。与ROC曲线左上凸不同的是,PR曲线是右上凸效果越好。
ROC和PR曲线都被用于评估机器学习算法对一个给定数据集的分类性能,每个数据集都包含固定数目的正样本和负样本。而ROC曲线和PR曲线之间有着很深的关系。
定理1:对于一个给定的包含正负样本的数据集,ROC空间和PR空间存在一一对应的关系,也就是说,如果recall不等于0,二者包含完全一致的混淆矩阵。我们可以将ROC曲线转化为PR曲线,反之亦然。
定理2:对于一个给定数目的正负样本数据集,一条曲线在ROC空间中比另一条曲线有优势,当且仅当第一条曲线在PR空间中也比第二条曲线有优势。(这里的“一条曲线比其他曲线有优势”是指其他曲线的所有部分与这条曲线重合或在这条曲线之下。)
 3.AUC
AUC(Area Under Curve)即指曲线下面积占总方格的比例。有时不同分类算法的ROC曲线存在交叉,因此很多时候用AUC值作为算法好坏的评判标准。面积越大,表示分类性能越好。
posted @ 2019-06-21 21:12  Joyce_song94  阅读(224)  评论(0)    收藏  举报