NLP (6): A BiLSTM_Attention Implementation

1. File structure
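The project layout (shown as an image in the original post), roughly reconstructed from the imports and file paths used in the sections below; the exact file names are assumptions:

model/
    attention.py          # BiLSTM_Attention (section 2)
    word_2_vector.py      # WordEmbedding (section 3)
pre_process/
    data_generator.py     # builds datas/format/primary.json (section 4)
    data_collect.py       # builds all_text.csv / labeled.csv (section 5)
    dataset_iter.py       # Dataset wrapper (section 6)
datas/
    primary/              # raw dialogue files
    format/               # primary.json, all_text.csv
    annotate/             # label.json, labeled.json, unlabel.json, labeled.csv
checkpoints/
    word2vec.bin          # pre-trained 128-dimensional word2vec vectors
wrapper.py                # training entry point (section 7)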

 

2. attention

from torch import nn
import torch
import torch.nn.functional as F


class BiLSTM_Attention(nn.Module):
    def __init__(self,embedding_dim, num_hiddens, num_layers):
        super(BiLSTM_Attention, self).__init__()
        # bidirectional=True makes this a bidirectional LSTM
        self.encoder = nn.LSTM(input_size=embedding_dim,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               batch_first=True,
                               bidirectional=True)
        # Attention parameters; the attention-weighted sum over all time steps
        # is fed into the fully connected decoder layer.
        self.w_omega = nn.Parameter(torch.Tensor(
            num_hiddens * 2, num_hiddens * 2))
        self.u_omega = nn.Parameter(torch.Tensor(num_hiddens * 2, 1))
        self.decoder = nn.Linear(2 * num_hiddens, 4)  # 4 output classes (hard-coded)
        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)

    def forward(self, embeddings):
        # nn.LSTM returns the hidden states of the last layer at every time step.
        # embeddings shape: (batch_size, seq_len, embedding_size)
        outputs, _ = self.encoder(embeddings)  # output, (h, c)
        # outputs shape: (batch_size, seq_len, 2 * num_hiddens)

        # Attention
        u = torch.tanh(torch.matmul(outputs, self.w_omega))
        # u shape: (batch_size, seq_len, 2 * num_hiddens)
        att = torch.matmul(u, self.u_omega)
        # att shape: (batch_size, seq_len, 1)
        att_score = F.softmax(att, dim=1)
        # att_score shape is still (batch_size, seq_len, 1)
        scored_x = outputs * att_score
        # scored_x shape: (batch_size, seq_len, 2 * num_hiddens)
        # end of attention

        feat = torch.sum(scored_x, dim=1)  # weighted sum over time steps
        # feat shape: (batch_size, 2 * num_hiddens)
        outs = self.decoder(feat)
        # outs shape: (batch_size, 4)
        return outs
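A quick shape check for the model above (a minimal sketch; the 128/64/4 sizes match what the wrapper in section 7 uses):

if __name__ == '__main__':
    net = BiLSTM_Attention(embedding_dim=128, num_hiddens=64, num_layers=4)
    dummy = torch.randn(2, 10, 128)   # (batch_size, seq_len, embedding_size)
    print(net(dummy).shape)           # torch.Size([2, 4])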

3. word_2_vec

import jieba
import torch
import gensim
import numpy as np


class WordEmbedding(object):
    def __init__(self):
        # Pre-trained 128-dimensional Chinese word2vec vectors (binary format).
        self.model = gensim.models.KeyedVectors.load_word2vec_format('checkpoints/word2vec.bin', binary=True)

    def sentenceTupleToEmbedding(self, data1):
        # Pad every sentence in the batch to the length of the longest one (in jieba tokens).
        maxLen = max([len(list(jieba.cut(sentence_a))) for sentence_a in data1])
        seq_len = maxLen
        a = self.sqence_vec(data1, seq_len)  # (batch_size, seq_len, embedding_dim)
        return torch.FloatTensor(a)

    def sqence_vec(self, data, seq_len):
        data_a_vec = []
        for sequence_a in data:
            sequence_vec = []  # seq_len * 128
            for word_a in jieba.cut(sequence_a):
                # Out-of-vocabulary words are simply dropped. If every word of a
                # sentence is OOV, the vstack below raises, which the training
                # loop in the wrapper catches and skips.
                if word_a in self.model:
                    sequence_vec.append(self.model[word_a])
            sequence_vec = np.array(sequence_vec)
            # Zero-pad up to seq_len rows; 128 must match the word2vec dimension.
            add = np.zeros((seq_len - sequence_vec.shape[0], 128))
            sequenceVec = np.vstack((sequence_vec, add))
            data_a_vec.append(sequenceVec)
        a_vec = np.array(data_a_vec)
        return a_vec

if __name__ == '__main__':
    word = WordEmbedding()
    a = word.sentenceTupleToEmbedding(["我爱北京天安门"])
    print(a)
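sentenceTupleToEmbedding takes the batch of raw sentences yielded by the DataLoader and returns a FloatTensor of shape (batch_size, seq_len, 128), zero-padded to the longest sentence in the batch. If every word of a sentence is missing from the word2vec vocabulary, the np.vstack call raises, which is why the training and evaluation loops in section 7 wrap this call in try/except.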

4. data_generator

# -*- coding: UTF-8 -*-
from tqdm import tqdm
import os, sys
import json

"""Build the combined JSON file from the raw dialogue files."""
def get_all_path():
    """
    Collect (file name, path) pairs for every raw dialogue file.
    """
    #parent_path = os.path.dirname(os.path.realpath(__file__))
    parent_path = os.path.split(os.path.realpath(__file__))[0]
    root = parent_path[:parent_path.find("pre_proces")]
    f_in = os.path.join(root, "datas", "primary")
    primiry_file_paths = []
    dirs = os.listdir(f_in)
    # every file and folder under datas/primary
    for fileName in dirs:
        path = os.path.join(f_in, fileName)
        primiry_file_paths.append([fileName, path])
    return primiry_file_paths

def organize_data(file_name, file_path):
    """
    Organize the dialogue in one file into a dict.
    """
    item = {}
    item["file_name"] = file_name

    with open(file_path, "r", encoding="utf8") as f:
        kefu = 0
        kehu = 0
        item_object = {}
        for line in f.readlines():
            line_data = line.split(":")
            if "" in line_data[0]:
                # This line was spoken by the bank agent. NOTE: the agent keyword
                # seems to have been dropped from the original post; an empty string
                # is contained in every string, so as written this branch always matches.
                kefu = kefu + 1
                key = "bank_" + str(kefu)
                item_object[key] = line_data[1].replace("\n", "")
            else:
                # otherwise the line was spoken by the customer
                kehu = kehu + 1
                key = "user_" + str(kehu)
                item_object[key] = line_data[1].replace("\n", "")
    item["datas"] = item_object
    return item

def main():
    file_out = "../datas/format/primary.json"
    with open(file_out, "w", encoding="utf-8") as f:
        for path in tqdm(get_all_path()):
            file_name, file_path = path[0], path[1]
            item = organize_data(file_name, file_path)
            text = json.dumps(item, ensure_ascii=False) + "\n"
            f.write(text)

if __name__ == '__main__':
    main()
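Each line of primary.json is one standalone JSON object per dialogue file, roughly of this shape (the contents below are made up for illustration):

{"file_name": "0001.txt", "datas": {"bank_1": "您好，请问有什么可以帮您？", "user_1": "我想咨询一下房贷。"}}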

5. data_collect

# -*- coding: UTF-8 -*-
import json
import pandas as pd

"""Collect every utterance into a flat CSV."""
def get_all_text():
    file_path = "../datas/format/primary.json"
    names = []
    roles = []
    texts = []
    with open(file_path, "r", encoding="utf8") as f:
        for data_line in f.readlines():
            json_data = json.loads(data_line)
            file_name = json_data["file_name"]
            file_data = json_data["datas"]
            for k,v in file_data.items():
                names.append(file_name)
                roles.append(k)
                texts.append(v)
    file_out = "../datas/format/all_text.csv"
    dataframe = pd.DataFrame({'names': names, 'roles': roles, "texts": texts})
    dataframe.to_csv(file_out, index=False, sep='\t')

"""Search the CSV for rows whose text contains the given keyword."""
def search_text(key):
    file_out = "../datas/classes/" + key + ".csv"
    file_path = "../datas/format/all_text.csv"
    data = pd.read_csv(file_path, sep="\t")
    da = data[data["texts"].str.contains(key)]
    da.to_csv(file_out, index=False, sep='\t')

"""Extract dialogues that contain the character "婚" (marriage)."""
def data_annotate():
    file_in = "../datas/format/primary.json"
    file_out = "../datas/annotate/label.json"
    with open(file_out, "w", encoding="utf8") as fo:
        with open(file_in, "r", encoding="utf8") as f:
            for line in f.readlines():
                item = {}
                label = 0
                json_data = json.loads(line)
                for k,v in json_data["datas"].items():
                    if "婚" in v:  # keyword restored from the docstring; it appears to have been dropped in the original post
                        label = 1
                if label == 1:
                    item["name"] = json_data["file_name"]
                    item["label"] = ""  # left empty; filled in by hand during annotation
                    item["datas"] = json_data["datas"]
                    fo.write(json.dumps(item, ensure_ascii=False) + "\n")
    return "success"

"""Split manually annotated records from the still-unlabeled ones."""
def annotate():
    file_in = "../datas/annotate/label.json"
    file_labeled = "../datas/annotate/labeled.json"
    file_unlabeled = "../datas/annotate/unlabel.json"
    with open(file_in, "r", encoding="utf8") as f_in:
        with open(file_labeled, "w", encoding="utf8") as f_labeled:
            with open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
                for line in f_in.readlines():
                    json_data = json.loads(line)
                    if json_data["label"]:
                        f_labeled.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                    else:
                        f_unlabeled.write(json.dumps(json_data, ensure_ascii=False) + "\n")
    return "success"

def label_to_csv():
    file_path = "../datas/annotate/labeled.json"
    labels = []
    datas = []
    data_dict = []
    with open(file_path, "r", encoding="utf8") as f:
        for data_line in f.readlines():
            json_data = json.loads(data_line)
            _label = json_data["label"]
            _data = "|".join(json_data["datas"].values())
            labels.append(_label)
            datas.append(_data)
            data_dict.append(data_line.replace("\n", ""))
    file_out = "../datas/annotate/labeled.csv"
    dataframe = pd.DataFrame({'labels': labels, 'datas': datas, "data_dict": data_dict})
    dataframe.to_csv(file_out, index=False, sep='\t')

if __name__ == '__main__':
    label_to_csv()
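labeled.csv is what the wrapper in section 7 trains on: the labels column becomes the CrossEntropyLoss target and the datas column (all turns of one dialogue joined with "|") is the input text. Since the decoder in section 2 has 4 outputs, the hand-filled label field is presumably an integer class id from 0 to 3.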

6. dataset_iter

import torch.utils.data as data
import torch

class DatasetIter(data.Dataset):
    def __init__(self, text, label):
        self.text = text
        self.label = label
    def __getitem__(self, item):
        text = self.text[item]
        label = self.label[item]
        return text, label
    def __len__(self):
        return len(self.text)
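A minimal usage sketch (the sentences are made up): the default DataLoader collate leaves strings as they are, so each batch arrives as a list of raw sentences plus a tensor of labels, which is exactly what WordEmbedding.sentenceTupleToEmbedding in section 3 expects.

if __name__ == '__main__':
    dataset = DatasetIter(["我想咨询房贷", "请问信用卡怎么办理"], [0, 1])
    loader = data.DataLoader(dataset, batch_size=2)
    for texts, labels in loader:
        print(texts)   # ['我想咨询房贷', '请问信用卡怎么办理']
        print(labels)  # tensor([0, 1])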

7. wrapper

from model.attention import BiLSTM_Attention
import torch
import torch.nn as nn
import pandas as pd
from pre_process.dataset_iter import DatasetIter
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from model.word_2_vector import WordEmbedding
import numpy as np

word = WordEmbedding()
class MainProcess(object):
    def __init__(self):
        self.lr = 0.001
        data_frame = pd.read_csv("datas/annotate/labeled.csv", sep="\t")
        label = data_frame["labels"]
        text = data_frame["datas"]
        data_set = DatasetIter(text, label)
        dataSet_length = len(data_set)
        train_size = int(0.9 * dataSet_length)
        train_set = Subset(data_set, range(train_size))
        test_set = Subset(data_set, range(train_size, dataSet_length))
        self.train_iter = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
        self.test_iter = DataLoader(dataset=test_set, batch_size=32, shuffle=True)
        embedding_dim = 128
        num_hiddens = 64
        num_layers = 4
        self.num_epochs = 100
        self.net = BiLSTM_Attention(embedding_dim, num_hiddens, num_layers)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr)

    def binary_acc(self, preds, y):
        # Despite the name, this is plain multi-class accuracy (argmax over the logits).
        pred = torch.argmax(preds, dim=1)
        correct = torch.eq(pred, y).float()
        acc = correct.sum() / len(correct)
        return acc

    def train(self, mynet, train_iter, optimizer, criterion, epoch):
        avg_acc = []
        avg_loss = []
        mynet.train()
        for batch_id, (datas, label) in enumerate(train_iter):
            try:
                # Convert the batch of raw sentences to padded word2vec embeddings;
                # skip the batch if embedding fails (e.g. every word is out of vocabulary).
                X = word.sentenceTupleToEmbedding(datas)
            except Exception as e:
                continue
            y_hat = mynet(X)
            loss = criterion(y_hat, label)
            acc = self.binary_acc(y_hat, label).item()
            avg_acc.append(acc)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if batch_id % 100 == 0:
                print("epoch:", epoch, "batch:", batch_id, "train loss:", loss.item())
            avg_loss.append(loss.item())
        avg_acc = np.array(avg_acc).mean()
        avg_loss = np.array(avg_loss).mean()
        print('train acc:', avg_acc)
        print("train loss", avg_loss)

    def eval(self, mynet, test_iter, criterion):
        mynet.eval()
        avg_acc = []
        avg_loss = []
        with torch.no_grad():
            for batch_id, (datas, label) in enumerate(test_iter):
                try:
                    X = word.sentenceTupleToEmbedding(datas)
                except Exception as e:
                    continue
                y_hat = mynet(X)
                loss = criterion(y_hat, label)
                acc = self.binary_acc(y_hat, label).item()
                avg_acc.append(acc)
                avg_loss.append(loss.item())
        avg_acc = np.array(avg_acc).mean()
        avg_loss = np.array(avg_loss).mean()
        print('>>test acc:', avg_acc)
        print(">>test loss:", avg_loss)
        return (avg_acc, avg_loss)

    def main(self):
        min_loss = 100000
        # NOTE: self.num_epochs (=100) set in __init__ is not used here; the loop runs 50 epochs.
        for epoch in range(50):
            self.train(self.net, self.train_iter, self.optimizer, self.criterion, epoch)
            eval_acc, eval_loss = self.eval(self.net, self.test_iter, self.criterion)
            if eval_loss < min_loss:
                min_loss = eval_loss
                print("save model")
                torch.save(self.net.state_dict(), 'model.pth')

if __name__ == '__main__':
    MainProcess().main()
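A minimal inference sketch, assuming model.pth was produced by main() above and using the same sizes as in __init__ (the sample sentence is made up):

from model.attention import BiLSTM_Attention
from model.word_2_vector import WordEmbedding
import torch

word = WordEmbedding()
net = BiLSTM_Attention(embedding_dim=128, num_hiddens=64, num_layers=4)
net.load_state_dict(torch.load('model.pth'))
net.eval()
with torch.no_grad():
    X = word.sentenceTupleToEmbedding(["您好，我想咨询一下房贷的办理流程"])
    print(torch.argmax(net(X), dim=1).item())   # predicted class id (0-3)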

 

posted @ 2021-04-20 09:38  jasonzhangxianrong