NLP-聊天机器人输入输出原理

NLP-聊天机器人输入输出原理

后续会补充聊天机器人所使用的 transformer 更深入的内部原理。

import transformers
import torch
import os
import json
import random
import numpy as np
import argparse
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from tqdm import tqdm
from torch.nn import DataParallel
import logging
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, GPT2Config
from transformers import BertTokenizerFast
# from transformers import BertTokenizer
from os.path import join, exists
from itertools import zip_longest, chain
# from chatbot.model import DialogueGPT2Model
from dataset import MyDataset
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

# from util.speechgeneratebasebaidu import BaiduapiAudio
# from util.speechrecogn_spc_ado_txt import Speech2TextBaiduApi,APP_ID, API_KEY, SECRET_KEY
from aip import AipSpeech

# Padding token string and padding id.
# NOTE(review): neither constant is referenced by the code visible in this file;
# padding_input hard-codes 0 instead. Presumably 0 is the vocabulary id of '[PAD]' - confirm.
PAD = '[PAD]'
pad_id = 0


def set_args():
    """
    Build the command-line parser for the interactive chatbot and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the device, sampling (temperature / top-k / top-p /
        repetition penalty), path and history-length settings.
    """
    # (flag, add_argument keyword arguments), registered in declaration order.
    option_table = [
        ('--device', dict(default='0', type=str, required=False, help='生成设备')),
        ('--temperature', dict(default=1, type=float, required=False, help='生成的temperature')),
        ('--topk', dict(default=8, type=int, required=False, help='最高k选1')),
        ('--topp', dict(default=0, type=float, required=False, help='最高积累概率')),
        ('--model_config', dict(default='model_epoch40_50w/config.json', type=str, required=False,
                                help='模型参数')),
        ('--log_path', dict(default='data/interact.log', type=str, required=False,
                            help='interact日志存放位置')),
        ('--vocab_path', dict(default='vocab/vocab.txt', type=str, required=False, help='选择词库')),
        ('--model_path', dict(default='model/epoch40', type=str, required=False, help='对话模型路径')),
        ('--save_samples_path', dict(default="sample/", type=str, required=False,
                                     help="保存聊天记录的文件路径")),
        ('--repetition_penalty', dict(default=1.0, type=float, required=False,
                                      help="重复惩罚参数,若生成的对话重复性较高,可适当提高该参数")),
        ('--max_len', dict(type=int, default=25, help='每个utterance的最大长度,超过指定长度则进行截断')),
        ('--max_history_len', dict(type=int, default=3, help="dialogue history的最大长度")),
        ('--no_cuda', dict(action='store_true', help='不使用GPU进行预测')),
    ]

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()


def create_logger(args):
    """
    Create the module logger that writes to both a log file and the console.

    Calling this function again reconfigures the same logger instead of stacking
    additional handlers on it, so each message is emitted once per destination.

    Args:
        args: object exposing ``log_path``, the path of the log file. Missing
            parent directories are created.

    Returns:
        logging.Logger named after this module, at INFO level, with exactly one
        file handler and one console handler attached.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # getLogger returns the same object for the same name; drop handlers left by
    # an earlier call, otherwise every record would be written once per call.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')

    # FileHandler does not create directories, so make sure the target one exists.
    log_dir = os.path.dirname(args.log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Handler that writes records to the log file.
    file_handler = logging.FileHandler(
        filename=args.log_path)
    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    # Handler that echoes records to the console. Its DEBUG threshold is looser
    # than the logger's INFO level, so the logger level is what filters here.
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(formatter)
    logger.addHandler(console)

    return logger


def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocab size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        # torch.topk()返回最后一维最大的top_k个元素,返回值为二维(values,indices)
        # ...表示其他维度由计算机自行推断
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value  # 对于topk之外的其他元素的logits值设为负无穷

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)  # 对logits进行递减排序
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits


def contentinactor(content, actor):
    """
    Report whether ``content`` occurs inside any of the command phrases in ``actor``.

    The match is a substring test in the direction "content is contained in a
    phrase", so a partial utterance such as a phrase without its trailing
    punctuation still matches.

    Args:
        content: recognised or typed text.
        actor: iterable of command phrases.

    Returns:
        True if ``content`` is non-empty and is a substring of at least one phrase,
        otherwise False.
    """
    # An empty string is a substring of every phrase; without this guard empty
    # input would trigger every command.
    if not content:
        return False
    return any(content in phrase for phrase in actor)

def change_speech(content, obj):
    """
    Switch the voice stored on ``obj`` when ``content`` matches a voice command.

    Args:
        content: text checked against the command phrases via contentinactor.
        obj: object whose ``PER`` attribute is overwritten on a match
            (presumably the speech-synthesis client's speaker id - confirm).

    The command groups are checked in order and are not mutually exclusive;
    if several match, the last matching group determines ``PER``.
    """
    voice_commands = (
        (["换男声。", "请换男声。"], 1),
        (["换一个美女。", "换女声。"], 5),
        (["换用小孩声音。"], 111),
    )
    for phrases, voice_id in voice_commands:
        if contentinactor(content, phrases):
            obj.PER = voice_id

def change_rol(content, args, device, model):
    """
    Swap the active model when ``content`` matches a role-switch command.

    Args:
        content: text checked against the command phrases via contentinactor.
        args: settings object; ``max_len`` / ``max_history_len`` are overwritten
            for the matched role, and ``model_path`` is read for chat mode.
        device: device the newly loaded model is moved to.
        model: the currently active model, returned untouched when nothing matches.

    Returns:
        The model to use from now on, moved to ``device`` and set to eval mode
        when a role matched.
    """
    # (command phrases, checkpoint directory or None for args.model_path,
    #  settings written onto args), checked in order; a later match overrides an earlier one.
    roles = (
        (["写一首诗。", ], "model/poem_model", (("max_len", 100), ("max_history_len", 0))),
        (["编个故事。"], "model/gpt2_general_chinese", (("max_len", 200), ("max_history_len", 0))),
        # Chat is the default mode; it reloads the checkpoint named on the command line.
        (["闲聊一会。", "闲聊一会儿。"], None, (("max_history_len", 5),)),
    )
    for phrases, checkpoint, overrides in roles:
        if not contentinactor(content, phrases):
            continue
        source = args.model_path if checkpoint is None else checkpoint
        model = GPT2LMHeadModel.from_pretrained(source)
        for setting, value in overrides:
            setattr(args, setting, value)
        model = model.to(device)
        model.eval()
    return model


def padding_input(input_ids, pad_num, padfunc="forward", pad_token_id=0, sep_token_id=102):
    """
    Pad or truncate a token-id sequence to exactly ``pad_num`` entries.

    The result always has the layout ``[first token] + padding + tail``: the
    first token (the 101 start marker in this script) stays in front, padding is
    inserted right after it, and the dialogue tokens follow. A sequence holds
    one 101 and one 102 terminator per utterance.

    * Shorter than ``pad_num``: padding is inserted after the first token.
    * Exactly ``pad_num``: ``input_ids`` itself is returned unchanged.
    * Longer than ``pad_num``: the oldest tokens are dropped. The cut is placed
      on the first utterance terminator that lets the rest fit, and the tail
      starts at that terminator. If no terminator other than the final token
      qualifies (e.g. one utterance is longer than the window), the most recent
      ``pad_num - 1`` tokens are kept instead.

    Args:
        input_ids: sequence of token ids, start marker first.
        pad_num: required output length.
        padfunc: padding strategy; only "forward" is implemented.
        pad_token_id: id used for padding.
        sep_token_id: id that terminates each utterance.

    Returns:
        A list of ``pad_num`` ids (or ``input_ids`` itself when it already fits).

    Raises:
        ValueError: if ``padfunc`` is not a supported strategy.
    """
    if padfunc != "forward":
        raise ValueError("unsupported padfunc: {!r}".format(padfunc))

    leng = len(input_ids)
    if leng == pad_num:
        return input_ids

    head = list(input_ids[:1])
    if leng < pad_num:
        tail = list(input_ids[1:])
    else:
        overflow = leng - pad_num
        # The tail may hold at most pad_num - 1 ids, so it cannot start before
        # index overflow + 1.
        earliest = overflow + 1
        # Prefer cutting on an utterance boundary. The final token is excluded:
        # cutting there would keep a lone terminator and discard all content.
        cut = next(
            (i for i in range(earliest, leng - 1) if int(input_ids[i]) == sep_token_id),
            earliest,
        )
        tail = list(input_ids[cut:])

    # Size the padding from what is actually kept, so the total is always pad_num.
    pad_list = head + [pad_token_id] * (pad_num - len(head) - len(tail)) + tail
    assert len(pad_list) == pad_num
    return pad_list


def save_inputbin(input_ids, savefile):
    """
    Dump a tensor to ``savefile`` as raw int16 values.

    The file contains only the element data (no shape or dtype header), so a
    reader has to know the dtype and reshape the data itself.

    Args:
        input_ids: torch tensor to save; it may live on any device and may be
            attached to an autograd graph.
        savefile: destination path.
    """
    # detach() leaves the autograd graph; cpu() is required because numpy()
    # cannot convert a tensor that lives on a CUDA device.
    array = input_ids.detach().cpu().numpy().astype(np.int16)
    # NOTE: the int16 cast is only lossless for integer values within the int16 range.
    array.tofile(savefile)
    print("save input bin end : %s" % savefile)


def read_output(param, dtype=np.int16):
    """
    Load an array from ``param`` and return it as a torch tensor.

    Two on-disk layouts are accepted:

    * a ``.npy`` file (recognised by its magic prefix), loaded with ``np.load``;
    * a headerless raw dump such as the one produced by ``ndarray.tofile``,
      read with ``np.fromfile``. Raw data comes back flat, so the caller must
      reshape it.

    Args:
        param: path of the file to read.
        dtype: element type used for raw files; ignored for ``.npy`` files,
            which carry their own dtype. Defaults to int16, the type
            save_inputbin writes.

    Returns:
        torch.Tensor holding the file's data.
    """
    with open(param, "rb") as f:
        magic = f.read(6)
        f.seek(0)
        if magic == b"\x93NUMPY":
            _n = np.load(f, allow_pickle=False)
        else:
            # np.load rejects headerless data, so raw dumps need fromfile.
            _n = np.fromfile(f, dtype=dtype)
    ts = torch.from_numpy(_n)
    return ts

def main():
    """
    Run an interactive console chat loop against a GPT-2 dialogue model.

    Each turn:
      1. read one line from the user and tokenize it without special tokens;
      2. build the model input as [CLS] followed by the most recent utterances,
         each terminated by [SEP];
      3. pad / truncate that input to a fixed length (``args.padding`` = 50) and
         dump it to ``args.save_input`` as a raw binary file;
      4. sample the reply token by token (repetition penalty, temperature,
         [UNK] suppression, top-k / top-p filtering) until [SEP] is sampled or
         ``args.max_len`` tokens have been produced;
      5. print the reply and, if enabled, append both sides to the samples file.

    The loop ends on Ctrl+C or end-of-input (Ctrl+Z / Ctrl+D); the samples file
    is closed on every exit path.

    Speech hooks (speech recognition as the text source, voice / role switching
    via change_speech and change_rol, and text-to-speech playback of the reply)
    are not wired in here; they would replace the ``input`` call and follow the
    ``print`` of the reply respectively.
    """
    args = set_args()
    # Restrict the visible GPUs before the first CUDA query; changing the
    # variable after CUDA has been initialised has no effect.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    logger = create_logger(args)
    # Use the GPU only when one is available and the user did not opt out.
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # Fixed input length; a constant size keeps the quantised model's input simple.
    args.padding = 50
    args.save_input = "input.bin"
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path, sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]")
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model = model.to(device)
    model.eval()

    samples_file = None
    if args.save_samples_path:
        os.makedirs(args.save_samples_path, exist_ok=True)
        samples_file = open(os.path.join(args.save_samples_path, 'samples.txt'), 'a', encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))

    # Chat history; every utterance is stored as a list of token ids.
    history = []
    unk_id = tokenizer.convert_tokens_to_ids('[UNK]')
    print('开始和chatbot聊天,输入CTRL + Z以退出')

    try:
        while True:
            text = input("user:")
            if samples_file is not None:
                samples_file.write("user:{}\n".format(text))
            # add_special_tokens=False: [CLS] / [SEP] are added by hand below.
            text_ids = tokenizer.encode(text, add_special_tokens=False)
            history.append(text_ids)

            # One [CLS] opens the whole input; every utterance ends with [SEP]
            # so the model can tell the turns apart.
            input_ids = [tokenizer.cls_token_id]
            # history[-0:] would return the entire history, so always keep at
            # least the utterance that was just typed.
            window = max(args.max_history_len, 1)
            for utterance in history[-window:]:
                input_ids.extend(utterance)
                input_ids.append(tokenizer.sep_token_id)

            # Fixed-size input for the quantised model.
            input_ids = padding_input(input_ids, args.padding)
            input_ids = torch.tensor(input_ids).long().unsqueeze(0)

            # Dump the input while it is still a CPU tensor, then move it to the device.
            save_inputbin(input_ids, args.save_input)
            input_ids = input_ids.to(device)

            response = []  # token ids generated for this reply
            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                # Generate at most max_len tokens.
                for _ in range(args.max_len):
                    # The token ids are fed directly; no one-hot encoding is needed.
                    # NOTE: this forward pass is the step a native (C++) runner
                    # would replace by consuming the dumped input file.
                    outputs = model(input_ids=input_ids)
                    # logits holds one score vector over the vocabulary per input position.
                    logits = outputs.logits
                    # Only the scores at the last position predict the next token.
                    next_token_logits = logits[0, -1, :]
                    # Penalise tokens already generated. Dividing a negative
                    # score by a penalty > 1 would raise it, so negative scores
                    # are multiplied instead.
                    for token_id in set(response):
                        score = next_token_logits[token_id]
                        if score < 0:
                            next_token_logits[token_id] = score * args.repetition_penalty
                        else:
                            next_token_logits[token_id] = score / args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # Never sample [UNK].
                    next_token_logits[unk_id] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=args.topk, top_p=args.topp)
                    # Sample one token index, weighted by the filtered probabilities.
                    next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                    if next_token.item() == tokenizer.sep_token_id:  # [SEP] ends the reply
                        break
                    response.append(next_token.item())
                    # Append the sampled token so it conditions the next step.
                    input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)

            history.append(response)
            text = tokenizer.convert_ids_to_tokens(response)
            print("chatbot:" + "".join(text))

            if samples_file is not None:
                samples_file.write("chatbot:{}\n".format("".join(text)))
    except (KeyboardInterrupt, EOFError):
        # Ctrl+C, or end-of-input (Ctrl+Z / Ctrl+D) as announced in the banner.
        pass
    finally:
        if samples_file is not None:
            samples_file.close()


# Start the interactive chat loop only when the file is executed as a script.
if __name__ == '__main__':
    main()
View Code

 

说明

聊天模型的一次输入是一串int数字,一个数字代表一个文字的token,这串数字就代表当前对话和之前历史对话的拼接。由于量化后模型输入大小固定为50,如果输入大小不是50就进行裁剪或者用0补全(0插在开头的101之后、对话内容之前),比如输入是“你吃饭了吗?”,那么这次的模型输入是 [101  0...0  872 1391 7649  749 1408 8043  102]。

 

输出是一个维度为[50,  13317]的向量,后处理根据这个输出向量生成出下一个要说的文字的token(即数字)。把预测的token拼接到前面提到的输入后,继续给模型做推理,通过后处理再生成下一个文字的token,以此类推。

 

再补充一下

大概结构,比如 “你” 对应 vocab中872  “好” 对应 1962

输入的基本格式为:整个序列以一个 [CLS](对应101)开头,每个小段(每句对话)以 [SEP](对应102)结尾。

 

固定输入长度50 实际里面是有一些 [CLS]开头 和[SEP]结尾占位的

比如用户说“你好”,最后输入就是[101,46个0,872,1962, 102]

然后chat 也回复你好,那么下一轮的输入

就以 [101,43个0,872,1962, 102,872,1962, 102] 为基础(再接上本轮新输入的句子及其102,0的个数相应减少)   每个序列中  有 一个101 ,n个对话就有n个102 结尾标识

具体填充策略可以随程序灵活改写。

posted on 2023-03-09 08:26  lexn  阅读(127)  评论(0编辑  收藏  举报

导航