53 Language Models

import random
import torch
from d2l import torch as d2l

"""
    语言模型
    做预训练模型(BERT,GPT-3)
    生成文本,给定前面几个词,不断的使用xt~p(xt|xt-1,...x1)来生成后续文本
    判断多个序列中哪个更常见
    
    N元语法,可以处理比较长的序列
"""

tokens = d2l.tokenize(d2l.read_time_machine())
# Since a text line is not necessarily a sentence or a paragraph, we concatenate all text lines into a single list
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
print(vocab.token_freqs[:10])

# Bigrams
print('bigrams')
"""
zip() 函数用于将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,
然后返回由这些元组组成的对象
corpus = [0, 1, 5, 6, 9]
bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
print('bigram_tokens', bigram_tokens)
bigram_tokens [(0, 1), (1, 5), (5, 6), (6, 9)]
"""
bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
print(bigram_vocab.token_freqs[:10])

# Trigrams
print('trigrams')
trigram_tokens = [triple for triple in zip(
    corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
print(trigram_vocab.token_freqs[:10])

# Random sampling
# Randomly generate a minibatch of features and labels for reading.
# In random sampling, each example is a subsequence captured at an arbitrary position on the original long sequence
print('random sampling')
"""
序列模型中,每遍历一次,一个数据被用过多次
扫一遍数据,所有数据只用过一次
起始点随机起始,[0,T)随机取值k,从k元素开始,切成长为T的序列,前k个不要
遍历一次,每个数据用一次
"""
# num_steps / T / tau
# batch_size 有几行数据
def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    """使用随机抽样生成一个小批量子序列"""
    # 从随机偏移量开始对序列进行分区,随机范围包括num_steps-1
    # 丢弃前k个元素,保留剩余元素
    corpus = corpus[random.randint(0, num_steps - 1):]
    # 减去1,是因为我们需要考虑标签
    # 生成子序列的个数
    print(('len(corpus) - 1)', len(corpus) - 1))
    print('num_steps', num_steps)
    num_subseqs = (len(corpus) - 1) // num_steps
    # 长度为num_steps的子序列的起始索引
    # 子序列开始下标
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # During random sampling, subsequences from two adjacent minibatches
    # are not necessarily adjacent on the original sequence
    random.shuffle(initial_indices)

    def data(pos):
        # Return the subsequence of length num_steps starting at position pos
        return corpus[pos: pos + num_steps]
    # Number of minibatches = number of subsequences // batch_size
    num_batches = num_subseqs // batch_size
    print('num_subseqs', num_subseqs)
    print('num_batches', num_batches)
    for i in range(0, batch_size * num_batches, batch_size):
        # Here, initial_indices contains the shuffled starting indices of the subsequences
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

my_seq = list(range(30))
# Sampling is independent across minibatches
for X, Y in seq_data_iter_random(my_seq, batch_size=1, num_steps=5):
    print('X: ', X, '\nY: ', Y)
"""

batch_size=1
len(corpus) - 1: 27
num_steps 5
num_subseqs 5
num_batches 5
X:  tensor([[12, 13, 14, 15, 16]]) 
Y:  tensor([[13, 14, 15, 16, 17]])
X:  tensor([[2, 3, 4, 5, 6]]) 
Y:  tensor([[3, 4, 5, 6, 7]])
X:  tensor([[17, 18, 19, 20, 21]]) 
Y:  tensor([[18, 19, 20, 21, 22]])
X:  tensor([[22, 23, 24, 25, 26]]) 
Y:  tensor([[23, 24, 25, 26, 27]])
X:  tensor([[ 7,  8,  9, 10, 11]]) 
Y:  tensor([[ 8,  9, 10, 11, 12]])


batch_size=2
len(corpus) - 1: 28
num_steps 5
num_subseqs 5
num_batches 2
X:  tensor([[14, 15, 16, 17, 18],
        [24, 25, 26, 27, 28]]) 
Y:  tensor([[15, 16, 17, 18, 19],
        [25, 26, 27, 28, 29]])
X:  tensor([[ 9, 10, 11, 12, 13],
        [ 4,  5,  6,  7,  8]]) 
Y:  tensor([[10, 11, 12, 13, 14],
        [ 5,  6,  7,  8,  9]])


batch_size=3
len(corpus) - 1: 28
num_steps 5
num_subseqs 5
num_batches 1
X:  tensor([[ 9, 10, 11, 12, 13],
        [24, 25, 26, 27, 28],
        [ 4,  5,  6,  7,  8]]) 
Y:  tensor([[10, 11, 12, 13, 14],
        [25, 26, 27, 28, 29],
        [ 5,  6,  7,  8,  9]])
"""
print('-----------')
# Ensure that subsequences from two adjacent minibatches are also adjacent on the original sequence
def seq_data_iter_sequential(corpus, batch_size, num_steps):  #@save
    """Generate a minibatch of subsequences using sequential partitioning"""
    # Partition the sequence starting from a random offset
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    print('num_tokens', num_tokens)
    Xs = torch.tensor(corpus[offset: offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    print(Xs)
    print('Xs.shape[1]', Xs.shape[1])
    num_batches = Xs.shape[1] // num_steps
    print('num_batches', num_batches)
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y

for X, Y in seq_data_iter_sequential(my_seq, batch_size=1, num_steps=5):
    print('X: ', X, '\nY:', Y)

"""
batch_size=1
num_tokens 25
tensor([[ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
         22, 23, 24, 25, 26, 27, 28]])
Xs.shape[1] 25
num_batches 5
X:  tensor([[4, 5, 6, 7, 8]]) 
Y: tensor([[5, 6, 7, 8, 9]])
X:  tensor([[ 9, 10, 11, 12, 13]]) 
Y: tensor([[10, 11, 12, 13, 14]])
X:  tensor([[14, 15, 16, 17, 18]]) 
Y: tensor([[15, 16, 17, 18, 19]])
X:  tensor([[19, 20, 21, 22, 23]]) 
Y: tensor([[20, 21, 22, 23, 24]])
X:  tensor([[24, 25, 26, 27, 28]]) 
Y: tensor([[25, 26, 27, 28, 29]])


batch_size=2
num_tokens 26
tensor([[ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]])
Xs.shape[1] 13
num_batches 2
X:  tensor([[ 3,  4,  5,  6,  7],
        [16, 17, 18, 19, 20]]) 
Y: tensor([[ 4,  5,  6,  7,  8],
        [17, 18, 19, 20, 21]])
X:  tensor([[ 8,  9, 10, 11, 12],
        [21, 22, 23, 24, 25]]) 
Y: tensor([[ 9, 10, 11, 12, 13],
        [22, 23, 24, 25, 26]])
"""


# Wrap the two sampling functions in one class
class SeqDataLoader:  #@save
    """An iterator for loading sequence data"""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

def load_data_time_machine(batch_size, num_steps,  #@save
                           use_random_iter=False, max_tokens=10000):
    """返回时光机器数据集的迭代器和词表"""
    data_iter = SeqDataLoader(
        batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab
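
# A minimal usage sketch: load the time machine data and peek at one minibatch.
# Both X and Y have shape (batch_size, num_steps); Y is X shifted by one token.
train_iter, vocab = load_data_time_machine(batch_size=32, num_steps=35)
for X, Y in train_iter:
    print(X.shape, Y.shape)  # torch.Size([32, 35]) torch.Size([32, 35])
    break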