import random
import torch
from d2l import torch as d2l
"""
语言模型
做预训练模型(BERT,GPT-3)
生成文本,给定前面几个词,不断的使用xt~p(xt|xt-1,...x1)来生成后续文本
判断多个序列中哪个更常见
N元语法,可以处理比较长的序列
"""
tokens = d2l.tokenize(d2l.read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
print(vocab.token_freqs[:10])
print('bigram')
"""
zip() 函数用于将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,
然后返回由这些元组组成的对象
corpus = [0, 1, 5, 6, 9]
bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
print('bigram_tokens', bigram_tokens)
bigram_tokens [(0, 1), (1, 5), (5, 6), (6, 9)]
"""
bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
print(bigram_vocab.token_freqs[:10])
print('trigram')
trigram_tokens = [triple for triple in zip(
    corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
print(trigram_vocab.token_freqs[:10])
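"""
A minimal sketch (my own addition, not part of the d2l API) making the
generation formula x_t ~ p(x_t | x_{t-1}) concrete: estimate the conditional
distribution from the bigram counts above, then sample one next token.
Assumes the query token ('the') occurs in the corpus.
"""
from collections import defaultdict
cond_counts = defaultdict(list)  # prev token -> [(next token, count), ...]
for (prev, nxt), freq in bigram_vocab.token_freqs:
    cond_counts[prev].append((nxt, freq))
def sample_next(prev):
    """Sample x_t ~ p(x_t | x_{t-1}=prev) using maximum-likelihood counts."""
    nxts, freqs = zip(*cond_counts[prev])
    return random.choices(nxts, weights=freqs, k=1)[0]
print(sample_next('the'))  # e.g. 'time'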
print('random sampling')
"""
序列模型中,每遍历一次,一个数据被用过多次
扫一遍数据,所有数据只用过一次
起始点随机起始,[0,T)随机取值k,从k元素开始,切成长为T的序列,前k个不要
遍历一次,每个数据用一次
"""
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using random sampling."""
    # Drop the first k tokens, with k drawn uniformly from [0, num_steps).
    corpus = corpus[random.randint(0, num_steps - 1):]
    print('len(corpus) - 1:', len(corpus) - 1)
    print('num_steps', num_steps)
    # Subtract 1 because the labels Y are the inputs X shifted by one token.
    num_subseqs = (len(corpus) - 1) // num_steps
    # Starting indices of all subsequences, shuffled so that subsequences
    # from adjacent minibatches need not be adjacent in the corpus.
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)
    def data(pos):
        # Return the subsequence of length num_steps starting at pos.
        return corpus[pos: pos + num_steps]
    num_batches = num_subseqs // batch_size
    print('num_subseqs', num_subseqs)
    print('num_batches', num_batches)
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)
my_seq = list(range(30))
for X, Y in seq_data_iter_random(my_seq, batch_size=1, num_steps=5):
    print('X: ', X, '\nY: ', Y)
"""
batch_size=1
len(corpus) - 1: 27
num_steps 5
num_subseqs 5
num_batches 5
X: tensor([[12, 13, 14, 15, 16]])
Y: tensor([[13, 14, 15, 16, 17]])
X: tensor([[2, 3, 4, 5, 6]])
Y: tensor([[3, 4, 5, 6, 7]])
X: tensor([[17, 18, 19, 20, 21]])
Y: tensor([[18, 19, 20, 21, 22]])
X: tensor([[22, 23, 24, 25, 26]])
Y: tensor([[23, 24, 25, 26, 27]])
X: tensor([[ 7, 8, 9, 10, 11]])
Y: tensor([[ 8, 9, 10, 11, 12]])
batch_size=2
len(corpus) - 1: 28
num_steps 5
num_subseqs 5
num_batches 2
X: tensor([[14, 15, 16, 17, 18],
[24, 25, 26, 27, 28]])
Y: tensor([[15, 16, 17, 18, 19],
[25, 26, 27, 28, 29]])
X: tensor([[ 9, 10, 11, 12, 13],
[ 4, 5, 6, 7, 8]])
Y: tensor([[10, 11, 12, 13, 14],
[ 5, 6, 7, 8, 9]])
batch_size=3
len(corpus) - 1: 28
num_steps 5
num_subseqs 5
num_batches 1
X: tensor([[ 9, 10, 11, 12, 13],
[24, 25, 26, 27, 28],
[ 4, 5, 6, 7, 8]])
Y: tensor([[10, 11, 12, 13, 14],
[25, 26, 27, 28, 29],
[ 5, 6, 7, 8, 9]])
"""
print('-----------')
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using sequential partitioning."""
    # Start at a random offset so different epochs cut the corpus differently.
    offset = random.randint(0, num_steps)
    # Keep a multiple of batch_size tokens (minus 1 for the shifted labels).
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    print('num_tokens', num_tokens)
    Xs = torch.tensor(corpus[offset: offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    print(Xs)
    print('Xs.shape[1]', Xs.shape[1])
    num_batches = Xs.shape[1] // num_steps
    print('num_batches', num_batches)
    for i in range(0, num_steps * num_batches, num_steps):
        # Slide over adjacent column windows, so consecutive minibatches
        # continue each other within every row.
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y
for X, Y in seq_data_iter_sequential(my_seq, batch_size=1, num_steps=5):
    print('X: ', X, '\nY: ', Y)
"""
batch_size=1
num_tokens 25
tensor([[ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28]])
Xs.shape[1] 25
num_batches 5
X: tensor([[4, 5, 6, 7, 8]])
Y: tensor([[5, 6, 7, 8, 9]])
X: tensor([[ 9, 10, 11, 12, 13]])
Y: tensor([[10, 11, 12, 13, 14]])
X: tensor([[14, 15, 16, 17, 18]])
Y: tensor([[15, 16, 17, 18, 19]])
X: tensor([[19, 20, 21, 22, 23]])
Y: tensor([[20, 21, 22, 23, 24]])
X: tensor([[24, 25, 26, 27, 28]])
Y: tensor([[25, 26, 27, 28, 29]])
batch_size=2
num_tokens 26
tensor([[ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]])
Xs.shape[1] 13
num_batches 2
X: tensor([[ 3, 4, 5, 6, 7],
[16, 17, 18, 19, 20]])
Y: tensor([[ 4, 5, 6, 7, 8],
[17, 18, 19, 20, 21]])
X: tensor([[ 8, 9, 10, 11, 12],
[21, 22, 23, 24, 25]])
Y: tensor([[ 9, 10, 11, 12, 13],
[22, 23, 24, 25, 26]])
"""
class SeqDataLoader:
    """An iterator for loading sequence data."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps
    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
def load_data_time_machine(batch_size, num_steps,
                           use_random_iter=False, max_tokens=10000):
    """Return the iterator and the vocabulary of the time machine dataset."""
    data_iter = SeqDataLoader(
        batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab
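"""
Usage sketch (my own addition): pull one minibatch from the loader. Note
that SeqDataLoader dispatches to d2l.seq_data_iter_random /
d2l.seq_data_iter_sequential, the library versions of the two functions
defined above.
"""
data_iter, vocab = load_data_time_machine(batch_size=32, num_steps=35)
for X, Y in data_iter:
    print(X.shape, Y.shape)  # torch.Size([32, 35]) torch.Size([32, 35])
    break
print(len(vocab))  # 28: the character-level time machine vocabulary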