60机器翻译数据集

点击查看代码

import os
import torch
from d2l import torch as d2l

# Download and preprocess the dataset.
# The entry below registers the 'fra-eng' archive in d2l's data hub as a
# (url, 40-hex-digit string) pair; the second element is presumably the
# SHA-1 checksum d2l uses to validate the download -- confirm against d2l.
print('下载数据集')
#@save
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')

#@save
def read_data_nmt():
    """Load the English-French dataset and return it as one string.

    The data directory is obtained from ``d2l.download_extract('fra-eng')``
    and the file ``fra.txt`` inside it is read as UTF-8.

    Returns:
        The complete, unmodified contents of ``fra.txt``.
    """
    dataset_dir = d2l.download_extract('fra-eng')
    corpus_path = os.path.join(dataset_dir, 'fra.txt')
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        raw_corpus = corpus_file.read()
    return raw_corpus

# Load the raw corpus and preview its first 75 characters.
raw_text = read_data_nmt()
print(raw_text[:75])
"""
Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !
"""
# Preprocessing
print('预处理')
#@save
def preprocess_nmt(text):
    """Normalize the raw English-French text before tokenization.

    Three things happen, in this order:

    1. The non-breaking spaces ``'\\u202f'`` and ``'\\xa0'`` are replaced by
       an ordinary space.
    2. The text is lower-cased.
    3. A space is inserted in front of every ``,``, ``.``, ``!`` or ``?``
       that is not already preceded by a space, so that punctuation can be
       split off as its own token. The very first character never gets a
       space inserted in front of it.

    Args:
        text: The raw corpus as a single string.

    Returns:
        The normalized string.
    """
    # Built once; rebuilding this set for every character of the corpus is
    # pure loop-invariant work.
    punctuation = frozenset(',.!?')

    # Replace non-breaking spaces with regular spaces, then lower-case.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()

    pieces = []
    # Starting with a space as the "previous character" guarantees that
    # nothing is inserted before the first character of the text.
    prev_char = ' '
    for char in text:
        # Insert a space between a word and the punctuation that follows it.
        if char in punctuation and prev_char != ' ':
            pieces.append(' ')
        pieces.append(char)
        prev_char = char
    return ''.join(pieces)

# Normalize the raw corpus and preview its first 80 characters.
text = preprocess_nmt(raw_text)
print(text[:80])
"""
go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !
"""

print('词元化')
#@save
def tokenize_nmt(text, num_examples=None):
    """Tokenize the preprocessed English-French text.

    The text is split into lines on ``'\\n'``. Each line that consists of
    exactly two tab-separated fields contributes one source sentence (first
    field) and one target sentence (second field); both are split into
    tokens on single spaces. Lines with any other number of fields are
    skipped.

    Args:
        text: Preprocessed corpus, one ``source<TAB>target`` pair per line.
        num_examples: Maximum number of lines to read from the start of
            ``text``. Skipped lines count towards this limit. ``None`` (or
            any other falsy value such as ``0``) disables the limit.

    Returns:
        A ``(source, target)`` tuple of two equally long lists; each element
        is the list of tokens of one sentence.
    """
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        # `i` is zero-based, so `i == num_examples` means that exactly
        # `num_examples` lines have already been consumed.
        if num_examples and i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target

# Tokenize the whole corpus (no line limit) and show the first six
# source and target sentences.
source, target = tokenize_nmt(text)
print(source[:6], target[:6])
"""
[['go', '.'], ['hi', '.'], ['run', '!'], ['run', '!'], ['who', '?'], ['wow', '!']] 
[['va', '!'], ['salut', '!'], ['cours', '!'], ['courez', '!'], ['qui', '?'], ['ça', 'alors', '!']]
"""

# Source-language vocabulary.
# Reserved tokens:
#   <pad> - padding
#   <bos> - beginning of sequence
#   <eos> - end of sequence
# NOTE(review): min_freq=2 presumably makes d2l.Vocab drop tokens that occur
# fewer than twice -- confirm against the d2l.Vocab implementation.
src_vocab = d2l.Vocab(source, min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])
print('src_vocab: ', len(src_vocab))
"""
src_vocab:  10012
"""
# Show the first ten (token, index) pairs of the vocabulary.
print(list(src_vocab.token_to_idx.items())[:10])
"""
[('<unk>', 0), ('<pad>', 1), ('<bos>', 2), ('<eos>', 3), 
 ('.', 4), ('i', 5), ('you', 6), ('to', 7), ('the', 8), ('?', 9)]
"""
# 截断或填充文本序列
"""
句子的长度各不相同，
因此需要通过截断或填充，把每个句子统一为固定长度（num_steps）
"""
#@save
def truncate_pad(line, num_steps, padding_token):
    """Truncate or pad a token sequence to a fixed length.

    Args:
        line: List of tokens (or token indices) of one sentence.
        num_steps: Target length of the returned sequence.
        padding_token: Value appended when ``line`` is too short.

    Returns:
        The first ``num_steps`` items of ``line`` when it is longer than
        ``num_steps``; otherwise a new list made of ``line`` followed by as
        many ``padding_token`` items as are needed to reach ``num_steps``.
    """
    shortfall = num_steps - len(line)
    if shortfall < 0:
        # Too long: keep only the leading part.
        return line[:num_steps]
    # Short enough: fill the remaining positions with the padding token.
    return line + [padding_token] * shortfall

# Demo: print the first source sentence, then its vocabulary indices
# padded with the <pad> index to a length of 10.
print('截断或填充文本序列')
print(source[0])
print(truncate_pad(src_vocab[source[0]], 10, src_vocab['<pad>']))
"""
['go', '.']
[47, 4, 1, 1, 1, 1, 1, 1, 1, 1]
"""
# Convert token sequences into fixed-length tensors for minibatch training
#@save
def build_array_nmt(lines, vocab, num_steps):
    """Turn tokenized sentences into an index tensor plus valid lengths.

    Every sentence is mapped to vocabulary indices, gets the ``<eos>`` index
    appended, and is then truncated or padded with the ``<pad>`` index to
    exactly ``num_steps`` entries.

    Args:
        lines: List of sentences, each a list of tokens.
        vocab: Vocabulary object; indexing it with a list of tokens must
            return the list of their indices, and indexing it with
            ``'<eos>'`` / ``'<pad>'`` must return those tokens' indices.
        num_steps: Number of entries per row of the resulting tensor.

    Returns:
        A tuple ``(array, valid_len)``: ``array`` holds one row of
        ``num_steps`` indices per sentence, and ``valid_len`` holds, for each
        row, the number of entries that differ from the ``<pad>`` index.
    """
    # <eos> is appended before truncation, so a sentence that is too long
    # loses its <eos> marker together with its tail.
    padded_rows = [
        truncate_pad(vocab[tokens] + [vocab['<eos>']],
                     num_steps, vocab['<pad>'])
        for tokens in lines
    ]
    array = torch.tensor(padded_rows)
    # Actual (unpadded) length of every row.
    not_padding = array != vocab['<pad>']
    valid_len = not_padding.type(torch.int32).sum(dim=1)
    return array, valid_len

# Data loading for model training
#@save
def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Return the translation dataset iterator and both vocabularies.

    The corpus is read, preprocessed and tokenized; one vocabulary is built
    for the source sentences and one for the target sentences; both sides
    are converted to fixed-length index tensors and handed to
    ``d2l.load_array``.

    Args:
        batch_size: Batch size forwarded to ``d2l.load_array``.
        num_steps: Length every sentence is truncated or padded to.
        num_examples: Limit forwarded to ``tokenize_nmt``.

    Returns:
        A tuple ``(data_iter, src_vocab, tgt_vocab)``. ``data_iter`` is built
        from the four tensors (source indices, source valid lengths, target
        indices, target valid lengths), in that order.
    """
    def _build_vocab(sentences):
        # Source and target vocabularies share the same settings.
        return d2l.Vocab(sentences, min_freq=2,
                         reserved_tokens=['<pad>', '<bos>', '<eos>'])

    # Read and normalize the corpus, then split it into token lists.
    corpus = preprocess_nmt(read_data_nmt())
    src_sentences, tgt_sentences = tokenize_nmt(corpus, num_examples)

    # One vocabulary per language.
    src_vocab = _build_vocab(src_sentences)
    tgt_vocab = _build_vocab(tgt_sentences)

    # Each call yields an (indices, valid lengths) pair.
    src_tensors = build_array_nmt(src_sentences, src_vocab, num_steps)
    tgt_tensors = build_array_nmt(tgt_sentences, tgt_vocab, num_steps)

    data_iter = d2l.load_array(src_tensors + tgt_tensors, batch_size)
    return data_iter, src_vocab, tgt_vocab

# Read the first minibatch of the English-French dataset.
# NOTE: this rebinds the module-level `src_vocab` that was built from the
# full corpus above; from here on it is the vocabulary of the subset loaded
# by load_data_nmt.
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size=1, num_steps=4)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X.type(torch.int32))
    print('X的有效长度:', X_valid_len)
    print('Y:', Y.type(torch.int32))
    print('Y的有效长度:', Y_valid_len)
    # Only the first minibatch is inspected.
    break

"""
batch_size=2
num_steps=8
X: tensor([[13, 46, 48,  4,  3,  1,  1,  1],
        [ 9, 28,  4,  3,  1,  1,  1,  1]], dtype=torch.int32)
X的有效长度: tensor([5, 4])
Y: tensor([[80, 23, 82,  5,  3,  1,  1,  1],
        [ 0, 19,  5,  3,  1,  1,  1,  1]], dtype=torch.int32)
Y的有效长度: tensor([5, 4])
"""

"""
batch_size=1
num_steps=4
X: tensor([[109,  81,   4,   3]], dtype=torch.int32)
X的有效长度: tensor([4])
Y: tensor([[0, 4, 3, 1]], dtype=torch.int32)
Y的有效长度: tensor([3])
"""