Recurrent Neural Networks

from mxnet import nd

# x: a minibatch of 3 examples with 1 input feature each;
# h: the hidden state of the same 3 examples with 4 hidden units.
x, w_xh = nd.random.normal(shape=(3, 1)), nd.random.normal(shape=(1, 4))
h, w_hh = nd.random.normal(shape=(3, 4)), nd.random.normal(shape=(4, 4))

# X W_xh + H W_hh computed directly.
print(nd.dot(x, w_xh) + nd.dot(h, w_hh))

# The same quantity computed as a single matrix product of the concatenated inputs.
print(nd.dot(nd.concat(x, h, dim=1), nd.concat(w_xh, w_hh, dim=0)))
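
# The two results above should agree up to floating-point rounding: concatenating X
# with H along columns (dim=1) and W_xh with W_hh along rows (dim=0) turns the sum of
# two matrix products into a single one, [X, H][W_xh; W_hh] = X W_xh + H W_hh.
# A quick sanity check (a minimal sketch reusing x, h, w_xh and w_hh from above):
a = nd.dot(x, w_xh) + nd.dot(h, w_hh)
b = nd.dot(nd.concat(x, h, dim=1), nd.concat(w_xh, w_hh, dim=0))
print((a - b).abs().sum())  # expected to be (close to) zero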

Language model dataset (lyrics)

Extract the data:

from mxnet import nd
import random
import zipfile

# Read the lyrics from the zip archive and decode them as UTF-8.
with zipfile.ZipFile(r'C:\Users\ROG\Downloads\d2l-zh\data\jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')
print(corpus_chars[:40])

# The dataset has more than 60,000 characters. To simplify printing, we replace
# newlines with spaces and then use only the first 10,000 characters to train the model.

corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[0:10000]

# Build the character index

# The string variable corpus_chars from the code above holds the text to be processed.

# Step 1: Build the index-to-character mapping (idx_to_char)

# set(corpus_chars) collects the distinct characters in corpus_chars, and list() turns
# them into a list. The result, idx_to_char, contains every distinct character in the
# text (in arbitrary order, since sets are unordered). For example, if the text contains
# the letters 'a', 'b' and 'c', idx_to_char will contain 'a', 'b' and 'c'.

idx_to_char = list(set(corpus_chars))

# Step 2: Build the character-to-index mapping (char_to_idx)
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])

# The dict(...) call above wraps a list comprehension that enumerates idx_to_char and
# creates a key-value pair (char, i) for each character, where i is the character's
# position in idx_to_char. This gives the character-to-index mapping: the character
# stored at position 0 of idx_to_char is mapped to index 0, and so on.

# Step 3: Compute the vocabulary size (vocab_size)
vocab_size = len(char_to_idx)

# This simply takes the length of the char_to_idx dictionary, i.e. the number of
# distinct characters. That number is the vocabulary size, typically used to set the
# dimension of an embedding layer or the number of output classes.
# For example, if the text contains 26 distinct letters, vocab_size is 26.

# Finally, char_to_idx converts characters in the text to integer indices, and
# idx_to_char converts integer indices back to characters, which is what we need for
# training and prediction with a deep learning model.

print(vocab_size)

# Convert the whole corpus to integer indices and inspect the first 20 characters.
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)


# Random sampling

def data_iter_random(corpus_indices, batch_size, num_steps):
    # Subtract 1 because the label of the last character has no following character.
    num_examples = (len(corpus_indices) - 1) // num_steps
    epoch_size = num_examples // batch_size
    examples_indices = list(range(num_examples))
    random.shuffle(examples_indices)

    # Return the subsequence of length num_steps starting at position pos.
    def _data(pos):
        return corpus_indices[pos: pos + num_steps]

    for i in range(epoch_size):
        # Each minibatch reads batch_size randomly chosen, non-overlapping subsequences.
        i = i * batch_size
        batch_indices = examples_indices[i:i + batch_size]

        x = [_data(j * num_steps) for j in batch_indices]
        # The labels are the inputs shifted forward by one character.
        y = [_data(j * num_steps + 1) for j in batch_indices]

        yield nd.array(x), nd.array(y)


# Iterate over an artificial sequence 0..29 to see what the random iterator yields.
my_seq = list(range(30))
for x, y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', x, '\nY:', y, '\n')


# Consecutive (adjacent) sampling
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    # Reshape the corpus into batch_size rows; adjacent minibatches are then adjacent
    # in the original sequence, so the hidden state can be carried over between them.
    indices = corpus_indices[0: batch_size * batch_len].reshape((
        batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y


for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')

Basic implementation:

from mxnet.gluon import loss as gloss
from mxnet import autograd, nd
import d2lzh as d2l
import random
import zipfile
import math
import time

ctx = d2l.try_gpu()

# Re-read and preprocess the lyrics dataset (same steps as above).
with zipfile.ZipFile(r'C:\Users\ROG\Downloads\d2l-zh\data\jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')

corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[:10000]

idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)

corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]

print(vocab_size)
# One-hot encoding: each index becomes a vector of length vocab_size with a single 1.
print(nd.one_hot(nd.array([0, 2]), vocab_size))


def to_onehot(X, size):
    # (batch_size, num_steps) index matrix -> list of num_steps (batch_size, size) one-hot matrices.
    return [nd.one_hot(x, size) for x in X.T]


x = nd.arange(10).reshape((2, 5))
inputs = to_onehot(x, vocab_size)
print(len(inputs), inputs[0].shape)

num_inputs, num_hidden, num_outputs = vocab_size, 256, vocab_size


def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)

    # Hidden layer parameters
    w_xh = _one((num_inputs, num_hidden))
    w_hh = _one((num_hidden, num_hidden))
    b_h = nd.zeros(num_hidden, ctx=ctx)

    # Output layer parameters
    w_hq = _one((num_hidden, num_outputs))
    b_q = nd.zeros(num_outputs, ctx=ctx)

    # Attach gradients to all parameters
    params = [w_xh, w_hh, b_h, w_hq, b_q]
    for param in params:
        param.attach_grad()
    return params


# Define the model

def init_rnn_state(batch_size, num_hidden):
    # Return a tuple so the interface also works for states made of several NDArrays (e.g. LSTM).
    return (nd.zeros(shape=(batch_size, num_hidden), ctx=ctx),)


def rnn(inputs, state, params):
    # inputs and outputs are both lists of num_steps matrices,
    # each of shape (batch_size, vocab_size).
    w_xh, w_hh, b_h, w_hq, b_q = params
    h, = state
    outputs = []
    for x in inputs:
        # Hidden state: H_t = tanh(X_t W_xh + H_{t-1} W_hh + b_h)
        h = nd.tanh(nd.dot(x, w_xh) + nd.dot(h, w_hh) + b_h)
        # Output: O_t = H_t W_hq + b_q
        y = nd.dot(h, w_hq) + b_q
        outputs.append(y)
    return outputs, (h,)


# Run the RNN once on the toy input to check the output and state shapes.
state = init_rnn_state(x.shape[0], num_hidden)
inputs = to_onehot(x.as_in_context(ctx), vocab_size)
params = get_params()
outputs, state_new = rnn(inputs, state, params)

print(len(outputs), outputs[0].shape, state_new[0].shape)


# Define the prediction function

def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hidden, vocab_size, ctx, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hidden)
    output = [char_to_idx[prefix[0]]]

    for t in range(num_chars + len(prefix) - 1):
        # Use the output of the previous time step as the input of the current one.
        X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
        # Compute the output and update the hidden state.
        (Y, state) = rnn(X, state, params)
        # The input of the next time step is the next character of the prefix,
        # or otherwise the current best prediction.
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))
    return ''.join([idx_to_char[i] for i in output])


print(predict_rnn('分开', 10, rnn, params, init_rnn_state, num_hidden, vocab_size,
                  ctx, idx_to_char, char_to_idx))


# Gradient clipping

def grad_clipping(params, theta, ctx):
    # Compute the L2 norm of all gradients taken together; if it exceeds theta,
    # rescale every gradient so that the global norm becomes exactly theta.
    norm = nd.array([0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm
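
# A toy illustration of grad_clipping (a hedged sketch with made-up values, not part
# of the original training code): a gradient whose global L2 norm exceeds theta is
# rescaled so that the norm becomes exactly theta.
p = nd.array([1.0, 2.0], ctx=ctx)
p.attach_grad()
with autograd.record():
    l = (p * p).sum()
l.backward()
print(p.grad)               # gradient of sum(p^2) is 2*p, i.e. [2, 4]
grad_clipping([p], 1e-2, ctx)
print(nd.norm(p.grad))      # rescaled so that the L2 norm equals 1e-2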


def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):

    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # For consecutive sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:  # For random sampling, re-initialize the hidden state before each minibatch
                state = init_rnn_state(batch_size, num_hiddens)
            else:  # Otherwise, detach the hidden state from the computation graph
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs is a list of num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # After concatenation the shape is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transposing and flattening it gives a
                # vector of length batch_size * num_steps that matches the rows of outputs one-to-one
                y = Y.T.reshape((-1,))
                # Use the cross-entropy loss to compute the average classification error
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # Clip the gradients
            d2l.sgd(params, lr, 1)  # The loss is already averaged, so no further gradient averaging is needed
            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch + 1) % pred_period == 0:
            # Perplexity is the exponential of the average cross-entropy loss.
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))


# Train the model with random sampling and compose lyrics
num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']

train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hidden,
                      vocab_size, ctx, corpus_indices, idx_to_char,
                      char_to_idx, True, num_epochs, num_steps, lr,
                      clipping_theta, batch_size, pred_period, pred_len,
                      prefixes)
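
# The same function also supports adjacent (consecutive) sampling: passing False for
# is_random_iter carries the hidden state across minibatches within an epoch
# (a minimal sketch mirroring the call above).
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hidden,
                      vocab_size, ctx, corpus_indices, idx_to_char,
                      char_to_idx, False, num_epochs, num_steps, lr,
                      clipping_theta, batch_size, pred_period, pred_len,
                      prefixes)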

 
