A Python 3 version of Andrej Karpathy's char-rnn
1 """ 2 Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy) 3 BSD License 4 """ 5 import numpy as np 6 7 # data I/O 8 data = open('input.txt', 'r', encoding='utf-8').read() # should be simple plain text file 9 chars = list(set(data)) 10 data_size, vocab_size = len(data), len(chars) 11 print('data has %d characters, %d unique.' % (data_size, vocab_size)) 12 char_to_ix = { ch:i for i,ch in enumerate(chars) } 13 ix_to_char = { i:ch for i,ch in enumerate(chars) } 14 15 # hyperparameters 16 hidden_size = 100 # size of hidden layer of neurons 17 seq_length = 25 # number of steps to unroll the RNN for 18 learning_rate = 1e-1 19 20 # model parameters 21 Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden 22 Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden 23 Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output 24 bh = np.zeros((hidden_size, 1)) # hidden bias 25 by = np.zeros((vocab_size, 1)) # output bias 26 27 def lossFun(inputs, targets, hprev): 28 """ 29 inputs,targets are both list of integers. 30 hprev is Hx1 array of initial hidden state 31 returns the loss, gradients on model parameters, and last hidden state 32 """ 33 xs, hs, ys, ps = {}, {}, {}, {} 34 hs[-1] = np.copy(hprev) 35 loss = 0 36 # forward pass 37 for t in range(len(inputs)): 38 xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation 39 xs[t][inputs[t]] = 1 40 hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state 41 ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars 42 ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars 43 loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss) 44 # backward pass: compute gradients going backwards 45 dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) 46 dbh, dby = np.zeros_like(bh), np.zeros_like(by) 47 dhnext = np.zeros_like(hs[0]) 48 for t in reversed(range(len(inputs))): 49 dy = np.copy(ps[t]) 50 dy[targets[t]] -= 1 # backprop into y. 
see http://cs231n.github.io/neural-networks-case-study/#grad if confused here 51 dWhy += np.dot(dy, hs[t].T) 52 dby += dy 53 dh = np.dot(Why.T, dy) + dhnext # backprop into h 54 dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity 55 dbh += dhraw 56 dWxh += np.dot(dhraw, xs[t].T) 57 dWhh += np.dot(dhraw, hs[t-1].T) 58 dhnext = np.dot(Whh.T, dhraw) 59 for dparam in [dWxh, dWhh, dWhy, dbh, dby]: 60 np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients 61 return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1] 62 63 def sample(h, seed_ix, n): 64 """ 65 sample a sequence of integers from the model 66 h is memory state, seed_ix is seed letter for first time step 67 """ 68 x = np.zeros((vocab_size, 1)) 69 x[seed_ix] = 1 70 ixes = [] 71 for t in range(n): 72 h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) 73 y = np.dot(Why, h) + by 74 p = np.exp(y) / np.sum(np.exp(y)) 75 ix = np.random.choice(list(range(vocab_size)), p=p.ravel()) 76 x = np.zeros((vocab_size, 1)) 77 x[ix] = 1 78 ixes.append(ix) 79 return ixes 80 81 n, p = 0, 0 82 mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) 83 mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad 84 smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0 85 while True: 86 # prepare inputs (we're sweeping from left to right in steps seq_length long) 87 if p+seq_length+1 >= len(data) or n == 0: 88 hprev = np.zeros((hidden_size,1)) # reset RNN memory 89 p = 0 # go from start of data 90 inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]] 91 targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] 92 93 # sample from the model now and then 94 if n % 100 == 0: 95 sample_ix = sample(hprev, inputs[0], 200) 96 txt = ''.join(ix_to_char[ix] for ix in sample_ix) 97 print('----\n %s \n----' % (txt, )) 98 99 # forward seq_length characters through the net and fetch gradient 100 loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev) 101 smooth_loss = smooth_loss * 0.999 + loss * 0.001 102 if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress 103 104 # perform parameter update with Adagrad 105 for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 106 [dWxh, dWhh, dWhy, dbh, dby], 107 [mWxh, mWhh, mWhy, mbh, mby]): 108 mem += dparam * dparam 109 param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update 110 111 p += seq_length # move data pointer 112 n += 1 # iteration counter
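
The backward pass in lossFun is the part that is easiest to get wrong when modifying this script, so a numerical gradient check can be a useful sanity test. The sketch below is not part of Karpathy's gist: grad_check is a hypothetical helper, meant to be pasted into the script above just before the while True: loop, since it relies on lossFun and the global parameters defined there. Keep in mind that lossFun clips its analytic gradients to [-5, 5], so the comparison is only meaningful while gradients stay below that range, which is normally the case right after initialization.

def grad_check(inputs, targets, hprev, num_checks=5, delta=1e-5):
  # Compare the analytic gradients from lossFun against centered finite differences.
  _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
  for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                 [dWxh, dWhh, dWhy, dbh, dby],
                                 ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    assert param.shape == dparam.shape
    for _ in range(num_checks):
      ri = np.random.randint(param.size)       # pick a random entry of the parameter
      old_val = param.flat[ri]
      param.flat[ri] = old_val + delta
      loss_plus, *_ = lossFun(inputs, targets, hprev)
      param.flat[ri] = old_val - delta
      loss_minus, *_ = lossFun(inputs, targets, hprev)
      param.flat[ri] = old_val                 # restore the original value
      grad_numerical = (loss_plus - loss_minus) / (2 * delta)
      grad_analytic = dparam.flat[ri]
      rel_error = abs(grad_analytic - grad_numerical) / max(abs(grad_analytic) + abs(grad_numerical), 1e-12)
      print('%s: analytic %e, numerical %e, relative error %e'
            % (name, grad_analytic, grad_numerical, rel_error))

# example call on the first training window (illustrative, not in the original script):
# grad_check([char_to_ix[ch] for ch in data[:seq_length]],
#            [char_to_ix[ch] for ch in data[1:seq_length+1]],
#            np.zeros((hidden_size, 1)))

Relative errors around 1e-7 or smaller typically mean the analytic and numerical gradients agree; errors around 1e-2 or larger usually point to a bug in the backward pass.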