@@ -0,0 +1,108 @@
""" |
|
|
Minimal character-level demo. Written by Andrej Karpathy (@karpathy) |
|
|
BSD License |
|
|
""" |
|
|
import numpy as np |

# data I/O
data = open('data.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
print '%d unique characters in data.' % (len(chars), )
vocab_size = len(chars)
data_size = len(data)
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
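
# (illustration, not in the original: for a hypothetical corpus like 'hey', chars could
# come out as ['h', 'e', 'y'] in some arbitrary order, giving e.g.
# char_to_ix = {'h': 0, 'e': 1, 'y': 2} and ix_to_char = {0: 'h', 1: 'e', 2: 'y'};
# the ordering is arbitrary because set() does not preserve order.)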

# hyperparameters
hidden_size = 50 # size of hidden layer of neurons
seq_length = 20 # number of steps to unroll the RNN for
base_learning_rate = 0.01
learning_rate_decay = 0.85 # every 1000 iterations the learning rate is multiplied by this factor
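
# (illustration, not in the original: with base_learning_rate = 0.01 and
# learning_rate_decay = 0.85, the schedule in the training loop below gives roughly
# 0.01 at n = 0, 0.01 * 0.85 = 0.0085 at n = 1000, and about 0.01 * 0.85**20 ~= 4e-4
# at n = 20000.)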

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias
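
# (note, not in the original: these parameters implement the recurrence
#   h[t] = tanh(Wxh.dot(x[t]) + Whh.dot(h[t-1]) + bh)
#   y[t] = Why.dot(h[t]) + by
# where x[t] is a 1-of-k column vector for the current character and y[t] holds
# unnormalized log probabilities over the next character.)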

def lossFun(inputs, targets, hprev):
  """
  inputs, targets are both lists of integers.
  hprev is Hx1 array of the initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in xrange(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next char
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next char (softmax)
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy) loss
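
  # (note, not in the original: the gradient of the softmax cross-entropy loss with
  # respect to ys[t] is simply ps[t] with 1 subtracted at the target index; that is
  # exactly what the "backprop into y" step below computes.)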
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhs, dys = {}, {}
  dhnext = np.zeros_like(hs[0])
  for t in reversed(xrange(len(inputs))):
    dys[t] = np.copy(ps[t])
    dys[t][targets[t]] -= 1 # backprop into y
    dWhy += np.dot(dys[t], hs[t].T)
    dby += np.copy(dys[t])
    dhs[t] = np.dot(Why.T, dys[t]) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dhs[t] # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
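
# (sketch, not in the original: a minimal numerical gradient check for the backward
# pass above, assuming the globals defined earlier; the helper name gradCheck, the
# number of checks, and delta are made-up illustrative choices.)
def gradCheck(inputs, targets, hprev, num_checks=10, delta=1e-5):
  dWhh_analytic = lossFun(inputs, targets, hprev)[2] # dWhh from the analytic backward pass
  for i in xrange(num_checks):
    ri = int(np.random.randint(Whh.size)) # pick a random weight of Whh
    old_val = Whh.flat[ri]
    Whh.flat[ri] = old_val + delta
    loss_plus = lossFun(inputs, targets, hprev)[0]
    Whh.flat[ri] = old_val - delta
    loss_minus = lossFun(inputs, targets, hprev)[0]
    Whh.flat[ri] = old_val # restore the original weight
    grad_numerical = (loss_plus - loss_minus) / (2 * delta) # centered difference
    grad_analytic = dWhh_analytic.flat[ri]
    rel_error = abs(grad_analytic - grad_numerical) / max(abs(grad_analytic) + abs(grad_numerical), 1e-10)
    print 'Whh[%d]: numerical %f, analytic %f, relative error %e' % (ri, grad_numerical, grad_analytic, rel_error)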

def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size,1))
  x[seed_ix] = 1
  ixes = []
  for t in xrange(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size,1)) # feed the sampled character back in as the next input
    x[ix] = 1
    ixes.append(ix)
  return ixes
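
# (aside, not in the original: a common variant of the sampling above uses a
# "temperature" T, replacing the softmax line with
#   p = np.exp(y / T) / np.sum(np.exp(y / T))
# where T < 1 makes samples more conservative and T > 1 more diverse.)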

n, p = 0, 0
while n < 20000:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 40)
    print 'sample:'
    print ''.join([ix_to_char[ix] for ix in sample_ix])

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch
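
  # (optional, not in the original: gradients of a vanilla RNN can explode on longer
  # sequences; a common remedy is to clip them elementwise before the update, e.g.
  #   for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
  #     np.clip(dparam, -5, 5, out=dparam)
  # where the -5/5 range is just an illustrative choice.)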

  # perform parameter update with vanilla SGD, decay learning rate
  learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
  for param, dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
    param += -learning_rate * dparam

  p += seq_length # move data pointer
  n += 1 # iteration counter
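
# (usage note, not in the original: the script expects a plain-text corpus saved as
# data.txt in the working directory and is written for Python 2 (print statements,
# xrange). With the near-zero initialization above, the first printed loss should be
# roughly seq_length * np.log(vocab_size) and should decrease as training progresses.)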