|
|
@@ -0,0 +1,183 @@ |
|
|
"""Short and sweet LSTM implementation in Tensorflow. |
|
|
|
|
|
Motivation: |
|
|
When Tensorflow was released, adding RNNs was a bit of a hack - it required |
|
|
building separate graphs for every number of timesteps and was a bit obscure |
|
|
to use. Since then TF devs added things like `dynamic_rnn`, `scan` and `map_fn`. |
|
|
Currently the APIs are decent, but all the tutorials that I am aware of are not |
|
|
making the best use of the new APIs. |
|
|
|
|
|
Advantages of this implementation: |
|
|
- No need to specify number of timesteps ahead of time. Number of timesteps is |
|
|
infered from shape of input tensor. Can use the same graph for multiple |
|
|
different numbers of timesteps. |
|
|
- No need to specify batch size ahead of time. Batch size is infered from shape |
|
|
of input tensor. Can use the same graph for multiple different batch sizes. |
|
|
- Easy to swap out different recurrent gadgets (RNN, LSTM, GRU, your new |
|
|
creative idea) |
|
|
""" |
|
|
|
|
|
|
|
|
import numpy as np |
|
|
import random |
|
|
import tensorflow as tf |
|
|
import tensorflow.contrib.layers as layers |
|
|
|
|
|
map_fn = tf.python.functional_ops.map_fn |
|
|
|
|
|
################################################################################ |
|
|
## DATASET GENERATION ## |
|
|
## ## |
|
|
## The problem we are trying to solve is adding two binary numbers. The ## |
|
|
## numbers are reversed, so that the state of RNN can add the numbers ## |
|
|
## perfectly provided it can learn to store carry in the state. Timestep t ## |
|
|
## corresponds to bit len(number) - t. ## |
|
|
################################################################################ |
|
|
|
|
|
def as_bytes(num, final_size): |
|
|
res = [] |
|
|
for _ in range(final_size): |
|
|
res.append(num % 2) |
|
|
num //= 2 |
|
|
return res |
|
|
|
|
|
def generate_example(num_bits): |
|
|
a = random.randint(0, 2**(num_bits - 1) - 1) |
|
|
b = random.randint(0, 2**(num_bits - 1) - 1) |
|
|
res = a + b |
|
|
return (as_bytes(a, num_bits), |
|
|
as_bytes(b, num_bits), |
|
|
as_bytes(res,num_bits)) |
|
|
|
|
|
def generate_batch(num_bits, batch_size): |
|
|
"""Generates instance of a problem. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
x: np.array |
|
|
two numbers to be added represented by bits. |
|
|
shape: b, i, n |
|
|
where: |
|
|
b is bit index from the end |
|
|
i is example idx in batch |
|
|
n is one of [0,1] depending for first and |
|
|
second summand respectively |
|
|
y: np.array |
|
|
the result of the addition |
|
|
shape: b, i, n |
|
|
where: |
|
|
b is bit index from the end |
|
|
i is example idx in batch |
|
|
n is always 0 |
|
|
""" |
|
|
x = np.empty((num_bits, batch_size, 2)) |
|
|
y = np.empty((num_bits, batch_size, 1)) |
|
|
|
|
|
for i in range(batch_size): |
|
|
a, b, r = generate_example(num_bits) |
|
|
x[:, i, 0] = a |
|
|
x[:, i, 1] = b |
|
|
y[:, i, 0] = r |
|
|
return x, y |
|
|
|
|
|
|
|
|
################################################################################ |
|
|
## GRAPH DEFINITION ## |
|
|
################################################################################ |
|
|
|
|
|
INPUT_SIZE = 2 # 2 bits per timestep |
|
|
RNN_HIDDEN = 20 |
|
|
OUTPUT_SIZE = 1 # 1 bit per timestep |
|
|
TINY = 1e-6 # to avoid NaNs in logs |
|
|
LEARNING_RATE = 0.01 |
|
|
|
|
|
USE_LSTM = True |
|
|
LEARN_INITIAL_STATE = True |
|
|
|
|
|
inputs = tf.placeholder(tf.float32, (None, None, INPUT_SIZE)) # (time, batch, in) |
|
|
outputs = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE)) # (time, batch, out) |
|
|
|
|
|
|
|
|
## Here cell can be any function you want, provided it has two attributes: |
|
|
# - cell.state_size - size of the hidden vector passed along timesteps |
|
|
# For LSTM this is 2 * hidden_size (memory + hidden). |
|
|
# - cell.__call__(input, state) - function that given input and previous |
|
|
# state returns tuple (output, state) where |
|
|
# state is the state passed to the next |
|
|
# timestep and output is the tensor used |
|
|
# for infering the output at timestep. For |
|
|
# example for LSTM, output is just hidden, |
|
|
# but state is memory + hidden |
|
|
if USE_LSTM: |
|
|
cell = tf.nn.rnn_cell.BasicLSTMCell(RNN_HIDDEN, state_is_tuple=False) |
|
|
else: |
|
|
cell = tf.nn.rnn_cell.BasicRNNCell(RNN_HIDDEN) |
|
|
|
|
|
if LEARN_INITIAL_STATE: |
|
|
initial_state_single = tf.get_variable("rnn_initial", |
|
|
(1, cell.state_size), |
|
|
initializer=tf.random_normal_initializer()) |
|
|
else: |
|
|
initial_state_single = tf.zeros((1, cell.state_size)) |
|
|
|
|
|
# Repeat initial_state vector along the batch index. Why does it take 3 lines to |
|
|
# do it? Who knows. |
|
|
batch_size = tf.shape(inputs)[1] |
|
|
initial_state = tf.tile(initial_state_single, tf.pack([batch_size, 1])) |
|
|
initial_state.set_shape([None, cell.state_size]) |
|
|
|
|
|
# Given inputs (time, batch, input_size) outputs a tuple |
|
|
# - outputs: (time, batch, output_size) [do not mistake with OUTPUT_SIZE] |
|
|
# - states: (time, batch, hidden_size) |
|
|
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, time_major=True) |
|
|
|
|
|
# project output from rnn output size to OUTPUT_SIZE. Sometimes it is worth adding |
|
|
# an extra layer here. |
|
|
final_projection = lambda x: layers.linear(x, num_outputs=OUTPUT_SIZE, activation_fn=tf.nn.sigmoid) |
|
|
|
|
|
# apply projection to every timestep. |
|
|
predicted_outputs = map_fn(final_projection, rnn_outputs) |
|
|
|
|
|
# compute elementwise cross entropy. |
|
|
error = -(outputs * tf.log(predicted_outputs + TINY) + (1.0 - outputs) * tf.log(1.0 - predicted_outputs + TINY)) |
|
|
error = tf.reduce_mean(error) |
|
|
|
|
|
# optimize |
|
|
train_fn = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(error) |
|
|
|
|
|
# assuming that absolute difference between output and correct answer is 0.5 |
|
|
# or less we can round it to the correct output. |
|
|
accuracy = tf.reduce_mean(tf.cast(tf.abs(outputs - predicted_outputs) < 0.5, tf.float32)) |
|
|
|
|
|
|
|
|
################################################################################ |
|
|
## TRAINING LOOP ## |
|
|
################################################################################ |
|
|
|
|
|
NUM_BITS = 10 |
|
|
ITERATIONS_PER_EPOCH = 100 |
|
|
BATCH_SIZE = 16 |
|
|
|
|
|
valid_x, valid_y = generate_batch(num_bits=NUM_BITS, batch_size=100) |
|
|
|
|
|
session = tf.Session() |
|
|
# For some reason it is our job to do this: |
|
|
session.run(tf.initialize_all_variables()) |
|
|
|
|
|
|
|
|
for epoch in range(1000): |
|
|
epoch_error = 0 |
|
|
for _ in range(ITERATIONS_PER_EPOCH): |
|
|
# here train_fn is what triggers backprop. error and accuracy on their |
|
|
# own do not trigger the backprop. |
|
|
x, y = generate_batch(num_bits=NUM_BITS, batch_size=BATCH_SIZE) |
|
|
epoch_error += session.run([error, train_fn], { |
|
|
inputs: x, |
|
|
outputs: y, |
|
|
})[0] |
|
|
epoch_error /= ITERATIONS_PER_EPOCH |
|
|
valid_accuracy = session.run(accuracy, { |
|
|
inputs: valid_x, |
|
|
outputs: valid_y, |
|
|
}) |
|
|
print "Epoch %d, train error: %.2f, valid accuracy: %.1f %%" % (epoch, epoch_error, valid_accuracy * 100.0) |