rohit22 · February 10, 2017 01:50 · Oct 4, 2016 · Oct 3, 2016 · Oct 3, 2016 · Oct 3, 2016
diff --git a/tf_lstm.py b/tf_lstm.py
@@ -107,6 +107,8 @@ def generate_batch(num_bits, batch_size):
 #                                     for infering the output at timestep. For
 #                                     example for LSTM, output is just hidden,
 #                                     but state is memory + hidden
+# Example LSTM cell with learnable zero_state can be found here:
+#    https://gist.github.com/nivwusquorum/160d5cf7e1e82c21fad3ebf04f039317
 if USE_LSTM:
     cell = tf.nn.rnn_cell.BasicLSTMCell(RNN_HIDDEN, state_is_tuple=True)
 else:

diff --git a/tf_lstm.py b/tf_lstm.py
@@ -85,22 +85,21 @@ def generate_batch(num_bits, batch_size):
 ##                           GRAPH DEFINITION                                 ##
 ################################################################################
 
-INPUT_SIZE  = 2       # 2 bits per timestep
-RNN_HIDDEN  = 20
-OUTPUT_SIZE = 1       # 1 bit per timestep
-TINY = 1e-6           # to avoid NaNs in logs
+INPUT_SIZE    = 2       # 2 bits per timestep
+RNN_HIDDEN    = 20
+OUTPUT_SIZE   = 1       # 1 bit per timestep
+TINY          = 1e-6    # to avoid NaNs in logs
 LEARNING_RATE = 0.01
 
 USE_LSTM = True
-LEARN_INITIAL_STATE = True
 
 inputs  = tf.placeholder(tf.float32, (None, None, INPUT_SIZE))  # (time, batch, in)
 outputs = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE)) # (time, batch, out)
 
 
 ## Here cell can be any function you want, provided it has two attributes:
-#     - cell.state_size - size of the hidden vector passed along timesteps
-#                         For LSTM this is 2 * hidden_size (memory + hidden).
+#     - cell.zero_state(batch_size, dtype)- tensor which is an initial value
+#                                           for state in __call__
 #     - cell.__call__(input, state) - function that given input and previous
 #                                     state returns tuple (output, state) where
 #                                     state is the state passed to the next
@@ -109,22 +108,16 @@ def generate_batch(num_bits, batch_size):
 #                                     example for LSTM, output is just hidden,
 #                                     but state is memory + hidden
 if USE_LSTM:
-    cell = tf.nn.rnn_cell.BasicLSTMCell(RNN_HIDDEN, state_is_tuple=False)
+    cell = tf.nn.rnn_cell.BasicLSTMCell(RNN_HIDDEN, state_is_tuple=True)
 else:
     cell = tf.nn.rnn_cell.BasicRNNCell(RNN_HIDDEN)
 
-if LEARN_INITIAL_STATE:
-    initial_state_single = tf.get_variable("rnn_initial",
-                                           (1, cell.state_size),
-                                           initializer=tf.random_normal_initializer())
-else:
-    initial_state_single = tf.zeros((1, cell.state_size))
-
-# Repeat initial_state vector along the batch index. Why does it take 3 lines to
-# do it? Who knows.
+# Create initial state. Here it is just a constant tensor filled with zeros,
+# but in principle it could be a learnable parameter. This is a bit tricky
+# to do for LSTM's tuple state, but can be achieved by creating two vector
+# Variables, which are then tiled along batch dimension and grouped into tuple.
 batch_size    = tf.shape(inputs)[1]
-initial_state = tf.tile(initial_state_single, tf.pack([batch_size, 1]))
-initial_state.set_shape([None, cell.state_size])
+initial_state = cell.zero_state(batch_size, tf.float32)
 
 # Given inputs (time, batch, input_size) outputs a tuple
 #  - outputs: (time, batch, output_size)  [do not mistake with OUTPUT_SIZE]
@@ -164,7 +157,6 @@ def generate_batch(num_bits, batch_size):
 # For some reason it is our job to do this:
 session.run(tf.initialize_all_variables())
 
-
 for epoch in range(1000):
     epoch_error = 0
     for _ in range(ITERATIONS_PER_EPOCH):

diff --git a/tf_lstm.py b/tf_lstm.py
@@ -0,0 +1,183 @@
+"""Short and sweet LSTM implementation in Tensorflow.
+
+Motivation:
+When Tensorflow was released, adding RNNs was a bit of a hack - it required
+building separate graphs for every number of timesteps and was a bit obscure
+to use. Since then TF devs added things like `dynamic_rnn`, `scan` and `map_fn`.
+Currently the APIs are decent, but all the tutorials that I am aware of are not
+making the best use of the new APIs.
+
+Advantages of this implementation:
+- No need to specify number of timesteps ahead of time. Number of timesteps is
+  infered from shape of input tensor. Can use the same graph for multiple
+  different numbers of timesteps.
+- No need to specify batch size ahead of time. Batch size is infered from shape
+  of input tensor. Can use the same graph for multiple different batch sizes.
+- Easy to swap out different recurrent gadgets (RNN, LSTM, GRU, your new
+  creative idea)
+"""
+
+
+import numpy as np
+import random
+import tensorflow as tf
+import tensorflow.contrib.layers as layers
+
+map_fn = tf.python.functional_ops.map_fn
+
+################################################################################
+##                           DATASET GENERATION                               ##
+##                                                                            ##
+##  The problem we are trying to solve is adding two binary numbers. The      ##
+##  numbers are reversed, so that the state of RNN can add the numbers        ##
+##  perfectly provided it can learn to store carry in the state. Timestep t   ##
+##  corresponds to bit len(number) - t.                                       ##
+################################################################################
+
+def as_bytes(num, final_size):
+    res = []
+    for _ in range(final_size):
+        res.append(num % 2)
+        num //= 2
+    return res
+
+def generate_example(num_bits):
+    a = random.randint(0, 2**(num_bits - 1) - 1)
+    b = random.randint(0, 2**(num_bits - 1) - 1)
+    res = a + b
+    return (as_bytes(a,  num_bits),
+            as_bytes(b,  num_bits),
+            as_bytes(res,num_bits))
+
+def generate_batch(num_bits, batch_size):
+    """Generates instance of a problem.
+
+    Returns
+    -------
+    x: np.array
+        two numbers to be added represented by bits.
+        shape: b, i, n
+        where:
+            b is bit index from the end
+            i is example idx in batch
+            n is one of [0,1] depending for first and
+                second summand respectively
+    y: np.array
+        the result of the addition
+        shape: b, i, n
+        where:
+            b is bit index from the end
+            i is example idx in batch
+            n is always 0
+    """
+    x = np.empty((num_bits, batch_size, 2))
+    y = np.empty((num_bits, batch_size, 1))
+
+    for i in range(batch_size):
+        a, b, r = generate_example(num_bits)
+        x[:, i, 0] = a
+        x[:, i, 1] = b
+        y[:, i, 0] = r
+    return x, y
+
+
+################################################################################
+##                           GRAPH DEFINITION                                 ##
+################################################################################
+
+INPUT_SIZE  = 2       # 2 bits per timestep
+RNN_HIDDEN  = 20
+OUTPUT_SIZE = 1       # 1 bit per timestep
+TINY = 1e-6           # to avoid NaNs in logs
+LEARNING_RATE = 0.01
+
+USE_LSTM = True
+LEARN_INITIAL_STATE = True
+
+inputs  = tf.placeholder(tf.float32, (None, None, INPUT_SIZE))  # (time, batch, in)
+outputs = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE)) # (time, batch, out)
+
+
+## Here cell can be any function you want, provided it has two attributes:
+#     - cell.state_size - size of the hidden vector passed along timesteps
+#                         For LSTM this is 2 * hidden_size (memory + hidden).
+#     - cell.__call__(input, state) - function that given input and previous
+#                                     state returns tuple (output, state) where
+#                                     state is the state passed to the next
+#                                     timestep and output is the tensor used
+#                                     for infering the output at timestep. For
+#                                     example for LSTM, output is just hidden,
+#                                     but state is memory + hidden
+if USE_LSTM:
+    cell = tf.nn.rnn_cell.BasicLSTMCell(RNN_HIDDEN, state_is_tuple=False)
+else:
+    cell = tf.nn.rnn_cell.BasicRNNCell(RNN_HIDDEN)
+
+if LEARN_INITIAL_STATE:
+    initial_state_single = tf.get_variable("rnn_initial",
+                                           (1, cell.state_size),
+                                           initializer=tf.random_normal_initializer())
+else:
+    initial_state_single = tf.zeros((1, cell.state_size))
+
+# Repeat initial_state vector along the batch index. Why does it take 3 lines to
+# do it? Who knows.
+batch_size    = tf.shape(inputs)[1]
+initial_state = tf.tile(initial_state_single, tf.pack([batch_size, 1]))
+initial_state.set_shape([None, cell.state_size])
+
+# Given inputs (time, batch, input_size) outputs a tuple
+#  - outputs: (time, batch, output_size)  [do not mistake with OUTPUT_SIZE]
+#  - states:  (time, batch, hidden_size)
+rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, time_major=True)
+
+# project output from rnn output size to OUTPUT_SIZE. Sometimes it is worth adding
+# an extra layer here.
+final_projection = lambda x: layers.linear(x, num_outputs=OUTPUT_SIZE, activation_fn=tf.nn.sigmoid)
+
+# apply projection to every timestep.
+predicted_outputs = map_fn(final_projection, rnn_outputs)
+
+# compute elementwise cross entropy.
+error = -(outputs * tf.log(predicted_outputs + TINY) + (1.0 - outputs) * tf.log(1.0 - predicted_outputs + TINY))
+error = tf.reduce_mean(error)
+
+# optimize
+train_fn = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(error)
+
+# assuming that absolute difference between output and correct answer is 0.5
+# or less we can round it to the correct output.
+accuracy = tf.reduce_mean(tf.cast(tf.abs(outputs - predicted_outputs) < 0.5, tf.float32))
+
+
+################################################################################
+##                           TRAINING LOOP                                    ##
+################################################################################
+
+NUM_BITS = 10
+ITERATIONS_PER_EPOCH = 100
+BATCH_SIZE = 16
+
+valid_x, valid_y = generate_batch(num_bits=NUM_BITS, batch_size=100)
+
+session = tf.Session()
+# For some reason it is our job to do this:
+session.run(tf.initialize_all_variables())
+
+
+for epoch in range(1000):
+    epoch_error = 0
+    for _ in range(ITERATIONS_PER_EPOCH):
+        # here train_fn is what triggers backprop. error and accuracy on their
+        # own do not trigger the backprop.
+        x, y = generate_batch(num_bits=NUM_BITS, batch_size=BATCH_SIZE)
+        epoch_error += session.run([error, train_fn], {
+            inputs: x,
+            outputs: y,
+        })[0]
+    epoch_error /= ITERATIONS_PER_EPOCH
+    valid_accuracy = session.run(accuracy, {
+        inputs:  valid_x,
+        outputs: valid_y,
+    })
+    print "Epoch %d, train error: %.2f, valid accuracy: %.1f %%" % (epoch, epoch_error, valid_accuracy * 100.0)