@honnibal
Last active March 1, 2023 15:10

Revisions

  1. honnibal revised this gist Oct 26, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion theano_mlp_small.py
    @@ -73,7 +73,7 @@ def _make_array(xy):
     
     def _init_logreg_weights(n_hidden, n_out):
         weights = numpy.zeros((n_hidden, n_out), dtype=theano.config.floatX)
    -    bias = numpy.zeros((10,), dtype=theano.config.floatX)
    +    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
         return (
             theano.shared(name='W', borrow=True, value=weights),
             theano.shared(name='b', borrow=True, value=bias)
  2. honnibal revised this gist Oct 26, 2015. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions theano_mlp_small.py
    @@ -14,6 +14,9 @@
     * Inputs streamed to model, not pre-loaded as given
     * Minibatch size 1, i.e. `true' stochastic update
     * No early stopping
    +Released under MIT license
    +Copyright Matthew Honnibal, 2015.
     """
     import os
     import sys
  3. honnibal revised this gist Aug 29, 2015. No changes.
  4. honnibal revised this gist Jun 22, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion theano_mlp_small.py
    @@ -69,7 +69,7 @@ def _make_array(xy):
     
     
     def _init_logreg_weights(n_hidden, n_out):
    -    weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
    +    weights = numpy.zeros((n_hidden, n_out), dtype=theano.config.floatX)
         bias = numpy.zeros((10,), dtype=theano.config.floatX)
         return (
             theano.shared(name='W', borrow=True, value=weights),
  5. honnibal revised this gist Jun 19, 2015. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions theano_mlp_small.py
    @@ -176,8 +176,7 @@ def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
              dataset='mnist.pkl.gz', n_hidden=500):
         train_examples, dev_examples, test_examples = load_data(dataset)
         print '... building the model'
    -    train_model, evaluate_model = compile_model(28*28, 10, n_hidden, learning_rate,
    -        L1_reg, L2_reg)
    +    train_model, evaluate_model = compile_model(28*28, 10, n_hidden, learning_rate, L1_reg, L2_reg)
         print '... training'
         for epoch in range(1, n_epochs+1):
             for x, y in train_examples:
  6. honnibal revised this gist Jun 19, 2015. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion theano_mlp_small.py
    @@ -14,7 +14,6 @@
     * Inputs streamed to model, not pre-loaded as given
     * Minibatch size 1, i.e. `true' stochastic update
     * No early stopping
    -* Model compiled in one scope. No classes.
     """
     import os
     import sys
  7. honnibal revised this gist Jun 19, 2015. 1 changed file with 12 additions and 12 deletions.
    24 changes: 12 additions & 12 deletions theano_mlp_small.py
    @@ -2,19 +2,14 @@
     Based on the tutorial here: http://deeplearning.net/tutorial/mlp.html
     Theano is very unintuitive the first time you see it, and I found the MLP tutorial
     especially confusing.
    +This example trims away some complexities, and makes it easier to see how Theano works.
    -I think my version is clearer for two reasons:
    +Design changes:
    -1. The model is compiled in a distinct function, that only returns the train/eval
    -functions. This way the symbolic variables are not in scope of the main function,
    -making it clear that they are not part of the run-time.
    -2. No classes. The network is shown by simply chaining together the function calls
    -of the components.
    +* Model compiled in a distinct function, so that symbolic variables are not in run-time scope.
    +* No classes. Network shown by chained function calls.
    -I also made some simplifications, pruning out details which are necessary for
    -real-world use, but complicate the example:
    +Some features of original have been dropped:
     * Inputs streamed to model, not pre-loaded as given
     * Minibatch size 1, i.e. `true' stochastic update
    @@ -83,7 +78,7 @@ def _init_logreg_weights(n_hidden, n_out):
         )
     
     
    -def _init_hidden_weights(n_in, n_out, activation=T.tanh):
    +def _init_hidden_weights(n_in, n_out):
         rng = numpy.random.RandomState(1234)
         weights = numpy.asarray(
             rng.uniform(
    @@ -131,6 +126,11 @@ def compile_model(n_in, n_classes, n_hidden, learning_rate, L1_reg, L2_reg):
         # allocate symbolic variables for the data
         x = T.vector('x') # Features
         y = T.iscalar('y') # (Gold) Label
    +
    +    # Allocate and initialize weights. These are stored internally, and updated.
    +    hidden_W, hidden_b = _init_hidden_weights(n_in, n_hidden)
    +    logreg_W, logreg_b = _init_logreg_weights(n_hidden, n_classes)
    +
         # Estimate P(y | x) given the current weights
         p_y_given_x = feed_forward(
             T.nnet.softmax,
    @@ -189,4 +189,4 @@ def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
     
     
     if __name__ == '__main__':
    -    main()
    +    main()
  8. honnibal revised this gist Jun 19, 2015. 1 changed file with 15 additions and 2 deletions.
    17 changes: 15 additions & 2 deletions theano_mlp_small.py
    @@ -1,9 +1,22 @@
     """A stripped-down MLP example, using Theano.
    -Based on the tutorial here:
    +Based on the tutorial here: http://deeplearning.net/tutorial/mlp.html
    -Except:
    +Theano is very unintuitive the first time you see it, and I found the MLP tutorial
    +especially confusing.
    +I think my version is clearer for two reasons:
    +1. The model is compiled in a distinct function, that only returns the train/eval
    +functions. This way the symbolic variables are not in scope of the main function,
    +making it clear that they are not part of the run-time.
    +2. No classes. The network is shown by simply chaining together the function calls
    +of the components.
    +I also made some simplifications, pruning out details which are necessary for
    +real-world use, but complicate the example:
     * Inputs streamed to model, not pre-loaded as given
     * Minibatch size 1, i.e. `true' stochastic update
     * No early stopping
     * Model compiled in one scope. No classes.
  9. honnibal revised this gist Jun 19, 2015. 1 changed file with 16 additions and 22 deletions.
    38 changes: 16 additions & 22 deletions theano_mlp_small.py
    @@ -8,10 +8,10 @@
     * No early stopping
     * Model compiled in one scope. No classes.
     """
     
     import os
     import sys
    -import time
    +from os import path
     
     import numpy
     
    @@ -31,12 +31,12 @@ def load_data(dataset):
         data_dir, data_file = os.path.split(dataset)
         if data_dir == "" and not os.path.isfile(dataset):
             # Check if dataset is in the data directory.
    -        new_path = os.path.join(
    -            os.path.split(__file__)[0],
    -            "..",
    -            "data",
    -            dataset
    -        )
    +        data_dir = os.path.join(os.path.split(__file__)[0], "..", "data")
    +        if not path.exists(data_dir):
    +            print "No data directory to save data to. Try:"
    +            print "mkdir ../data"
    +            sys.exit(1)
    +        new_path = path.join(data_dir, data_file)
             if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
                 dataset = new_path
     
    @@ -61,7 +61,7 @@ def _make_array(xy):
             numpy.asarray(data_y, dtype='int32'))
     
     
    -def _init_maxent_weights(n_hidden, n_out):
    +def _init_logreg_weights(n_hidden, n_out):
         weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
         bias = numpy.zeros((10,), dtype=theano.config.floatX)
         return (
    @@ -98,7 +98,7 @@ def feed_forward(activation, weights, bias, input_):
         return activation(T.dot(input_, weights) + bias)
     
     def sgd_step(param, cost, learning_rate):
    -    return param - (learnign_rate * T.grad(cost, param))
    +    return param - (learning_rate * T.grad(cost, param))
     
     # These are also symbolic.
     def L1(L1_reg, w1, w2):
    @@ -109,7 +109,7 @@ def L2(L2_reg, w1, w2):
         return L2_reg * ((w1 ** 2).sum() + (w2 ** 2).sum())
     
     
    -def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
    +def compile_model(n_in, n_classes, n_hidden, learning_rate, L1_reg, L2_reg):
         '''Compile train and evaluation functions, which we'll then call iteratively
         to train the parameters. This function is called exactly once --- think of
         it like a compiler. We declare variables, allocate memory, and define some
    @@ -118,12 +118,6 @@ def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
         # allocate symbolic variables for the data
         x = T.vector('x') # Features
         y = T.iscalar('y') # (Gold) Label
    -
    -    # Weights and bias term for the hidden layer
    -    hidden_W, hidden_b = _init_hidden_weights(n_in, n_hidden, T.tanh)
    -    # Weights and bias term for the softmax (logistic regression) layer
    -    logreg_W, logreg_b = _init_logreg_weights(n_hidden, n_classes)
    -
         # Estimate P(y | x) given the current weights
         p_y_given_x = feed_forward(
             T.nnet.softmax,
    @@ -147,10 +141,10 @@ def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
         # also define how to update the weights based on the input label.
         train_model = theano.function(
             inputs=[x, y],
    -        outputs=cost,
    +        outputs=cost,  # <-- Output depends on cost, which depends on P(y | x)
             updates=[
    -            (maxent_W, sgd_step(logreg_W, cost, learning_rate)),
    -            (maxent_b, sgd_step(logreg_W, cost, learning_rate)),
    +            (logreg_W, sgd_step(logreg_W, cost, learning_rate)),
    +            (logreg_b, sgd_step(logreg_b, cost, learning_rate)),
                 (hidden_W, sgd_step(hidden_W, cost, learning_rate)),
                 (hidden_b, sgd_step(hidden_b, cost, learning_rate)),
             ]
    @@ -168,10 +162,10 @@ def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
     
     def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
              dataset='mnist.pkl.gz', n_hidden=500):
    -    print '... building the model'
    -    train_model, evaluate_model = build_model(n_hidden, 10, learning_rate,
    -        L1_reg, L2_reg)
         train_examples, dev_examples, test_examples = load_data(dataset)
    +    print '... building the model'
    +    train_model, evaluate_model = compile_model(28*28, 10, n_hidden, learning_rate,
    +        L1_reg, L2_reg)
         print '... training'
         for epoch in range(1, n_epochs+1):
             for x, y in train_examples:
  10. honnibal created this gist Jun 19, 2015.
    185 changes: 185 additions & 0 deletions theano_mlp_small.py
    @@ -0,0 +1,185 @@
    """A stripped-down MLP example, using Theano.
    Based on the tutorial here:
    Except:
    * Minibatch size 1, i.e. `true' stochastic update
    * No early stopping
    * Model compiled in one scope. No classes.
    """
    import os
    import sys
    import time

    import numpy

    import theano
    import theano.tensor as T
    import gzip
    import cPickle


    def load_data(dataset):
        ''' Loads the dataset

        :type dataset: string
        :param dataset: the path to the dataset (here MNIST)
        '''
        # Download the MNIST dataset if it is not present
        data_dir, data_file = os.path.split(dataset)
        if data_dir == "" and not os.path.isfile(dataset):
            # Check if dataset is in the data directory.
            new_path = os.path.join(
                os.path.split(__file__)[0],
                "..",
                "data",
                dataset
            )
            if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
                dataset = new_path

        if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
            import urllib
            url = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
            print 'Downloading data from %s' % url
            urllib.urlretrieve(url, dataset)

        print '... loading data'

        # Load the dataset
        with gzip.open(dataset, 'rb') as f:
            train_set, valid_set, test_set = cPickle.load(f)
        return _make_array(train_set), _make_array(valid_set), _make_array(test_set)


    def _make_array(xy):
        data_x, data_y = xy
        return zip(
            numpy.asarray(data_x, dtype=theano.config.floatX),
            numpy.asarray(data_y, dtype='int32'))


    def _init_maxent_weights(n_hidden, n_out):
        weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
        bias = numpy.zeros((10,), dtype=theano.config.floatX)
        return (
            theano.shared(name='W', borrow=True, value=weights),
            theano.shared(name='b', borrow=True, value=bias)
        )


    def _init_hidden_weights(n_in, n_out, activation=T.tanh):
        rng = numpy.random.RandomState(1234)
        weights = numpy.asarray(
            rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        )
        bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
        return (
            theano.shared(value=weights, name='W', borrow=True),
            theano.shared(value=bias, name='b', borrow=True)
        )


    # Define how an input is fed through a layer of the network, and how a step of
    # the stochastic gradient descent is computed.

    # Note that these are *symbolic expressions* --- we are just compiling code here.
    # These functions are only called during compile_model. The *actual* feed-forward
    # and SGD update procedures, which happen iteratively on each example, are
    # Theano-internal.
    def feed_forward(activation, weights, bias, input_):
        return activation(T.dot(input_, weights) + bias)

    def sgd_step(param, cost, learning_rate):
        return param - (learnign_rate * T.grad(cost, param))

    # These are also symbolic.
    def L1(L1_reg, w1, w2):
        return L1_reg * (abs(w1).sum() + abs(w2).sum())


    def L2(L2_reg, w1, w2):
        return L2_reg * ((w1 ** 2).sum() + (w2 ** 2).sum())


    def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
        '''Compile train and evaluation functions, which we'll then call iteratively
        to train the parameters. This function is called exactly once --- think of
        it like a compiler. We declare variables, allocate memory, and define some
        computation.
        '''
        # allocate symbolic variables for the data
        x = T.vector('x')  # Features
        y = T.iscalar('y')  # (Gold) Label

        # Weights and bias term for the hidden layer
        hidden_W, hidden_b = _init_hidden_weights(n_in, n_hidden, T.tanh)
        # Weights and bias term for the softmax (logistic regression) layer
        logreg_W, logreg_b = _init_logreg_weights(n_hidden, n_classes)

        # Estimate P(y | x) given the current weights
        p_y_given_x = feed_forward(
            T.nnet.softmax,
            logreg_W,
            logreg_b,
            feed_forward(
                T.tanh,
                hidden_W,
                hidden_b,
                x))  # <--- Our input variable (the features)

        cost = (
            -T.log(p_y_given_x[0, y])  # <-- Negative log likelihood of gold label
            + L1(L1_reg, logreg_W, hidden_W)
            + L2(L2_reg, logreg_W, hidden_W)
        )

        # Compile the training function. Successive calls to this update the weights.
        # Internal state is maintained.
        # The output is "cost", which requires the computation of p_y_given_x. We
        # also define how to update the weights based on the input label.
        train_model = theano.function(
            inputs=[x, y],
            outputs=cost,
            updates=[
                (maxent_W, sgd_step(logreg_W, cost, learning_rate)),
                (maxent_b, sgd_step(logreg_W, cost, learning_rate)),
                (hidden_W, sgd_step(hidden_W, cost, learning_rate)),
                (hidden_b, sgd_step(hidden_b, cost, learning_rate)),
            ]
        )

        # Compile the evaluation function, which returns a 0/1 loss wrt the true
        # label. Note that the output depends on p_y_given_x, so the program must
        # compute it.
        evaluate_model = theano.function(
            inputs=[x, y],
            outputs=T.neq(y, T.argmax(p_y_given_x[0])),
        )
        return train_model, evaluate_model


    def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', n_hidden=500):
        print '... building the model'
        train_model, evaluate_model = build_model(n_hidden, 10, learning_rate,
            L1_reg, L2_reg)
        train_examples, dev_examples, test_examples = load_data(dataset)
        print '... training'
        for epoch in range(1, n_epochs+1):
            for x, y in train_examples:
                train_model(x, y)
            # compute zero-one loss on validation set
            error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
            print('epoch %i, validation error %f %%' % (epoch, error * 100))


    if __name__ == '__main__':
        main()
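
Notes

The Oct 26, 2015 and Jun 22, 2015 revisions replace the hard-coded output size 10 with n_out in _init_logreg_weights. A minimal numpy sketch (not part of the gist; the sizes are made up) of why the shapes must agree: the softmax layer computes dot(hidden, W) + b, so W must be (n_hidden, n_out) and b must be (n_out,) as soon as n_out is anything other than 10.

    import numpy

    n_hidden, n_out = 500, 3                   # hypothetical sizes, n_out != 10
    hidden = numpy.zeros((n_hidden,))          # output of the hidden layer
    W = numpy.zeros((n_hidden, n_out))         # logreg weights, shaped from n_out

    b_ok = numpy.zeros((n_out,))
    print (numpy.dot(hidden, W) + b_ok).shape  # (3,) -- shapes agree

    b_bad = numpy.zeros((10,))                 # the hard-coded shape the revisions remove
    try:
        numpy.dot(hidden, W) + b_bad
    except ValueError as err:
        print err                              # operands could not be broadcast together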
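
The Jun 19, 2015 revisions reword the docstring around one design choice: the model is compiled in a distinct function, so the symbolic variables are never in run-time scope. A minimal sketch of that pattern (hypothetical, much smaller than the gist's MLP, and using only Theano calls that appear in the gist): the function builds the symbolic graph, compiles it, and returns only the callables.

    import numpy
    import theano
    import theano.tensor as T


    def compile_linear_model(n_in, n_out, learning_rate):
        # Symbolic inputs and shared weights live only inside this scope.
        x = T.vector('x')
        y = T.iscalar('y')
        W = theano.shared(numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W')
        b = theano.shared(numpy.zeros((n_out,), dtype=theano.config.floatX), name='b')

        p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
        cost = -T.log(p_y_given_x[0, y])

        train = theano.function(
            inputs=[x, y],
            outputs=cost,
            updates=[(p, p - learning_rate * T.grad(cost, p)) for p in (W, b)])
        evaluate = theano.function(
            inputs=[x, y],
            outputs=T.neq(y, T.argmax(p_y_given_x[0])))
        # Only the compiled functions escape; x, y, W and b do not leak out.
        return train, evaluate

Calling it once up front, e.g. train, evaluate = compile_linear_model(28*28, 10, 0.01), and then looping train(x, y) over examples mirrors the structure of the gist's main().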
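
The Jun 19, 2015 rewrite also fixes the learnign_rate typo inside sgd_step. As the gist's own comments note, sgd_step is a symbolic expression that is only evaluated inside the compiled train_model; numerically, each call performs one plain stochastic-gradient step per example (minibatch size 1). A rough numpy equivalent of a single update, with made-up numbers:

    import numpy

    learning_rate = 0.01
    param = numpy.zeros((4, 3))             # e.g. a small weight matrix
    grad = numpy.ones((4, 3)) * 0.5         # hypothetical gradient of the cost

    param = param - learning_rate * grad    # what sgd_step expresses symbolically
    print param[0]                          # [-0.005 -0.005 -0.005]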