@honnibal
Last active March 1, 2023 15:10

Revisions

  1. honnibal revised this gist Oct 26, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion theano_mlp_small.py
    @@ -73,7 +73,7 @@ def _make_array(xy):
     
     def _init_logreg_weights(n_hidden, n_out):
         weights = numpy.zeros((n_hidden, n_out), dtype=theano.config.floatX)
    -    bias = numpy.zeros((10,), dtype=theano.config.floatX)
    +    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
         return (
             theano.shared(name='W', borrow=True, value=weights),
             theano.shared(name='b', borrow=True, value=bias)
  2. honnibal revised this gist Oct 26, 2015. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions theano_mlp_small.py
    @@ -14,6 +14,9 @@
     * Inputs streamed to model, not pre-loaded as given
     * Minibatch size 1, i.e. `true' stochastic update
     * No early stopping
    +Released under MIT license
    +Copyright Matthew Honnibal, 2015.
     """
     import os
     import sys
  3. honnibal revised this gist Aug 29, 2015. No changes.
  4. honnibal revised this gist Jun 22, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion theano_mlp_small.py
    @@ -69,7 +69,7 @@ def _make_array(xy):
     
     
     def _init_logreg_weights(n_hidden, n_out):
    -    weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
    +    weights = numpy.zeros((n_hidden, n_out), dtype=theano.config.floatX)
         bias = numpy.zeros((10,), dtype=theano.config.floatX)
         return (
             theano.shared(name='W', borrow=True, value=weights),
  5. honnibal revised this gist Jun 19, 2015. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions theano_mlp_small.py
    @@ -176,8 +176,7 @@ def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
              dataset='mnist.pkl.gz', n_hidden=500):
         train_examples, dev_examples, test_examples = load_data(dataset)
         print '... building the model'
    -    train_model, evaluate_model = compile_model(28*28, 10, n_hidden, learning_rate,
    -        L1_reg, L2_reg)
    +    train_model, evaluate_model = compile_model(28*28, 10, n_hidden, learning_rate, L1_reg, L2_reg)
         print '... training'
         for epoch in range(1, n_epochs+1):
             for x, y in train_examples:
  6. honnibal revised this gist Jun 19, 2015. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion theano_mlp_small.py
    @@ -14,7 +14,6 @@
     * Inputs streamed to model, not pre-loaded as given
     * Minibatch size 1, i.e. `true' stochastic update
     * No early stopping
    -* Model compiled in one scope. No classes.
     """
     import os
     import sys
  7. honnibal revised this gist Jun 19, 2015. 1 changed file with 12 additions and 12 deletions.
    24 changes: 12 additions & 12 deletions theano_mlp_small.py
    @@ -2,19 +2,14 @@
     Based on the tutorial here: http://deeplearning.net/tutorial/mlp.html
     Theano is very unintuitive the first time you see it, and I found the MLP tutorial
     especially confusing.
    +This example trims away some complexities, and makes it easier to see how Theano works.
    -I think my version is clearer for two reasons:
    +Design changes:
    -1. The model is compiled in a distinct function, that only returns the train/eval
    -functions. This way the symbolic variables are not in scope of the main function,
    -making it clear that they are not part of the run-time.
    -2. No classes. The network is shown by simply chaining together the function calls
    -of the components.
    +* Model compiled in a distinct function, so that symbolic variables are not in run-time scope.
    +* No classes. Network shown by chained function calls.
    -I also made some simplifications, pruning out details which are necessary for
    -real-world use, but complicate the example:
    +Some features of original have been dropped:
     * Inputs streamed to model, not pre-loaded as given
     * Minibatch size 1, i.e. `true' stochastic update
    @@ -83,7 +78,7 @@ def _init_logreg_weights(n_hidden, n_out):
         )
     
     
    -def _init_hidden_weights(n_in, n_out, activation=T.tanh):
    +def _init_hidden_weights(n_in, n_out):
         rng = numpy.random.RandomState(1234)
         weights = numpy.asarray(
             rng.uniform(
    @@ -131,6 +126,11 @@ def compile_model(n_in, n_classes, n_hidden, learning_rate, L1_reg, L2_reg):
         # allocate symbolic variables for the data
         x = T.vector('x') # Features
         y = T.iscalar('y') # (Gold) Label
    +
    +    # Allocate and initialize weights. These are stored internally, and updated.
    +    hidden_W, hidden_b = _init_hidden_weights(n_in, n_hidden)
    +    logreg_W, logreg_b = _init_logreg_weights(n_hidden, n_classes)
    +
         # Estimate P(y | x) given the current weights
         p_y_given_x = feed_forward(
             T.nnet.softmax,
    @@ -189,4 +189,4 @@ def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
     
     
     if __name__ == '__main__':
    -    main()
    +    main()
  8. honnibal revised this gist Jun 19, 2015. 1 changed file with 15 additions and 2 deletions.
    17 changes: 15 additions & 2 deletions theano_mlp_small.py
    @@ -1,9 +1,22 @@
     """A stripped-down MLP example, using Theano.
    -Based on the tutorial here:
    +Based on the tutorial here: http://deeplearning.net/tutorial/mlp.html
    -Except:
    +Theano is very unintuitive the first time you see it, and I found the MLP tutorial
    +especially confusing.
    +I think my version is clearer for two reasons:
    +1. The model is compiled in a distinct function, that only returns the train/eval
    +functions. This way the symbolic variables are not in scope of the main function,
    +making it clear that they are not part of the run-time.
    +2. No classes. The network is shown by simply chaining together the function calls
    +of the components.
    +I also made some simplifications, pruning out details which are necessary for
    +real-world use, but complicate the example:
     * Inputs streamed to model, not pre-loaded as given
     * Minibatch size 1, i.e. `true' stochastic update
     * No early stopping
     * Model compiled in one scope. No classes.
  9. honnibal revised this gist Jun 19, 2015. 1 changed file with 16 additions and 22 deletions.
    38 changes: 16 additions & 22 deletions theano_mlp_small.py
    @@ -8,10 +8,10 @@
     * No early stopping
     * Model compiled in one scope. No classes.
     """
     
     import os
     import sys
    -import time
    +from os import path
     
     import numpy
     
    @@ -31,12 +31,12 @@ def load_data(dataset):
         data_dir, data_file = os.path.split(dataset)
         if data_dir == "" and not os.path.isfile(dataset):
             # Check if dataset is in the data directory.
    -        new_path = os.path.join(
    -            os.path.split(__file__)[0],
    -            "..",
    -            "data",
    -            dataset
    -        )
    +        data_dir = os.path.join(os.path.split(__file__)[0], "..", "data")
    +        if not path.exists(data_dir):
    +            print "No data directory to save data to. Try:"
    +            print "mkdir ../data"
    +            sys.exit(1)
    +        new_path = path.join(data_dir, data_file)
             if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
                 dataset = new_path
     
    @@ -61,7 +61,7 @@ def _make_array(xy):
             numpy.asarray(data_y, dtype='int32'))
     
     
    -def _init_maxent_weights(n_hidden, n_out):
    +def _init_logreg_weights(n_hidden, n_out):
         weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
         bias = numpy.zeros((10,), dtype=theano.config.floatX)
         return (
    @@ -98,7 +98,7 @@ def feed_forward(activation, weights, bias, input_):
         return activation(T.dot(input_, weights) + bias)
     
     def sgd_step(param, cost, learning_rate):
    -    return param - (learnign_rate * T.grad(cost, param))
    +    return param - (learning_rate * T.grad(cost, param))
     
     # These are also symbolic.
     def L1(L1_reg, w1, w2):
    @@ -109,7 +109,7 @@ def L2(L2_reg, w1, w2):
         return L2_reg * ((w1 ** 2).sum() + (w2 ** 2).sum())
     
     
    -def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
    +def compile_model(n_in, n_classes, n_hidden, learning_rate, L1_reg, L2_reg):
         '''Compile train and evaluation functions, which we'll then call iteratively
         to train the parameters. This function is called exactly once --- think of
         it like a compiler. We declare variables, allocate memory, and define some
    @@ -118,12 +118,6 @@ def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
         # allocate symbolic variables for the data
         x = T.vector('x') # Features
         y = T.iscalar('y') # (Gold) Label
    -
    -    # Weights and bias term for the hidden layer
    -    hidden_W, hidden_b = _init_hidden_weights(n_in, n_hidden, T.tanh)
    -    # Weights and bias term for the softmax (logistic regression) layer
    -    logreg_W, logreg_b = _init_logreg_weights(n_hidden, n_classes)
    -
         # Estimate P(y | x) given the current weights
         p_y_given_x = feed_forward(
             T.nnet.softmax,
    @@ -147,10 +141,10 @@ def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
         # also define how to update the weights based on the input label.
         train_model = theano.function(
             inputs=[x, y],
    -        outputs=cost,
    +        outputs=cost,  # <-- Output depends on cost, which depends on P(y | x)
             updates=[
    -            (maxent_W, sgd_step(logreg_W, cost, learning_rate)),
    -            (maxent_b, sgd_step(logreg_W, cost, learning_rate)),
    +            (logreg_W, sgd_step(logreg_W, cost, learning_rate)),
    +            (logreg_b, sgd_step(logreg_b, cost, learning_rate)),
                 (hidden_W, sgd_step(hidden_W, cost, learning_rate)),
                 (hidden_b, sgd_step(hidden_b, cost, learning_rate)),
             ]
    @@ -168,10 +162,10 @@ def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
     
     def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
              dataset='mnist.pkl.gz', n_hidden=500):
    -    print '... building the model'
    -    train_model, evaluate_model = build_model(n_hidden, 10, learning_rate,
    -        L1_reg, L2_reg)
         train_examples, dev_examples, test_examples = load_data(dataset)
    +    print '... building the model'
    +    train_model, evaluate_model = compile_model(28*28, 10, n_hidden, learning_rate,
    +        L1_reg, L2_reg)
         print '... training'
         for epoch in range(1, n_epochs+1):
             for x, y in train_examples:
  10. honnibal created this gist Jun 19, 2015.
    185 changes: 185 additions & 0 deletions theano_mlp_small.py
    @@ -0,0 +1,185 @@
    """A stripped-down MLP example, using Theano.
    Based on the tutorial here:
    Except:
    * Minibatch size 1, i.e. `true' stochastic update
    * No early stopping
    * Model compiled in one scope. No classes.
    """
    import os
    import sys
    import time

    import numpy

    import theano
    import theano.tensor as T
    import gzip
    import cPickle


    def load_data(dataset):
        ''' Loads the dataset

        :type dataset: string
        :param dataset: the path to the dataset (here MNIST)
        '''
        # Download the MNIST dataset if it is not present
        data_dir, data_file = os.path.split(dataset)
        if data_dir == "" and not os.path.isfile(dataset):
            # Check if dataset is in the data directory.
            new_path = os.path.join(
                os.path.split(__file__)[0],
                "..",
                "data",
                dataset
            )
            if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
                dataset = new_path

        if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
            import urllib
            url = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
            print 'Downloading data from %s' % url
            urllib.urlretrieve(url, dataset)

        print '... loading data'

        # Load the dataset
        with gzip.open(dataset, 'rb') as f:
            train_set, valid_set, test_set = cPickle.load(f)
        return _make_array(train_set), _make_array(valid_set), _make_array(test_set)


    def _make_array(xy):
        data_x, data_y = xy
        return zip(
            numpy.asarray(data_x, dtype=theano.config.floatX),
            numpy.asarray(data_y, dtype='int32'))


    def _init_maxent_weights(n_hidden, n_out):
        weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
        bias = numpy.zeros((10,), dtype=theano.config.floatX)
        return (
            theano.shared(name='W', borrow=True, value=weights),
            theano.shared(name='b', borrow=True, value=bias)
        )


    def _init_hidden_weights(n_in, n_out, activation=T.tanh):
        rng = numpy.random.RandomState(1234)
        weights = numpy.asarray(
            rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        )
        bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
        return (
            theano.shared(value=weights, name='W', borrow=True),
            theano.shared(value=bias, name='b', borrow=True)
        )


    # Define how an input is fed through a layer of the network, and how a step of
    # the stochastic gradient descent is computed.

    # Note that these are *symbolic expressions* --- we are just compiling code here.
    # These functions are only called during compile_model. The *actual* feed-forward
    # and SGD update procedures, which happen iteratively on each example, are
    # Theano-internal.
    def feed_forward(activation, weights, bias, input_):
        return activation(T.dot(input_, weights) + bias)

    def sgd_step(param, cost, learning_rate):
        return param - (learnign_rate * T.grad(cost, param))

    # These are also symbolic.
    def L1(L1_reg, w1, w2):
        return L1_reg * (abs(w1).sum() + abs(w2).sum())


    def L2(L2_reg, w1, w2):
        return L2_reg * ((w1 ** 2).sum() + (w2 ** 2).sum())


    def compile_model(n_in, n_classes, n_hidden, learning_rate, L1, L2):
        '''Compile train and evaluation functions, which we'll then call iteratively
        to train the parameters. This function is called exactly once --- think of
        it like a compiler. We declare variables, allocate memory, and define some
        computation.
        '''
        # allocate symbolic variables for the data
        x = T.vector('x')  # Features
        y = T.iscalar('y')  # (Gold) Label

        # Weights and bias term for the hidden layer
        hidden_W, hidden_b = _init_hidden_weights(n_in, n_hidden, T.tanh)
        # Weights and bias term for the softmax (logistic regression) layer
        logreg_W, logreg_b = _init_logreg_weights(n_hidden, n_classes)

        # Estimate P(y | x) given the current weights
        p_y_given_x = feed_forward(
            T.nnet.softmax,
            logreg_W,
            logreg_b,
            feed_forward(
                T.tanh,
                hidden_W,
                hidden_b,
                x))  # <--- Our input variable (the features)

        cost = (
            -T.log(p_y_given_x[0, y])  # <-- Negative log likelihood of gold label
            + L1(L1_reg, logreg_W, hidden_W)
            + L2(L2_reg, logreg_W, hidden_W)
        )

        # Compile the training function. Successive calls to this update the weights.
        # Internal state is maintained.
        # The output is "cost", which requires the computation of p_y_given_x. We
        # also define how to update the weights based on the input label.
        train_model = theano.function(
            inputs=[x, y],
            outputs=cost,
            updates=[
                (maxent_W, sgd_step(logreg_W, cost, learning_rate)),
                (maxent_b, sgd_step(logreg_W, cost, learning_rate)),
                (hidden_W, sgd_step(hidden_W, cost, learning_rate)),
                (hidden_b, sgd_step(hidden_b, cost, learning_rate)),
            ]
        )

        # Compile the evaluation function, which returns a 0/1 loss wrt the true
        # label. Note that the output depends on p_y_given_x, so the program must
        # compute it.
        evaluate_model = theano.function(
            inputs=[x, y],
            outputs=T.neq(y, T.argmax(p_y_given_x[0])),
        )
        return train_model, evaluate_model


    def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', n_hidden=500):
        print '... building the model'
        train_model, evaluate_model = build_model(n_hidden, 10, learning_rate,
            L1_reg, L2_reg)
        train_examples, dev_examples, test_examples = load_data(dataset)
        print '... training'
        for epoch in range(1, n_epochs+1):
            for x, y in train_examples:
                train_model(x, y)
            # compute zero-one loss on validation set
            error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
            print('epoch %i, validation error %f %%' % (epoch, error * 100))


    if __name__ == '__main__':
        main()
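
Notes

The Oct 26, 2015 and Jun 22, 2015 revisions replace the hard-coded output size 10 with n_out in _init_logreg_weights. A minimal numpy sketch (not part of the gist; the sizes are made up) of why the shapes must agree: the softmax layer computes dot(hidden, W) + b, so W must be (n_hidden, n_out) and b must be (n_out,) as soon as n_out is anything other than 10.

    import numpy

    n_hidden, n_out = 500, 3                   # hypothetical sizes, n_out != 10
    hidden = numpy.zeros((n_hidden,))          # output of the hidden layer
    W = numpy.zeros((n_hidden, n_out))         # logreg weights, shaped from n_out

    b_ok = numpy.zeros((n_out,))
    print (numpy.dot(hidden, W) + b_ok).shape  # (3,) -- shapes agree

    b_bad = numpy.zeros((10,))                 # the hard-coded shape the revisions remove
    try:
        numpy.dot(hidden, W) + b_bad
    except ValueError as err:
        print err                              # operands could not be broadcast together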
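
The Jun 19, 2015 revisions reword the docstring around one design choice: the model is compiled in a distinct function, so the symbolic variables are never in run-time scope. A minimal sketch of that pattern (hypothetical, much smaller than the gist's MLP, and using only Theano calls that appear in the gist): the function builds the symbolic graph, compiles it, and returns only the callables.

    import numpy
    import theano
    import theano.tensor as T


    def compile_linear_model(n_in, n_out, learning_rate):
        # Symbolic inputs and shared weights live only inside this scope.
        x = T.vector('x')
        y = T.iscalar('y')
        W = theano.shared(numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W')
        b = theano.shared(numpy.zeros((n_out,), dtype=theano.config.floatX), name='b')

        p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
        cost = -T.log(p_y_given_x[0, y])

        train = theano.function(
            inputs=[x, y],
            outputs=cost,
            updates=[(p, p - learning_rate * T.grad(cost, p)) for p in (W, b)])
        evaluate = theano.function(
            inputs=[x, y],
            outputs=T.neq(y, T.argmax(p_y_given_x[0])))
        # Only the compiled functions escape; x, y, W and b do not leak out.
        return train, evaluate

Calling it once up front, e.g. train, evaluate = compile_linear_model(28*28, 10, 0.01), and then looping train(x, y) over examples mirrors the structure of the gist's main().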
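
The Jun 19, 2015 rewrite also fixes the learnign_rate typo inside sgd_step. As the gist's own comments note, sgd_step is a symbolic expression that is only evaluated inside the compiled train_model; numerically, each call performs one plain stochastic-gradient step per example (minibatch size 1). A rough numpy equivalent of a single update, with made-up numbers:

    import numpy

    learning_rate = 0.01
    param = numpy.zeros((4, 3))             # e.g. a small weight matrix
    grad = numpy.ones((4, 3)) * 0.5         # hypothetical gradient of the cost

    param = param - learning_rate * grad    # what sgd_step expresses symbolically
    print param[0]                          # [-0.005 -0.005 -0.005]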