Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save andrewssobral/9a649c3e98bb5b2654d4089ddc20d4de to your computer and use it in GitHub Desktop.
Save andrewssobral/9a649c3e98bb5b2654d4089ddc20d4de to your computer and use it in GitHub Desktop.

Revisions

  1. @fchollet fchollet created this gist Jun 6, 2016.
    162 changes: 162 additions & 0 deletions classifier_from_little_data_script_2.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,162 @@
    '''This script goes along the blog post
    "Building powerful image classification models using very little data"
    from blog.keras.io.
    It uses data that can be downloaded at:
    https://www.kaggle.com/c/dogs-vs-cats/data
    In our setup, we:
    - created a data/ folder
    - created train/ and validation/ subfolders inside data/
    - created cats/ and dogs/ subfolders inside train/ and validation/
    - put the cat pictures index 0-999 in data/train/cats
    - put the cat pictures index 1000-1400 in data/validation/cats
    - put the dogs pictures index 12500-13499 in data/train/dogs
    - put the dog pictures index 13500-13900 in data/validation/dogs
    So that we have 1000 training examples for each class, and 400 validation examples for each class.
    In summary, this is our directory structure:
    ```
    data/
    train/
    dogs/
    dog001.jpg
    dog002.jpg
    ...
    cats/
    cat001.jpg
    cat002.jpg
    ...
    validation/
    dogs/
    dog001.jpg
    dog002.jpg
    ...
    cats/
    cat001.jpg
    cat002.jpg
    ...
    ```
    '''
    import os
    import h5py
    import numpy as np
    from keras.preprocessing.image import ImageDataGenerator
    from keras.models import Sequential
    from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
    from keras.layers import Activation, Dropout, Flatten, Dense

    # path to the model weights file.
    weights_path = '../keras/examples/vgg16_weights.h5'
    top_model_weights_path = 'bottleneck_fc_model.h5'
    # dimensions of our images.
    img_width, img_height = 150, 150

    train_data_dir = 'data/train'
    validation_data_dir = 'data/validation'
    nb_train_samples = 2000
    nb_validation_samples = 800
    nb_epoch = 50


    def save_bottlebeck_features():
    datagen = ImageDataGenerator(rescale=1./255)

    # build the VGG16 network
    model = Sequential()
    model.add(ZeroPadding2D((1, 1), input_shape=(3, img_width, img_height)))

    model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # load the weights of the VGG16 networks
    # (trained on ImageNet, won the ILSVRC competition in 2014)
    # note: when there is a complete match between your model definition
    # and your weight savefile, you can simply call model.load_weights(filename)
    assert os.path.exists(weights_path), 'Model weights not found (see "weights_path" variable in script).'
    f = h5py.File(weights_path)
    for k in range(f.attrs['nb_layers']):
    if k >= len(model.layers):
    # we don't look at the last (fully-connected) layers in the savefile
    break
    g = f['layer_{}'.format(k)]
    weights = [g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])]
    model.layers[k].set_weights(weights)
    f.close()
    print('Model loaded.')

    generator = datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_width, img_height),
    batch_size=32,
    class_mode=None,
    shuffle=False)
    bottleneck_features_train = model.predict_generator(generator, nb_train_samples)
    np.save(open('bottleneck_features_train.npy', 'w'), bottleneck_features_train)

    generator = datagen.flow_from_directory(
    validation_data_dir,
    target_size=(img_width, img_height),
    batch_size=32,
    class_mode=None,
    shuffle=False)
    bottleneck_features_validation = model.predict_generator(generator, nb_validation_samples)
    np.save(open('bottleneck_features_validation.npy', 'w'), bottleneck_features_validation)


    def train_top_model():
    train_data = np.load(open('bottleneck_features_train.npy'))
    train_labels = np.array([0] * (nb_train_samples / 2) + [1] * (nb_train_samples / 2))

    validation_data = np.load(open('bottleneck_features_validation.npy'))
    validation_labels = np.array([0] * (nb_validation_samples / 2) + [1] * (nb_validation_samples / 2))

    model = Sequential()
    model.add(Flatten(input_shape=train_data.shape[1:]))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(train_data, train_labels,
    nb_epoch=nb_epoch, batch_size=32,
    validation_data=(validation_data, validation_labels))
    model.save_weights(top_model_weights_path)


    save_bottlebeck_features()
    train_top_model()