'''This script goes along the blog post "Building powerful image classification models using very little data" from blog.keras.io. It uses data that can be downloaded at: https://www.kaggle.com/c/dogs-vs-cats/data In our setup, we: - created a data/ folder - created train/ and validation/ subfolders inside data/ - created cats/ and dogs/ subfolders inside train/ and validation/ - put the cat pictures index 0-999 in data/train/cats - put the cat pictures index 1000-1400 in data/validation/cats - put the dogs pictures index 12500-13499 in data/train/dogs - put the dog pictures index 13500-13900 in data/validation/dogs So that we have 1000 training examples for each class, and 400 validation examples for each class. In summary, this is our directory structure: ``` data/ train/ dogs/ dog001.jpg dog002.jpg ... cats/ cat001.jpg cat002.jpg ... validation/ dogs/ dog001.jpg dog002.jpg ... cats/ cat001.jpg cat002.jpg ... ``` ''' import os import h5py import numpy as np from keras.preprocessing.image import ImageDataGenerator from keras.models import Sequential from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D from keras.layers import Activation, Dropout, Flatten, Dense # path to the model weights file. weights_path = '../keras/examples/vgg16_weights.h5' top_model_weights_path = 'bottleneck_fc_model.h5' # dimensions of our images. img_width, img_height = 150, 150 train_data_dir = 'data/train' validation_data_dir = 'data/validation' nb_train_samples = 2000 nb_validation_samples = 800 nb_epoch = 50 def save_bottlebeck_features(): datagen = ImageDataGenerator(rescale=1./255) # build the VGG16 network model = Sequential() model.add(ZeroPadding2D((1, 1), input_shape=(3, img_width, img_height))) model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_1')) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_2')) model.add(MaxPooling2D((2, 2), strides=(2, 2))) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_1')) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_2')) model.add(MaxPooling2D((2, 2), strides=(2, 2))) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_1')) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_2')) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_3')) model.add(MaxPooling2D((2, 2), strides=(2, 2))) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_1')) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_2')) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_3')) model.add(MaxPooling2D((2, 2), strides=(2, 2))) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_1')) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_2')) model.add(ZeroPadding2D((1, 1))) model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_3')) model.add(MaxPooling2D((2, 2), strides=(2, 2))) # load the weights of the VGG16 networks # (trained on ImageNet, won the ILSVRC competition in 2014) # note: when there is a complete match between your model definition # and your weight savefile, you can simply call model.load_weights(filename) assert os.path.exists(weights_path), 'Model weights not found (see "weights_path" variable in script).' f = h5py.File(weights_path) for k in range(f.attrs['nb_layers']): if k >= len(model.layers): # we don't look at the last (fully-connected) layers in the savefile break g = f['layer_{}'.format(k)] weights = [g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])] model.layers[k].set_weights(weights) f.close() print('Model loaded.') generator = datagen.flow_from_directory( train_data_dir, target_size=(img_width, img_height), batch_size=32, class_mode=None, shuffle=False) bottleneck_features_train = model.predict_generator(generator, nb_train_samples) np.save(open('bottleneck_features_train.npy', 'w'), bottleneck_features_train) generator = datagen.flow_from_directory( validation_data_dir, target_size=(img_width, img_height), batch_size=32, class_mode=None, shuffle=False) bottleneck_features_validation = model.predict_generator(generator, nb_validation_samples) np.save(open('bottleneck_features_validation.npy', 'w'), bottleneck_features_validation) def train_top_model(): train_data = np.load(open('bottleneck_features_train.npy')) train_labels = np.array([0] * (nb_train_samples / 2) + [1] * (nb_train_samples / 2)) validation_data = np.load(open('bottleneck_features_validation.npy')) validation_labels = np.array([0] * (nb_validation_samples / 2) + [1] * (nb_validation_samples / 2)) model = Sequential() model.add(Flatten(input_shape=train_data.shape[1:])) model.add(Dense(256, activation='relu')) model.add(Dropout(0.5)) model.add(Dense(1, activation='sigmoid')) model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy']) model.fit(train_data, train_labels, nb_epoch=nb_epoch, batch_size=32, validation_data=(validation_data, validation_labels)) model.save_weights(top_model_weights_path) save_bottlebeck_features() train_top_model()