{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# data pre-process\n", "\n", "The drumkit sound dataset used for the training is available here:\n", "https://s3-ap-northeast-1.amazonaws.com/codepen-dev/drumkit_dataset.zip" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['0_kick', '1_snare', '2_hihat_closed', '3_hihat_open', '4_tom_low', '5_tom_mid', '6_tom_high', '7_clap', '8_rim']\n" ] } ], "source": [ "import librosa\n", "import librosa.display\n", "import numpy as np\n", "from glob import glob\n", "%matplotlib inline\n", "\n", "N_FFT = 1024\n", "HOP_LENGTH = 256 \n", "SR = 16000\n", "MELSPEC_SIZE = 128;\n", "\n", "len_src = 3. \n", "ref_n_src = int(SR * len_src)\n", "\n", "drum_dirs = [r.split('/')[-1] for r in sorted(glob('./selected_drums/*'))]\n", "NB_CLASS = len(drum_dirs)\n", "\n", "print drum_dirs\n", "\n", "def get_melspec(filepath, hop_length=HOP_LENGTH, n_mels=128):\n", "\n", " y_tmp = np.zeros(ref_n_src)\n", " \n", " y, sr = librosa.core.load(filepath, sr = SR, mono=True)\n", " y = y[:ref_n_src]\n", " y_tmp[:len(y)] = y[:ref_n_src]\n", " \n", " # sfft -> mel conversion\n", " melspec = librosa.feature.melspectrogram(y=y_tmp, sr=sr,\n", " n_fft=N_FFT, hop_length=hop_length, n_mels=n_mels)\n", " S = librosa.power_to_db(melspec, np.max) \n", " \n", " return S" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\r", " 0%| | 0/27069 [00:00 MELSPEC_SIZE:\n", " melspec = melspec[:,:MELSPEC_SIZE]\n", " else:\n", " melspec.resize((MELSPEC_SIZE,MELSPEC_SIZE)) \n", "\n", " drum_genres.append(genre)\n", " drum_melspecs.append(melspec)\n", " except:\n", " print (\"error\", filepath)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(27069,)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "(27069, 9)\n", "(27069, 128, 128, 1)\n" ] } ], "source": [ "drum_genres = np.array(drum_genres)\n", "print(drum_genres.shape)\n", "\n", "from keras.utils import to_categorical\n", "\n", "drum_genres = to_categorical(drum_genres, NB_CLASS)\n", "print(drum_genres.shape)\n", "\n", "drum_melspecs = np.array(drum_melspecs)\n", "drum_melspecs = np.expand_dims(drum_melspecs, 3)\n", "print(drum_melspecs.shape)\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "np.savez(\"drum_data_128.npz\", melspecs=drum_melspecs, genres=drum_genres)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "## loading from pre-processed npz file\n", "# drum_melspecs = np.load(\"drum_data_128.npz\")['melspecs']\n", "# drum_genres = np.load(\"drum_data_128.npz\")['genres']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# training" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import os\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", "\n", "from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten\n", "from keras.layers import BatchNormalization,Activation\n", "from keras.layers.advanced_activations import ELU\n", "\n", "from keras.models import Model\n", "from keras import backend as K\n", "\n", "SIZE = MELSPEC_SIZE\n", "\n", "input_img = Input(shape=(SIZE, SIZE, 1)) # normalized, 128 x 128\n", "\n", "x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(input_img) #nb_filter, nb_row, nb_col\n", "x = BatchNormalization(axis=1)(x)\n", "x = ELU(alpha=1.0)(x)\n", "x = MaxPooling2D((4, 4))(x)\n", "\n", "x = Conv2D(64, (3, 3), padding='same',kernel_initializer='he_normal')(x)\n", "x = BatchNormalization(axis=1)(x)\n", "x = ELU(alpha=1.0)(x)\n", "x = MaxPooling2D((2, 2))(x)\n", "\n", "\n", "# x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal')(x)\n", "# x = BatchNormalization(axis=1)(x)\n", "# x = ELU(alpha=1.0)(x)\n", "# x = MaxPooling2D((2, 2), padding='same')(x)\n", "# print K.int_shape(x)\n", "\n", "x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(x)\n", "x = BatchNormalization(axis=1)(x)\n", "x = ELU(alpha=1.0)(x)\n", "x = MaxPooling2D((2, 4))(x)\n", "\n", "x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(x)\n", "x = BatchNormalization(axis=1)(x)\n", "x = ELU(alpha=1.0)(x)\n", "x = MaxPooling2D((2, 2))(x)\n", "x = Flatten()(x)\n", "x = Dense(NB_CLASS)(x)\n", "y = Activation(\"softmax\")(x)\n", "\n", "model = Model(input_img, y)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "input_1 (InputLayer) (None, 128, 128, 1) 0 \n", "_________________________________________________________________\n", "conv2d_1 (Conv2D) (None, 128, 128, 32) 320 \n", "_________________________________________________________________\n", "batch_normalization_1 (Batch (None, 128, 128, 32) 512 \n", "_________________________________________________________________\n", "elu_1 (ELU) (None, 128, 128, 32) 0 \n", "_________________________________________________________________\n", "max_pooling2d_1 (MaxPooling2 (None, 32, 32, 32) 0 \n", "_________________________________________________________________\n", "conv2d_2 (Conv2D) (None, 32, 32, 64) 18496 \n", "_________________________________________________________________\n", "batch_normalization_2 (Batch (None, 32, 32, 64) 128 \n", "_________________________________________________________________\n", "elu_2 (ELU) (None, 32, 32, 64) 0 \n", "_________________________________________________________________\n", "max_pooling2d_2 (MaxPooling2 (None, 16, 16, 64) 0 \n", "_________________________________________________________________\n", "conv2d_3 (Conv2D) (None, 16, 16, 32) 18464 \n", "_________________________________________________________________\n", "batch_normalization_3 (Batch (None, 16, 16, 32) 64 \n", "_________________________________________________________________\n", "elu_3 (ELU) (None, 16, 16, 32) 0 \n", "_________________________________________________________________\n", "max_pooling2d_3 (MaxPooling2 (None, 8, 4, 32) 0 \n", "_________________________________________________________________\n", "conv2d_4 (Conv2D) (None, 8, 4, 32) 9248 \n", "_________________________________________________________________\n", "batch_normalization_4 (Batch (None, 8, 4, 32) 32 \n", "_________________________________________________________________\n", "elu_4 (ELU) (None, 8, 4, 32) 0 \n", "_________________________________________________________________\n", "max_pooling2d_4 (MaxPooling2 (None, 4, 2, 32) 0 \n", "_________________________________________________________________\n", "flatten_1 (Flatten) (None, 256) 0 \n", "_________________________________________________________________\n", "dense_1 (Dense) (None, 9) 2313 \n", "_________________________________________________________________\n", "activation_1 (Activation) (None, 9) 0 \n", "=================================================================\n", "Total params: 49,577\n", "Trainable params: 49,209\n", "Non-trainable params: 368\n", "_________________________________________________________________\n" ] } ], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{0: 3.808244231851435, 1: 3.4399542508577965, 2: 13.159455517744288, 3: 8.788636363636364, 4: 13.089458413926499, 5: 17.703727926749508, 6: 13.747587607922803, 7: 21.18075117370892, 8: 243.86486486486487}\n" ] } ], "source": [ "class_weight = {}\n", "total = drum_genres.shape[0]\n", "for i in range(NB_CLASS):\n", " nb = np.sum(np.argmax(drum_genres, axis=1) == i)\n", " class_weight[i] = total / float(nb) \n", "print class_weight\n", " " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "27069 24362\n" ] } ], "source": [ "nb_total = drum_melspecs.shape[0]\n", "nb_train = int(nb_total * 0.9)\n", "print nb_total, nb_train\n", "\n", "train_melspecs = drum_melspecs[:nb_train]\n", "train_genres = drum_genres[:nb_train]\n", "\n", "val_melspecs = drum_melspecs[nb_train:]\n", "val_genres = drum_genres[nb_train:]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(24362, 9)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_melspecs.shape\n", "train_genres.shape\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 24362 samples, validate on 2707 samples\n", "Epoch 1/100\n", "24362/24362 [==============================] - 15s 620us/step - loss: 9.6032 - acc: 0.5979 - val_loss: 1.0786 - val_acc: 0.6151\n", "Epoch 2/100\n", "24362/24362 [==============================] - 14s 591us/step - loss: 7.6791 - acc: 0.7147 - val_loss: 0.7552 - val_acc: 0.7577\n", "Epoch 3/100\n", "24362/24362 [==============================] - 14s 588us/step - loss: 6.9066 - acc: 0.7466 - val_loss: 0.7529 - val_acc: 0.7639\n", "Epoch 4/100\n", "24362/24362 [==============================] - 14s 584us/step - loss: 6.4276 - acc: 0.7641 - val_loss: 0.6297 - val_acc: 0.7917\n", "Epoch 5/100\n", "24362/24362 [==============================] - 14s 589us/step - loss: 5.9326 - acc: 0.7826 - val_loss: 0.6280 - val_acc: 0.7883\n", "Epoch 6/100\n", "24362/24362 [==============================] - 14s 589us/step - loss: 5.6260 - acc: 0.7909 - val_loss: 0.5761 - val_acc: 0.8038\n", "Epoch 7/100\n", "24362/24362 [==============================] - 14s 581us/step - loss: 5.2758 - acc: 0.8036 - val_loss: 0.5437 - val_acc: 0.8138\n", "Epoch 8/100\n", "24362/24362 [==============================] - 14s 589us/step - loss: 4.9793 - acc: 0.8125 - val_loss: 0.5270 - val_acc: 0.8109\n", "Epoch 9/100\n", "24362/24362 [==============================] - 14s 580us/step - loss: 4.6689 - acc: 0.8223 - val_loss: 0.5058 - val_acc: 0.8245\n", "Epoch 10/100\n", "24362/24362 [==============================] - 14s 586us/step - loss: 4.4284 - acc: 0.8323 - val_loss: 0.5059 - val_acc: 0.8168\n", "Epoch 11/100\n", "24362/24362 [==============================] - 14s 591us/step - loss: 4.1443 - acc: 0.8411 - val_loss: 0.4714 - val_acc: 0.8312\n", "Epoch 12/100\n", "24362/24362 [==============================] - 14s 586us/step - loss: 3.8199 - acc: 0.8530 - val_loss: 0.4355 - val_acc: 0.8400\n", "Epoch 13/100\n", "24362/24362 [==============================] - 14s 592us/step - loss: 3.5505 - acc: 0.8624 - val_loss: 0.4160 - val_acc: 0.8437\n", "Epoch 14/100\n", "24362/24362 [==============================] - 14s 590us/step - loss: 3.2395 - acc: 0.8756 - val_loss: 0.3993 - val_acc: 0.8537\n", "Epoch 15/100\n", "24362/24362 [==============================] - 14s 590us/step - loss: 2.9702 - acc: 0.8843 - val_loss: 0.3749 - val_acc: 0.8655\n", "Epoch 16/100\n", "24362/24362 [==============================] - 14s 593us/step - loss: 2.7573 - acc: 0.8922 - val_loss: 0.3710 - val_acc: 0.8633\n", "Epoch 17/100\n", "24362/24362 [==============================] - 14s 587us/step - loss: 2.4935 - acc: 0.9005 - val_loss: 0.3590 - val_acc: 0.8696\n", "Epoch 18/100\n", "24362/24362 [==============================] - 14s 594us/step - loss: 2.2897 - acc: 0.9097 - val_loss: 0.3597 - val_acc: 0.8733\n", "Epoch 19/100\n", "24362/24362 [==============================] - 14s 565us/step - loss: 2.0280 - acc: 0.9197 - val_loss: 0.3834 - val_acc: 0.8633\n", "Epoch 20/100\n", "24362/24362 [==============================] - 14s 561us/step - loss: 1.8911 - acc: 0.9221 - val_loss: 0.3902 - val_acc: 0.8626\n", "Epoch 21/100\n", "24362/24362 [==============================] - 14s 556us/step - loss: 2.1735 - acc: 0.9143 - val_loss: 0.4030 - val_acc: 0.8659\n", "Epoch 22/100\n", "24362/24362 [==============================] - 14s 572us/step - loss: 1.7177 - acc: 0.9296 - val_loss: 0.3557 - val_acc: 0.8814\n", "Epoch 23/100\n", "24362/24362 [==============================] - 14s 588us/step - loss: 1.3327 - acc: 0.9449 - val_loss: 0.3461 - val_acc: 0.8881\n", "Epoch 24/100\n", "24362/24362 [==============================] - 14s 594us/step - loss: 1.2125 - acc: 0.9498 - val_loss: 0.3828 - val_acc: 0.8818\n", "Epoch 25/100\n", "24362/24362 [==============================] - 14s 587us/step - loss: 1.1479 - acc: 0.9516 - val_loss: 0.3617 - val_acc: 0.8855\n", "Epoch 26/100\n", "24362/24362 [==============================] - 14s 592us/step - loss: 1.0277 - acc: 0.9575 - val_loss: 0.3443 - val_acc: 0.8947\n", "Epoch 27/100\n", "24362/24362 [==============================] - 14s 594us/step - loss: 0.9107 - acc: 0.9624 - val_loss: 0.3447 - val_acc: 0.8958\n", "Epoch 28/100\n", "24362/24362 [==============================] - 14s 591us/step - loss: 0.9255 - acc: 0.9601 - val_loss: 0.3940 - val_acc: 0.8833\n", "Epoch 29/100\n", "24362/24362 [==============================] - 14s 592us/step - loss: 0.8915 - acc: 0.9615 - val_loss: 0.4088 - val_acc: 0.8907\n", "Epoch 30/100\n", "24362/24362 [==============================] - 14s 585us/step - loss: 0.8569 - acc: 0.9626 - val_loss: 0.3988 - val_acc: 0.8840\n", "Epoch 31/100\n", "24362/24362 [==============================] - 14s 594us/step - loss: 0.8020 - acc: 0.9649 - val_loss: 0.3974 - val_acc: 0.8914\n", "Epoch 00031: early stopping\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from keras.callbacks import EarlyStopping\n", "es = EarlyStopping(verbose=1, patience=5)\n", "\n", "model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['acc'])\n", "\n", "model.fit(train_melspecs, train_genres, batch_size=64, \n", " epochs=100, verbose=1, \n", " shuffle=False,validation_data = (val_melspecs, val_genres), class_weight=class_weight, callbacks=[es])\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "model.save(\"model/drum_spec_model_128.h5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "then you can conver keras model into tensorflow.js model with the following command \n", "\n", "```$ tensorflowjs_converter --input_format keras \\\n", " path/to/my_model.h5 \\\n", " path/to/tfjs_target_dir```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }