Deep Music Genre


In [2]:
import itertools
import os
import _pickle as pickle

import IPython
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn
from keras import optimizers
from keras.utils.vis_utils import model_to_dot
from sklearn.model_selection import train_test_split

from models import resnet, spotify

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2


Using TensorFlow backend.

Visualizing and preprocessing

We already have the preprocessed songs as melspectograms (see scripts/generate_data.py). Let's load them and see one for each genre.


In [15]:
GTZAN_ROOT = 'datasets/genres'
GTZAN_GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
CLASSES = len(GTZAN_GENRES)

In [16]:
with open(os.path.join(GTZAN_ROOT, 'data.pickle'), 'rb') as f:
    X, y = pickle.load(f)
    
print(X.shape)
print(y.shape)


(1000, 647, 128)
(1000, 10)

In [23]:
def plot_spectogram(x, tag=None):
    plt.figure(figsize=(10, 5))
    librosa.display.specshow(x.T, y_axis='mel', x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    if tag:
        plt.title('Melspectogram for ' + genre + ' 0')
        plt.tight_layout()

In [18]:
for i, genre in enumerate(GTZAN_GENRES):
    plot_spectogram(X[i*100, :, :], tag=genre)


Let's split the data into validation, test and train sets. Then we'll also shuffle it.


In [5]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=42)

print('TRAIN: X = {0}, y = {1}'.format(X_train.shape, y_train.shape))
print('TEST: X = {0}, y = {1}'.format(y_val.shape, y_val.shape))


TRAIN: X = (670, 647, 128), y = (670, 10)
TEST: X = (330, 10), y = (330, 10)

Now let's center the data around the mean, and split it into test, train and validation sets.


In [6]:
mean_spectogram = np.mean(X_train, axis=0)
plt.figure(figsize=(10, 5))
librosa.display.specshow(mean_spectogram.T, y_axis='mel', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mean Melspectogram')
plt.tight_layout()



In [7]:
std_spectogram = np.std(X_train, axis=0)
X_val = (X_val - mean_spectogram) / std_spectogram
X_train = (X_train - mean_spectogram) / std_spectogram

Training the model

We'll be using a Resnet model with 5 Identity residual units per residual block (as described in Identity Mappings in Deep Residual Networks).


In [ ]:
X_train = X_train.reshape(X_train.shape + (1,))
X_val = X_val.reshape(X_val.shape + (1,))
model = resnet.get(X_train[0].shape, CLASSES, resnet.IdentityResidualUnit, [1,1])
#model = spotify.get(X_train[0].shape, CLASSES)
IPython.display.SVG(model_to_dot(model).create(prog='dot', format='svg'))


Out[ ]:
G 139623703559080 input_1: InputLayer 139623703836824 conv2d_1: Conv2D 139623703559080->139623703836824 139623703837776 batch_normalization_1: BatchNormalization 139623703836824->139623703837776 139623703936304 activation_1: Activation 139623703837776->139623703936304 139623704653384 max_pooling2d_1: MaxPooling2D 139623703936304->139623704653384 139623704427768 batch_normalization_2: BatchNormalization 139623704653384->139623704427768 139623705839040 add_1: Add 139623704653384->139623705839040 139623704640088 activation_2: Activation 139623704427768->139623704640088 139623704640760 conv2d_2: Conv2D 139623704640088->139623704640760 139623705296344 batch_normalization_3: BatchNormalization 139623704640760->139623705296344 139623705056536 activation_3: Activation 139623705296344->139623705056536 139623705058888 conv2d_3: Conv2D 139623705056536->139623705058888 139623705058888->139623705839040 139623705980376 batch_normalization_4: BatchNormalization 139623705839040->139623705980376 139623707974288 conv2d_6: Conv2D 139623705839040->139623707974288 139623705817384 activation_4: Activation 139623705980376->139623705817384 139623705821136 conv2d_4: Conv2D 139623705817384->139623705821136 139623706627040 batch_normalization_5: BatchNormalization 139623705821136->139623706627040 139623706873752 activation_5: Activation 139623706627040->139623706873752 139623683779384 batch_normalization_6: BatchNormalization 139623707974288->139623683779384 139623706870168 conv2d_5: Conv2D 139623706873752->139623706870168 139623707469584 add_2: Add 139623683779384->139623707469584 139623706870168->139623707469584 139623683981608 activation_6: Activation 139623707469584->139623683981608 139623704410040 average_pooling2d_1: AveragePooling2D 139623683981608->139623704410040 139623707009208 flatten_1: Flatten 139623704410040->139623707009208 139623703797600 dense_1: Dense 139623707009208->139623703797600

Now let's train our model using Adam.


In [ ]:
BATCH_SIZE = 32
EPOCHS = 100

optimizer = optimizers.Adam(lr=0.001, decay=0.0001)
with tf.device('/gpu:0'):
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'])
    history = model.fit(
        X_train, y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(X_val, y_val),
        verbose=1)


Train on 670 samples, validate on 330 samples
Epoch 1/100
670/670 [==============================] - 130s - loss: 13.5450 - acc: 0.0970 - val_loss: 9.1285 - val_acc: 0.1545
Epoch 2/100
670/670 [==============================] - 129s - loss: 6.6978 - acc: 0.1522 - val_loss: 2.3024 - val_acc: 0.0909
Epoch 3/100
670/670 [==============================] - 129s - loss: 2.0945 - acc: 0.2507 - val_loss: 2.1287 - val_acc: 0.1939
Epoch 4/100
670/670 [==============================] - 129s - loss: 1.8002 - acc: 0.3403 - val_loss: 1.9505 - val_acc: 0.3303
Epoch 5/100
670/670 [==============================] - 128s - loss: 1.4765 - acc: 0.4866 - val_loss: 2.3850 - val_acc: 0.2606
Epoch 6/100
670/670 [==============================] - 129s - loss: 1.3801 - acc: 0.5493 - val_loss: 2.5711 - val_acc: 0.1909
Epoch 7/100
670/670 [==============================] - 128s - loss: 1.1905 - acc: 0.5716 - val_loss: 4.7831 - val_acc: 0.2333
Epoch 8/100
670/670 [==============================] - 129s - loss: 1.1235 - acc: 0.6015 - val_loss: 1.9843 - val_acc: 0.3727
Epoch 9/100
670/670 [==============================] - 129s - loss: 0.9505 - acc: 0.6612 - val_loss: 1.8931 - val_acc: 0.4030
Epoch 10/100
670/670 [==============================] - 129s - loss: 0.8190 - acc: 0.7060 - val_loss: 3.5696 - val_acc: 0.3030
Epoch 11/100
670/670 [==============================] - 129s - loss: 0.7069 - acc: 0.7582 - val_loss: 2.6428 - val_acc: 0.3636
Epoch 12/100
670/670 [==============================] - 129s - loss: 0.7194 - acc: 0.7313 - val_loss: 1.8754 - val_acc: 0.4333
Epoch 13/100
670/670 [==============================] - 129s - loss: 0.6439 - acc: 0.7701 - val_loss: 1.9104 - val_acc: 0.4606
Epoch 14/100
670/670 [==============================] - 129s - loss: 0.5015 - acc: 0.8403 - val_loss: 1.7354 - val_acc: 0.4636
Epoch 15/100
670/670 [==============================] - 129s - loss: 0.4721 - acc: 0.8358 - val_loss: 2.5687 - val_acc: 0.4303
Epoch 16/100
670/670 [==============================] - 129s - loss: 0.3980 - acc: 0.8612 - val_loss: 1.8160 - val_acc: 0.5030
Epoch 17/100
670/670 [==============================] - 129s - loss: 0.4292 - acc: 0.8627 - val_loss: 1.7972 - val_acc: 0.5212
Epoch 18/100
670/670 [==============================] - 129s - loss: 0.3550 - acc: 0.8716 - val_loss: 1.9795 - val_acc: 0.5152
Epoch 19/100
670/670 [==============================] - 129s - loss: 0.3440 - acc: 0.8896 - val_loss: 1.7645 - val_acc: 0.5515
Epoch 20/100
670/670 [==============================] - 129s - loss: 0.2792 - acc: 0.9015 - val_loss: 1.8649 - val_acc: 0.5303
Epoch 21/100
670/670 [==============================] - 128s - loss: 0.2654 - acc: 0.9060 - val_loss: 1.8379 - val_acc: 0.5394
Epoch 22/100
670/670 [==============================] - 129s - loss: 0.2203 - acc: 0.9284 - val_loss: 1.8729 - val_acc: 0.5212
Epoch 23/100
670/670 [==============================] - 129s - loss: 0.1836 - acc: 0.9373 - val_loss: 2.2595 - val_acc: 0.5152
Epoch 24/100
670/670 [==============================] - 129s - loss: 0.1654 - acc: 0.9463 - val_loss: 1.9407 - val_acc: 0.5576
Epoch 25/100
670/670 [==============================] - 129s - loss: 0.1549 - acc: 0.9522 - val_loss: 2.2583 - val_acc: 0.5485
Epoch 26/100
670/670 [==============================] - 129s - loss: 0.2849 - acc: 0.9104 - val_loss: 3.2673 - val_acc: 0.4667
Epoch 27/100
670/670 [==============================] - 129s - loss: 0.4168 - acc: 0.8552 - val_loss: 2.3934 - val_acc: 0.5394
Epoch 28/100
670/670 [==============================] - 129s - loss: 0.2236 - acc: 0.9269 - val_loss: 2.2246 - val_acc: 0.5576
Epoch 29/100
670/670 [==============================] - 129s - loss: 0.0987 - acc: 0.9746 - val_loss: 2.3679 - val_acc: 0.5242
Epoch 30/100
670/670 [==============================] - 129s - loss: 0.0866 - acc: 0.9776 - val_loss: 2.4038 - val_acc: 0.5606
Epoch 31/100
670/670 [==============================] - 121s - loss: 0.1308 - acc: 0.9522 - val_loss: 2.5675 - val_acc: 0.5212
Epoch 32/100
670/670 [==============================] - 115s - loss: 0.1904 - acc: 0.9313 - val_loss: 2.6716 - val_acc: 0.5424
Epoch 33/100
670/670 [==============================] - 115s - loss: 0.0999 - acc: 0.9716 - val_loss: 2.4817 - val_acc: 0.5636
Epoch 34/100
670/670 [==============================] - 115s - loss: 0.0395 - acc: 0.9910 - val_loss: 2.2852 - val_acc: 0.5788
Epoch 35/100
670/670 [==============================] - 115s - loss: 0.0651 - acc: 0.9761 - val_loss: 2.4823 - val_acc: 0.5818
Epoch 36/100
670/670 [==============================] - 115s - loss: 0.0613 - acc: 0.9836 - val_loss: 2.8787 - val_acc: 0.5545
Epoch 37/100
670/670 [==============================] - 115s - loss: 0.0406 - acc: 0.9940 - val_loss: 2.7708 - val_acc: 0.5333
Epoch 38/100
670/670 [==============================] - 115s - loss: 0.0445 - acc: 0.9881 - val_loss: 2.8393 - val_acc: 0.5364
Epoch 39/100
670/670 [==============================] - 115s - loss: 0.0431 - acc: 0.9851 - val_loss: 2.7652 - val_acc: 0.5485
Epoch 40/100
670/670 [==============================] - 116s - loss: 0.0463 - acc: 0.9896 - val_loss: 2.9303 - val_acc: 0.5545
Epoch 41/100
670/670 [==============================] - 115s - loss: 0.0547 - acc: 0.9851 - val_loss: 2.7817 - val_acc: 0.5515
Epoch 42/100
670/670 [==============================] - 115s - loss: 0.0651 - acc: 0.9776 - val_loss: 2.8774 - val_acc: 0.5697
Epoch 43/100
670/670 [==============================] - 115s - loss: 0.0813 - acc: 0.9776 - val_loss: 3.2824 - val_acc: 0.5091
Epoch 44/100
670/670 [==============================] - 115s - loss: 0.1156 - acc: 0.9582 - val_loss: 2.8948 - val_acc: 0.5000
Epoch 45/100
670/670 [==============================] - 115s - loss: 0.0577 - acc: 0.9776 - val_loss: 2.7191 - val_acc: 0.5394
Epoch 46/100
670/670 [==============================] - 115s - loss: 0.0378 - acc: 0.9955 - val_loss: 3.0694 - val_acc: 0.5424
Epoch 47/100
670/670 [==============================] - 115s - loss: 0.0346 - acc: 0.9925 - val_loss: 2.7279 - val_acc: 0.5818
Epoch 48/100
670/670 [==============================] - 115s - loss: 0.0316 - acc: 0.9970 - val_loss: 2.9047 - val_acc: 0.5455
Epoch 49/100
670/670 [==============================] - 115s - loss: 0.0222 - acc: 0.9925 - val_loss: 2.9600 - val_acc: 0.5606
Epoch 50/100
670/670 [==============================] - 115s - loss: 0.0233 - acc: 0.9955 - val_loss: 2.9374 - val_acc: 0.5545
Epoch 51/100
670/670 [==============================] - 115s - loss: 0.0239 - acc: 0.9940 - val_loss: 2.7677 - val_acc: 0.5727
Epoch 52/100
670/670 [==============================] - 115s - loss: 0.0146 - acc: 0.9970 - val_loss: 2.8247 - val_acc: 0.5333
Epoch 53/100
670/670 [==============================] - 115s - loss: 0.0255 - acc: 0.9940 - val_loss: 2.8240 - val_acc: 0.5848
Epoch 54/100
670/670 [==============================] - 115s - loss: 0.0276 - acc: 0.9955 - val_loss: 2.7047 - val_acc: 0.5879
Epoch 55/100
670/670 [==============================] - 116s - loss: 0.0307 - acc: 0.9910 - val_loss: 3.0759 - val_acc: 0.5485
Epoch 56/100
670/670 [==============================] - 115s - loss: 0.0347 - acc: 0.9910 - val_loss: 2.9541 - val_acc: 0.5697
Epoch 57/100
670/670 [==============================] - 115s - loss: 0.0472 - acc: 0.9806 - val_loss: 3.1897 - val_acc: 0.5545
Epoch 58/100
670/670 [==============================] - 115s - loss: 0.0497 - acc: 0.9881 - val_loss: 3.1919 - val_acc: 0.5061
Epoch 59/100
670/670 [==============================] - 115s - loss: 0.0625 - acc: 0.9806 - val_loss: 3.1261 - val_acc: 0.5727
Epoch 60/100
670/670 [==============================] - 115s - loss: 0.0960 - acc: 0.9701 - val_loss: 2.6753 - val_acc: 0.5667
Epoch 61/100
640/670 [===========================>..] - ETA: 4s - loss: 0.0785 - acc: 0.9781

In [ ]:
print(history.history.keys())

plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [ ]:
X_complete = np.vstack([X_train, X_val])
y_complete = np.argmax(np.vstack([y_train, y_val]), axis=1)
y_pred = np.argmax(model.predict(X_complete), axis=1)

cm = sklearn.metrics.confusion_matrix(y_complete, y_pred)
plt.figure()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Class predictions')
plt.colorbar()
tick_marks = np.arange(CLASSES)
plt.xticks(tick_marks, GTZAN_GENRES, rotation=45)
plt.yticks(tick_marks, GTZAN_GENRES)
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, cm[i, j], horizontalalignment='center', color='white' if cm[i, j] > cm.max() / 2 else 'black')
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

Magnatagatune


In [24]:
PATH_MAGNATAGATUNE = 'datasets/magnatagatune'
X_DATA_SHAPE = (15659, 628, 128)
Y_DATA_SHAPE = (15659, 40)

with tf.Graph().as_default():
    X_init = tf.placeholder(tf.float32, shape=X_DATA_SHAPE)
    y_init = tf.placeholder(tf.float32, shape=Y_DATA_SHAPE)
    X = tf.Variable(X_init, name='X_data')
    y = tf.Variable(y_init, name='y_data')
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, os.path.join(PATH_MAGNATAGATUNE, 'data.ckpt'))
        plot_spectogram(X[0].eval())


INFO:tensorflow:Restoring parameters from datasets/magnatagatune/data.ckpt

In [12]:
with tf.Graph().as_default():
    X_init = tf.placeholder(tf.float32, shape=(3, 4))
    y_init = tf.placeholder(tf.float32, shape=(3, 3))
    X = tf.Variable(X_init, name='X_data')
    y = tf.Variable(y_init, name='y_data')
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer(), feed_dict={
            X_init: [[2, 3, 4, 5], [5, 6, 7, 8], [8, 9, 0, 1]],
            y_init: [[1, 0, 0], [0, 1, 0], [0, 0, 1]]})
        indices = [0, 2]
        print(X[0][indices].eval())


---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/framework/common_shapes.py in _call_cpp_shape_fn_impl(op, input_tensors_needed, input_tensors_as_shapes_needed, debug_python_shape_fn, require_shape_fn)
    670           graph_def_version, node_def_str, input_shapes, input_tensors,
--> 671           input_tensors_as_shapes, status)
    672   except errors.InvalidArgumentError as err:

/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py in __exit__(self, type, value, traceback)
     88             try:
---> 89                 next(self.gen)
     90             except StopIteration:

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
    465           compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466           pywrap_tensorflow.TF_GetCode(status))
    467   finally:

InvalidArgumentError: Index out of range using input dim 1; input has only 1 dims for 'strided_slice_1' (op: 'StridedSlice') with input shapes: [4], [2], [2], [2] and with computed input tensors: input[3] = <1 1>.

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-12-44cef0da6ad6> in <module>()
     10             y_init: [[1, 0, 0], [0, 1, 0], [0, 0, 1]]})
     11         indices = [0, 2]
---> 12         print(X[0][indices].eval())

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py in _SliceHelper(tensor, slice_spec, var)
    495         ellipsis_mask=ellipsis_mask,
    496         var=var,
--> 497         name=name)
    498 
    499 

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py in strided_slice(input_, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, var, name)
    653       ellipsis_mask=ellipsis_mask,
    654       new_axis_mask=new_axis_mask,
--> 655       shrink_axis_mask=shrink_axis_mask)
    656 
    657   def assign(val):

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py in strided_slice(input, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, name)
   3566                                 ellipsis_mask=ellipsis_mask,
   3567                                 new_axis_mask=new_axis_mask,
-> 3568                                 shrink_axis_mask=shrink_axis_mask, name=name)
   3569   return result
   3570 

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py in apply_op(self, op_type_name, name, **keywords)
    766         op = g.create_op(op_type_name, inputs, output_types, name=scope,
    767                          input_types=input_types, attrs=attr_protos,
--> 768                          op_def=op_def)
    769         if output_structure:
    770           outputs = op.outputs

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/framework/ops.py in create_op(self, op_type, inputs, dtypes, input_types, name, attrs, op_def, compute_shapes, compute_device)
   2336                     original_op=self._default_original_op, op_def=op_def)
   2337     if compute_shapes:
-> 2338       set_shapes_for_outputs(ret)
   2339     self._add_op(ret)
   2340     self._record_op_seen_by_control_dependencies(ret)

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/framework/ops.py in set_shapes_for_outputs(op)
   1717       shape_func = _call_cpp_shape_fn_and_require_op
   1718 
-> 1719   shapes = shape_func(op)
   1720   if shapes is None:
   1721     raise RuntimeError(

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/framework/ops.py in call_with_requiring(op)
   1667 
   1668   def call_with_requiring(op):
-> 1669     return call_cpp_shape_fn(op, require_shape_fn=True)
   1670 
   1671   _call_cpp_shape_fn_and_require_op = call_with_requiring

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/framework/common_shapes.py in call_cpp_shape_fn(op, input_tensors_needed, input_tensors_as_shapes_needed, debug_python_shape_fn, require_shape_fn)
    608     res = _call_cpp_shape_fn_impl(op, input_tensors_needed,
    609                                   input_tensors_as_shapes_needed,
--> 610                                   debug_python_shape_fn, require_shape_fn)
    611     if not isinstance(res, dict):
    612       # Handles the case where _call_cpp_shape_fn_impl calls unknown_shape(op).

/Users/miguelfrde/.virtualenvs/cs231n-project/lib/python3.6/site-packages/tensorflow/python/framework/common_shapes.py in _call_cpp_shape_fn_impl(op, input_tensors_needed, input_tensors_as_shapes_needed, debug_python_shape_fn, require_shape_fn)
    674       missing_shape_fn = True
    675     else:
--> 676       raise ValueError(err.message)
    677 
    678   if missing_shape_fn:

ValueError: Index out of range using input dim 1; input has only 1 dims for 'strided_slice_1' (op: 'StridedSlice') with input shapes: [4], [2], [2], [2] and with computed input tensors: input[3] = <1 1>.

In [ ]: