In [1]:
import os
import h5py
import librosa
import itertools
from copy import copy
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import PReLU
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import GlobalMaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [3]:
# For reproducibility purposes
np.random.seed(42)

Read the data

Helper functions to assist the process to read songs, split then and return an array of spectrograms/melspectrograms


In [4]:
"""
@description: Method to split a song into multiple songs using overlapping windows
"""
def splitsongs(X, y, window = 0.05, overlap = 0.5):
    # Empty lists to hold our results
    temp_X = []
    temp_y = []

    # Get the input song array size
    xshape = X.shape[0]
    chunk = int(xshape*window)
    offset = int(chunk*(1.-overlap))
    
    # Split the song and create new ones on windows
    spsong = [X[i:i+chunk] for i in range(0, xshape - chunk + offset, offset)]
    for s in spsong:
        if s.shape[0] != chunk:
            continue

        temp_X.append(s)
        temp_y.append(y)

    return np.array(temp_X), np.array(temp_y)

In [5]:
"""
@description: Method to convert a list of songs to a np array of melspectrograms
"""
def to_melspectrogram(songs, n_fft=1024, hop_length=256):
    # Transformation function
    melspec = lambda x: librosa.feature.melspectrogram(x, n_fft=n_fft,
        hop_length=hop_length, n_mels=128)[:,:,np.newaxis]

    # map transformation of input songs to melspectrogram using log-scale
    tsongs = map(melspec, songs)
    # np.array([librosa.power_to_db(s, ref=np.max) for s in list(tsongs)])
    return np.array(list(tsongs))

In [6]:
def split_convert(X, y):
    arr_specs, arr_genres = [], []
    
    # Convert to spectrograms and split into small windows
    for fn, genre in zip(X, y):
        signal, sr = librosa.load(fn)
        signal = signal[:song_samples]

        # Convert to dataset of spectograms/melspectograms
        signals, y = splitsongs(signal, genre)

        # Convert to "spec" representation
        specs = to_melspectrogram(signals)

        # Save files
        arr_genres.extend(y)
        arr_specs.extend(specs)
    
    return np.array(arr_specs), to_categorical(arr_genres)

In [7]:
def read_data(src_dir, genres, song_samples):    
    # Empty array of dicts with the processed features from all files
    arr_fn = []
    arr_genres = []

    # Get file list from the folders
    for x,_ in genres.items():
        folder = src_dir + x
        for root, subdirs, files in os.walk(folder):
            for file in files:
                file_name = folder + "/" + file

                # Save the file name and the genre
                arr_fn.append(file_name)
                arr_genres.append(genres[x])
    
    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        arr_fn, arr_genres, test_size=0.3, random_state=42, stratify=arr_genres
    )
    
    # Split into small segments and convert to spectrogram
    X_train, y_train = split_convert(X_train, y_train)
    X_test, y_test = split_convert(X_test, y_test)

    return X_train, X_test, y_train, y_test

In [8]:
# Parameters
gtzan_dir = '../data/genres/'
song_samples = 660000
genres = {'metal': 0, 'disco': 1, 'classical': 2, 'hiphop': 3, 'jazz': 4, 
          'country': 5, 'pop': 6, 'blues': 7, 'reggae': 8, 'rock': 9}

# Read the data
X_train, X_test, y_train, y_test = read_data(gtzan_dir, genres, song_samples)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(27300, 128, 129, 1) (11700, 128, 129, 1) (27300, 10) (11700, 10)

In [10]:
# Histogram for train and test 
values, count = np.unique(np.argmax(y_train, axis=1), return_counts=True)
plt.bar(values, count)

values, count = np.unique(np.argmax(y_test, axis=1), return_counts=True)
plt.bar(values, count)
plt.show()


GTZAN Melspectrogram Generator


In [11]:
from tensorflow.keras.utils import Sequence

class GTZANGenerator(Sequence):
    def __init__(self, X, y, batch_size=64, is_test = False):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.is_test = is_test
    
    def __len__(self):
        return int(np.ceil(len(self.X)/self.batch_size))
    
    def __getitem__(self, index):
        # Get batch indexes
        signals = self.X[index*self.batch_size:(index+1)*self.batch_size]

        # Apply data augmentation
        if not self.is_test:
            signals = self.__augment(signals)
        return signals, self.y[index*self.batch_size:(index+1)*self.batch_size]
    
    def __augment(self, signals, hor_flip = 0.5, random_cutout = 0.5):
        spectrograms =  []
        for s in signals:
            signal = copy(s)
            
            # Perform horizontal flip
            if np.random.rand() < hor_flip:
                signal = np.flip(signal, 1)

            # Perform random cutoout of some frequency/time
            if np.random.rand() < random_cutout:
                lines = np.random.randint(signal.shape[0], size=3)
                cols = np.random.randint(signal.shape[0], size=4)
                signal[lines, :, :] = -80 # dB
                signal[:, cols, :] = -80 # dB

            spectrograms.append(signal)
        return np.array(spectrograms)
    
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.X))
        np.random.shuffle(self.indexes)
        return None

Custom CNN (Melspectrogram version)


In [12]:
def conv_block(x, n_filters, pool_size=(2, 2)):
    x = Conv2D(n_filters, (3, 3), strides=(1, 1), padding='same')(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=pool_size, strides=pool_size)(x)
    x = Dropout(0.25)(x)
    return x

In [13]:
# Model Definition
def create_model(input_shape, num_genres):
    inpt = Input(shape=input_shape)
    x = conv_block(inpt, 16)
    x = conv_block(x, 32)
    x = conv_block(x, 64)
    x = conv_block(x, 128)
    x = conv_block(x, 256)
    
    # Global Pooling and MLP
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu', 
              kernel_regularizer=tf.keras.regularizers.l2(0.02))(x)
    x = Dropout(0.25)(x)
    predictions = Dense(num_genres, 
                        activation='softmax', 
                        kernel_regularizer=tf.keras.regularizers.l2(0.02))(x)
    
    model = Model(inputs=inpt, outputs=predictions)
    return model

In [14]:
model = create_model(X_train[0].shape, 10)

In [15]:
model.summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 128, 129, 1)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 128, 129, 16)      160       
_________________________________________________________________
activation (Activation)      (None, 128, 129, 16)      0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 64, 64, 16)        0         
_________________________________________________________________
dropout (Dropout)            (None, 64, 64, 16)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 64, 32)        4640      
_________________________________________________________________
activation_1 (Activation)    (None, 64, 64, 32)        0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 32, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 32, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 32, 64)        18496     
_________________________________________________________________
activation_2 (Activation)    (None, 32, 32, 64)        0         
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 16, 16, 64)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 16, 16, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 16, 16, 128)       73856     
_________________________________________________________________
activation_3 (Activation)    (None, 16, 16, 128)       0         
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 8, 8, 128)         0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 8, 8, 128)         0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 8, 8, 256)         295168    
_________________________________________________________________
activation_4 (Activation)    (None, 8, 8, 256)         0         
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 4, 4, 256)         0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 4, 4, 256)         0         
_________________________________________________________________
flatten (Flatten)            (None, 4096)              0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 4096)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               2097664   
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                5130      
=================================================================
Total params: 2,495,114
Trainable params: 2,495,114
Non-trainable params: 0
_________________________________________________________________

Loss function


In [16]:
model.compile(loss=tf.keras.losses.categorical_crossentropy,
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

In [17]:
reduceLROnPlat = ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.95,
    patience=3,
    verbose=1,
    mode='min',
    min_delta=0.0001,
    cooldown=2,
    min_lr=1e-5
)

In [18]:
# Generators
batch_size = 128
train_generator = GTZANGenerator(X_train, y_train)
steps_per_epoch = np.ceil(len(X_train)/batch_size)

validation_generator = GTZANGenerator(X_test, y_test)
val_steps = np.ceil(len(X_test)/batch_size)

In [19]:
hist = model.fit_generator(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    validation_data=validation_generator,
    validation_steps=val_steps,
    epochs=150,
    verbose=1,
    callbacks=[reduceLROnPlat])


Epoch 1/150
214/214 [==============================] - 16s 73ms/step - loss: 7.9670 - accuracy: 0.1060 - val_loss: 3.3254 - val_accuracy: 0.1512
Epoch 2/150
214/214 [==============================] - 14s 66ms/step - loss: 2.7241 - accuracy: 0.1817 - val_loss: 2.3599 - val_accuracy: 0.2125
Epoch 3/150
214/214 [==============================] - 14s 66ms/step - loss: 2.3268 - accuracy: 0.1936 - val_loss: 2.2607 - val_accuracy: 0.2147
Epoch 4/150
214/214 [==============================] - 14s 65ms/step - loss: 2.2475 - accuracy: 0.1760 - val_loss: 2.1048 - val_accuracy: 0.2104
Epoch 5/150
214/214 [==============================] - 14s 65ms/step - loss: 2.1765 - accuracy: 0.1712 - val_loss: 2.1136 - val_accuracy: 0.1690
Epoch 6/150
214/214 [==============================] - 14s 63ms/step - loss: 2.1145 - accuracy: 0.2047 - val_loss: 2.0495 - val_accuracy: 0.1938
Epoch 7/150
214/214 [==============================] - 14s 64ms/step - loss: 2.0977 - accuracy: 0.2190 - val_loss: 2.0104 - val_accuracy: 0.2597
Epoch 8/150
214/214 [==============================] - 14s 66ms/step - loss: 2.0809 - accuracy: 0.2319 - val_loss: 1.9829 - val_accuracy: 0.2660
Epoch 9/150
214/214 [==============================] - 14s 66ms/step - loss: 2.0448 - accuracy: 0.2301 - val_loss: 2.0278 - val_accuracy: 0.2036
Epoch 10/150
214/214 [==============================] - 14s 65ms/step - loss: 2.0400 - accuracy: 0.2223 - val_loss: 1.9371 - val_accuracy: 0.2201
Epoch 11/150
214/214 [==============================] - 14s 66ms/step - loss: 2.0286 - accuracy: 0.2389 - val_loss: 1.9102 - val_accuracy: 0.2157
Epoch 12/150
214/214 [==============================] - 14s 65ms/step - loss: 1.9919 - accuracy: 0.2553 - val_loss: 1.9082 - val_accuracy: 0.2712
Epoch 13/150
214/214 [==============================] - 14s 64ms/step - loss: 1.9485 - accuracy: 0.2614 - val_loss: 1.9313 - val_accuracy: 0.2559
Epoch 14/150
214/214 [==============================] - 14s 63ms/step - loss: 2.0000 - accuracy: 0.2452 - val_loss: 1.8717 - val_accuracy: 0.2495
Epoch 15/150
214/214 [==============================] - 14s 64ms/step - loss: 1.9734 - accuracy: 0.2675 - val_loss: 1.8404 - val_accuracy: 0.2741
Epoch 16/150
214/214 [==============================] - 14s 64ms/step - loss: 1.9171 - accuracy: 0.2742 - val_loss: 1.8788 - val_accuracy: 0.2974
Epoch 17/150
214/214 [==============================] - 14s 64ms/step - loss: 1.9237 - accuracy: 0.2988 - val_loss: 1.8609 - val_accuracy: 0.2996
Epoch 18/150
214/214 [==============================] - 14s 63ms/step - loss: 1.9164 - accuracy: 0.2841 - val_loss: 1.8279 - val_accuracy: 0.28911.9599 - accu - ETA: 6s - los - ETA:  - ETA: 1s - loss: 1
Epoch 19/150
214/214 [==============================] - 14s 63ms/step - loss: 1.8822 - accuracy: 0.3049 - val_loss: 1.9588 - val_accuracy: 0.2717
Epoch 20/150
214/214 [==============================] - 14s 64ms/step - loss: 1.9182 - accuracy: 0.3020 - val_loss: 1.8090 - val_accuracy: 0.3463
Epoch 21/150
214/214 [==============================] - 14s 64ms/step - loss: 1.8934 - accuracy: 0.3177 - val_loss: 1.7859 - val_accuracy: 0.3314
Epoch 22/150
214/214 [==============================] - 14s 64ms/step - loss: 1.9114 - accuracy: 0.3090 - val_loss: 1.7856 - val_accuracy: 0.3067
Epoch 23/150
214/214 [==============================] - 14s 64ms/step - loss: 1.8505 - accuracy: 0.3221 - val_loss: 1.7803 - val_accuracy: 0.3434s: 1.8515 - ac - ETA: 1s
Epoch 24/150
214/214 [==============================] - 14s 64ms/step - loss: 1.8514 - accuracy: 0.3145 - val_loss: 1.7475 - val_accuracy: 0.3502
Epoch 25/150
214/214 [==============================] - 13s 63ms/step - loss: 1.8494 - accuracy: 0.2984 - val_loss: 1.7509 - val_accuracy: 0.3393 loss: 1.8470 - accu
Epoch 26/150
214/214 [==============================] - 13s 63ms/step - loss: 1.8433 - accuracy: 0.3456 - val_loss: 1.8446 - val_accuracy: 0.3178
Epoch 27/150
214/214 [==============================] - 14s 66ms/step - loss: 1.8155 - accuracy: 0.3541 - val_loss: 1.7450 - val_accuracy: 0.3719
Epoch 28/150
214/214 [==============================] - 14s 66ms/step - loss: 1.8141 - accuracy: 0.3394 - val_loss: 1.7023 - val_accuracy: 0.4175
Epoch 29/150
214/214 [==============================] - 15s 68ms/step - loss: 1.7866 - accuracy: 0.3501 - val_loss: 1.7601 - val_accuracy: 0.3957
Epoch 30/150
214/214 [==============================] - 15s 68ms/step - loss: 1.7743 - accuracy: 0.3759 - val_loss: 1.7338 - val_accuracy: 0.3993
Epoch 31/150
214/214 [==============================] - 14s 68ms/step - loss: 1.7356 - accuracy: 0.3798 - val_loss: 1.6585 - val_accuracy: 0.3923
Epoch 32/150
214/214 [==============================] - 15s 68ms/step - loss: 1.7659 - accuracy: 0.3817 - val_loss: 1.7022 - val_accuracy: 0.3913
Epoch 33/150
214/214 [==============================] - 14s 68ms/step - loss: 1.7168 - accuracy: 0.4047 - val_loss: 1.6327 - val_accuracy: 0.3922
Epoch 34/150
214/214 [==============================] - 14s 67ms/step - loss: 1.7661 - accuracy: 0.3752 - val_loss: 1.6221 - val_accuracy: 0.4433
Epoch 35/150
214/214 [==============================] - 14s 64ms/step - loss: 1.7567 - accuracy: 0.3774 - val_loss: 1.6006 - val_accuracy: 0.4429
Epoch 36/150
214/214 [==============================] - 14s 65ms/step - loss: 1.7019 - accuracy: 0.4069 - val_loss: 1.5065 - val_accuracy: 0.4924
Epoch 37/150
214/214 [==============================] - 14s 66ms/step - loss: 1.6326 - accuracy: 0.4322 - val_loss: 1.5537 - val_accuracy: 0.4268
Epoch 38/150
214/214 [==============================] - 14s 66ms/step - loss: 1.7175 - accuracy: 0.3936 - val_loss: 1.6342 - val_accuracy: 0.4054
Epoch 39/150
212/214 [============================>.] - ETA: 0s - loss: 1.6712 - accuracy: 0.4040
Epoch 00039: ReduceLROnPlateau reducing learning rate to 0.0009500000451225787.
214/214 [==============================] - 14s 65ms/step - loss: 1.6698 - accuracy: 0.4052 - val_loss: 1.5908 - val_accuracy: 0.4978
Epoch 40/150
214/214 [==============================] - 14s 66ms/step - loss: 1.6462 - accuracy: 0.4396 - val_loss: 1.5504 - val_accuracy: 0.4932
Epoch 41/150
214/214 [==============================] - 14s 65ms/step - loss: 1.6106 - accuracy: 0.4442 - val_loss: 1.5409 - val_accuracy: 0.4591
Epoch 42/150
214/214 [==============================] - 14s 65ms/step - loss: 1.6252 - accuracy: 0.4453 - val_loss: 1.4944 - val_accuracy: 0.4811
Epoch 43/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5681 - accuracy: 0.4697 - val_loss: 1.6367 - val_accuracy: 0.4186
Epoch 44/150
214/214 [==============================] - 14s 65ms/step - loss: 1.6644 - accuracy: 0.4265 - val_loss: 1.4482 - val_accuracy: 0.4915
Epoch 45/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5960 - accuracy: 0.4723 - val_loss: 1.4631 - val_accuracy: 0.5121
Epoch 46/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5886 - accuracy: 0.4483 - val_loss: 1.5042 - val_accuracy: 0.4788
Epoch 47/150
212/214 [============================>.] - ETA: 0s - loss: 1.6167 - accuracy: 0.4451
Epoch 00047: ReduceLROnPlateau reducing learning rate to 0.0009025000152178108.
214/214 [==============================] - 14s 65ms/step - loss: 1.6177 - accuracy: 0.4434 - val_loss: 1.5493 - val_accuracy: 0.4589
Epoch 48/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5601 - accuracy: 0.4763 - val_loss: 1.5750 - val_accuracy: 0.4667
Epoch 49/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5282 - accuracy: 0.4859 - val_loss: 1.4422 - val_accuracy: 0.5172
Epoch 50/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5719 - accuracy: 0.4829 - val_loss: 1.4582 - val_accuracy: 0.5340
Epoch 51/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5787 - accuracy: 0.4651 - val_loss: 1.6262 - val_accuracy: 0.4628
Epoch 52/150
213/214 [============================>.] - ETA: 0s - loss: 1.5508 - accuracy: 0.4981
Epoch 00052: ReduceLROnPlateau reducing learning rate to 0.0008573750033974647.
214/214 [==============================] - 14s 65ms/step - loss: 1.5483 - accuracy: 0.4989 - val_loss: 1.4542 - val_accuracy: 0.5204
Epoch 53/150
214/214 [==============================] - 14s 65ms/step - loss: 1.4796 - accuracy: 0.5036 - val_loss: 1.3985 - val_accuracy: 0.5323
Epoch 54/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5485 - accuracy: 0.4981 - val_loss: 1.4571 - val_accuracy: 0.5121curacy:  - ETA
Epoch 55/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5175 - accuracy: 0.5091 - val_loss: 1.4023 - val_accuracy: 0.5686
Epoch 56/150
212/214 [============================>.] - ETA: 0s - loss: 1.4408 - accuracy: 0.5256 ETA: 0s - loss: 1.4431 - accuracy
Epoch 00056: ReduceLROnPlateau reducing learning rate to 0.0008145062311086804.
214/214 [==============================] - 14s 65ms/step - loss: 1.4410 - accuracy: 0.5262 - val_loss: 1.4457 - val_accuracy: 0.5312
Epoch 57/150
214/214 [==============================] - 14s 64ms/step - loss: 1.4521 - accuracy: 0.5187 - val_loss: 1.3451 - val_accuracy: 0.5791
Epoch 58/150
214/214 [==============================] - 14s 65ms/step - loss: 1.4666 - accuracy: 0.5253 - val_loss: 1.3667 - val_accuracy: 0.5754
Epoch 59/150
214/214 [==============================] - 14s 65ms/step - loss: 1.5014 - accuracy: 0.5138 - val_loss: 1.3141 - val_accuracy: 0.6016
Epoch 60/150
214/214 [==============================] - 14s 65ms/step - loss: 1.4186 - accuracy: 0.5361 - val_loss: 1.3144 - val_accuracy: 0.5983cy
Epoch 61/150
214/214 [==============================] - 14s 65ms/step - loss: 1.4445 - accuracy: 0.5374 - val_loss: 1.2955 - val_accuracy: 0.5973
Epoch 62/150
214/214 [==============================] - 14s 65ms/step - loss: 1.3730 - accuracy: 0.5580 - val_loss: 1.3523 - val_accuracy: 0.5735
Epoch 63/150
214/214 [==============================] - 14s 65ms/step - loss: 1.4156 - accuracy: 0.5332 - val_loss: 1.3812 - val_accuracy: 0.5722
Epoch 64/150
213/214 [============================>.] - ETA: 0s - loss: 1.3808 - accuracy: 0.56 - ETA: 0s - loss: 1.3799 - accuracy: 0.5676
Epoch 00064: ReduceLROnPlateau reducing learning rate to 0.0007737808919046074.
214/214 [==============================] - 14s 64ms/step - loss: 1.3774 - accuracy: 0.5689 - val_loss: 1.2976 - val_accuracy: 0.6005
Epoch 65/150
214/214 [==============================] - 14s 65ms/step - loss: 1.4632 - accuracy: 0.5337 - val_loss: 1.3976 - val_accuracy: 0.5630
Epoch 66/150
214/214 [==============================] - 14s 64ms/step - loss: 1.3115 - accuracy: 0.5762 - val_loss: 1.2789 - val_accuracy: 0.5893
Epoch 67/150
214/214 [==============================] - 14s 64ms/step - loss: 1.3945 - accuracy: 0.5573 - val_loss: 1.2238 - val_accuracy: 0.6118
Epoch 68/150
214/214 [==============================] - 14s 65ms/step - loss: 1.3873 - accuracy: 0.5556 - val_loss: 1.2963 - val_accuracy: 0.5915
Epoch 69/150
214/214 [==============================] - 14s 65ms/step - loss: 1.4028 - accuracy: 0.5593 - val_loss: 1.3328 - val_accuracy: 0.5873
Epoch 70/150
212/214 [============================>.] - ETA: 0s - loss: 1.3526 - accuracy: 0.5754
Epoch 00070: ReduceLROnPlateau reducing learning rate to 0.000735091819660738.
214/214 [==============================] - 14s 64ms/step - loss: 1.3509 - accuracy: 0.5754 - val_loss: 1.3024 - val_accuracy: 0.5968
Epoch 71/150
214/214 [==============================] - 14s 65ms/step - loss: 1.3621 - accuracy: 0.5683 - val_loss: 1.2307 - val_accuracy: 0.6260
Epoch 72/150
214/214 [==============================] - 14s 64ms/step - loss: 1.3141 - accuracy: 0.5880 - val_loss: 1.3519 - val_accuracy: 0.5713
Epoch 73/150
214/214 [==============================] - 14s 64ms/step - loss: 1.2984 - accuracy: 0.5985 - val_loss: 1.2421 - val_accuracy: 0.6219
Epoch 74/150
213/214 [============================>.] - ETA: 0s - loss: 1.3435 - accuracy: 0.5870
Epoch 00074: ReduceLROnPlateau reducing learning rate to 0.0006983372120885178.
214/214 [==============================] - 14s 65ms/step - loss: 1.3461 - accuracy: 0.5861 - val_loss: 1.2365 - val_accuracy: 0.6279
Epoch 75/150
214/214 [==============================] - 14s 64ms/step - loss: 1.2671 - accuracy: 0.6016 - val_loss: 1.1901 - val_accuracy: 0.6326
Epoch 76/150
214/214 [==============================] - 14s 65ms/step - loss: 1.2949 - accuracy: 0.6068 - val_loss: 1.1961 - val_accuracy: 0.6517
Epoch 77/150
214/214 [==============================] - 14s 64ms/step - loss: 1.3069 - accuracy: 0.5925 - val_loss: 1.1749 - val_accuracy: 0.6428
Epoch 78/150
214/214 [==============================] - 14s 65ms/step - loss: 1.2715 - accuracy: 0.6067 - val_loss: 1.1779 - val_accuracy: 0.6384
Epoch 79/150
214/214 [==============================] - 14s 65ms/step - loss: 1.2999 - accuracy: 0.5969 - val_loss: 1.2386 - val_accuracy: 0.6189
Epoch 80/150
213/214 [============================>.] - ETA: 0s - loss: 1.2199 - accuracy: 0.6174
Epoch 00080: ReduceLROnPlateau reducing learning rate to 0.0006634203542489559.
214/214 [==============================] - 14s 64ms/step - loss: 1.2185 - accuracy: 0.6177 - val_loss: 1.2019 - val_accuracy: 0.6274
Epoch 81/150
214/214 [==============================] - 14s 65ms/step - loss: 1.2428 - accuracy: 0.6060 - val_loss: 1.2053 - val_accuracy: 0.6208
Epoch 82/150
214/214 [==============================] - 14s 65ms/step - loss: 1.2418 - accuracy: 0.6268 - val_loss: 1.1778 - val_accuracy: 0.6469
Epoch 83/150
214/214 [==============================] - 14s 65ms/step - loss: 1.2613 - accuracy: 0.6061 - val_loss: 1.1220 - val_accuracy: 0.6574
Epoch 84/150
214/214 [==============================] - 14s 65ms/step - loss: 1.2116 - accuracy: 0.6259 - val_loss: 1.1560 - val_accuracy: 0.6435
Epoch 85/150
214/214 [==============================] - 14s 65ms/step - loss: 1.1671 - accuracy: 0.6476 - val_loss: 1.1243 - val_accuracy: 0.6534
Epoch 86/150
213/214 [============================>.] - ETA: 0s - loss: 1.2645 - accuracy: 0.6065
Epoch 00086: ReduceLROnPlateau reducing learning rate to 0.0006302493420662358.
214/214 [==============================] - 14s 65ms/step - loss: 1.2624 - accuracy: 0.6077 - val_loss: 1.1280 - val_accuracy: 0.6557
Epoch 87/150
214/214 [==============================] - 14s 64ms/step - loss: 1.2843 - accuracy: 0.5991 - val_loss: 1.1075 - val_accuracy: 0.6800
Epoch 88/150
214/214 [==============================] - 14s 65ms/step - loss: 1.1994 - accuracy: 0.6295 - val_loss: 1.1372 - val_accuracy: 0.6664
Epoch 89/150
214/214 [==============================] - 14s 65ms/step - loss: 1.2214 - accuracy: 0.6195 - val_loss: 1.1063 - val_accuracy: 0.6636
Epoch 90/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1979 - accuracy: 0.6324 - val_loss: 1.1869 - val_accuracy: 0.6466
Epoch 91/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1501 - accuracy: 0.6513 - val_loss: 1.1806 - val_accuracy: 0.6381
Epoch 92/150
214/214 [==============================] - 14s 64ms/step - loss: 1.2516 - accuracy: 0.6121 - val_loss: 1.0893 - val_accuracy: 0.6763
Epoch 93/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1608 - accuracy: 0.6465 - val_loss: 1.2424 - val_accuracy: 0.6204
Epoch 94/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1608 - accuracy: 0.6443 - val_loss: 1.1073 - val_accuracy: 0.6454s: 1.1592 - ac
Epoch 95/150
212/214 [============================>.] - ETA: 0s - loss: 1.1237 - accuracy: 0.6496 E - ETA
Epoch 00095: ReduceLROnPlateau reducing learning rate to 0.0005987368611386045.
214/214 [==============================] - 14s 64ms/step - loss: 1.1227 - accuracy: 0.6501 - val_loss: 1.1249 - val_accuracy: 0.6646
Epoch 96/150
214/214 [==============================] - 14s 64ms/step - loss: 1.2316 - accuracy: 0.6229 - val_loss: 1.1398 - val_accuracy: 0.6719
Epoch 97/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1050 - accuracy: 0.6617 - val_loss: 1.0951 - val_accuracy: 0.6663
Epoch 98/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1834 - accuracy: 0.6448 - val_loss: 1.0893 - val_accuracy: 0.6748
Epoch 99/150
213/214 [============================>.] - ETA: 0s - loss: 1.1715 - accuracy: 0.6308
Epoch 00099: ReduceLROnPlateau reducing learning rate to 0.0005688000208465382.
214/214 [==============================] - 14s 64ms/step - loss: 1.1736 - accuracy: 0.6296 - val_loss: 1.1429 - val_accuracy: 0.6520
Epoch 100/150
214/214 [==============================] - 14s 65ms/step - loss: 1.1137 - accuracy: 0.6617 - val_loss: 1.0427 - val_accuracy: 0.6955
Epoch 101/150
214/214 [==============================] - 14s 65ms/step - loss: 1.1240 - accuracy: 0.6573 - val_loss: 1.0848 - val_accuracy: 0.6741
Epoch 102/150
214/214 [==============================] - 14s 65ms/step - loss: 1.1250 - accuracy: 0.6471 - val_loss: 1.1012 - val_accuracy: 0.6807
Epoch 103/150
212/214 [============================>.] - ETA: 0s - loss: 1.1663 - accuracy: 0.6464
Epoch 00103: ReduceLROnPlateau reducing learning rate to 0.0005403600225690752.
214/214 [==============================] - 14s 64ms/step - loss: 1.1708 - accuracy: 0.6447 - val_loss: 1.0738 - val_accuracy: 0.6787
Epoch 104/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1026 - accuracy: 0.6629 - val_loss: 1.0363 - val_accuracy: 0.6858
Epoch 105/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0789 - accuracy: 0.6742 - val_loss: 1.0715 - val_accuracy: 0.6758s: 1.0687 - accura
Epoch 106/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1163 - accuracy: 0.6637 - val_loss: 1.0308 - val_accuracy: 0.6872
Epoch 107/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1002 - accuracy: 0.6604 - val_loss: 1.0655 - val_accuracy: 0.6783
Epoch 108/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0698 - accuracy: 0.6818 - val_loss: 1.0796 - val_accuracy: 0.6799
Epoch 109/150
213/214 [============================>.] - ETA: 0s - loss: 1.0765 - accuracy: 0.6695 ETA: 3s - loss: 0
Epoch 00109: ReduceLROnPlateau reducing learning rate to 0.0005133419937919825.
214/214 [==============================] - 14s 64ms/step - loss: 1.0768 - accuracy: 0.6689 - val_loss: 1.0630 - val_accuracy: 0.6771
Epoch 110/150
214/214 [==============================] - 14s 64ms/step - loss: 1.1219 - accuracy: 0.6602 - val_loss: 1.0565 - val_accuracy: 0.6858
Epoch 111/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0740 - accuracy: 0.6751 - val_loss: 1.0143 - val_accuracy: 0.6919
Epoch 112/150
214/214 [==============================] - 14s 65ms/step - loss: 1.0932 - accuracy: 0.6625 - val_loss: 1.0376 - val_accuracy: 0.6948
Epoch 113/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0713 - accuracy: 0.6735 - val_loss: 1.0410 - val_accuracy: 0.6846
Epoch 114/150
213/214 [============================>.] - ETA: 0s - loss: 1.0459 - accuracy: 0.6882
Epoch 00114: ReduceLROnPlateau reducing learning rate to 0.0004876748775132.
214/214 [==============================] - 14s 63ms/step - loss: 1.0475 - accuracy: 0.6872 - val_loss: 1.0252 - val_accuracy: 0.6897
Epoch 115/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0946 - accuracy: 0.6663 - val_loss: 1.0041 - val_accuracy: 0.7096
Epoch 116/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0358 - accuracy: 0.6847 - val_loss: 1.0436 - val_accuracy: 0.6870
Epoch 117/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0397 - accuracy: 0.6888 - val_loss: 1.0551 - val_accuracy: 0.6707
Epoch 118/150
213/214 [============================>.] - ETA: 0s - loss: 1.0426 - accuracy: 0.6796
Epoch 00118: ReduceLROnPlateau reducing learning rate to 0.00046329112810781223.
214/214 [==============================] - 14s 64ms/step - loss: 1.0415 - accuracy: 0.6801 - val_loss: 1.0613 - val_accuracy: 0.6984
Epoch 119/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0282 - accuracy: 0.6851 - val_loss: 1.0439 - val_accuracy: 0.6863
Epoch 120/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0490 - accuracy: 0.6863 - val_loss: 1.0236 - val_accuracy: 0.6943
Epoch 121/150
214/214 [==============================] - 14s 63ms/step - loss: 1.0471 - accuracy: 0.6825 - val_loss: 0.9890 - val_accuracy: 0.7002
Epoch 122/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0509 - accuracy: 0.6755 - val_loss: 1.0191 - val_accuracy: 0.6928
Epoch 123/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0368 - accuracy: 0.6824 - val_loss: 1.0555 - val_accuracy: 0.6793
Epoch 124/150
212/214 [============================>.] - ETA: 0s - loss: 0.9772 - accuracy: 0.7067
Epoch 00124: ReduceLROnPlateau reducing learning rate to 0.00044012657308485355.
214/214 [==============================] - 14s 64ms/step - loss: 0.9746 - accuracy: 0.7077 - val_loss: 1.0288 - val_accuracy: 0.6967
Epoch 125/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0395 - accuracy: 0.6805 - val_loss: 1.0484 - val_accuracy: 0.6880
Epoch 126/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9778 - accuracy: 0.7039 - val_loss: 0.9933 - val_accuracy: 0.7053
Epoch 127/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9887 - accuracy: 0.6942 - val_loss: 0.9899 - val_accuracy: 0.7019
Epoch 128/150
212/214 [============================>.] - ETA: 0s - loss: 1.0306 - accuracy: 0.6893
Epoch 00128: ReduceLROnPlateau reducing learning rate to 0.00041812024719547477.
214/214 [==============================] - 14s 64ms/step - loss: 1.0259 - accuracy: 0.6912 - val_loss: 1.0293 - val_accuracy: 0.6878
Epoch 129/150
214/214 [==============================] - 14s 65ms/step - loss: 1.0101 - accuracy: 0.7000 - val_loss: 0.9502 - val_accuracy: 0.7201
Epoch 130/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9707 - accuracy: 0.7024 - val_loss: 0.9720 - val_accuracy: 0.7116
Epoch 131/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9802 - accuracy: 0.7109 - val_loss: 0.9754 - val_accuracy: 0.7058
Epoch 132/150
212/214 [============================>.] - ETA: 0s - loss: 0.9946 - accuracy: 0.6970
Epoch 00132: ReduceLROnPlateau reducing learning rate to 0.00039721422654110934.
214/214 [==============================] - 14s 64ms/step - loss: 0.9911 - accuracy: 0.6988 - val_loss: 1.0044 - val_accuracy: 0.6992
Epoch 133/150
214/214 [==============================] - 14s 64ms/step - loss: 1.0181 - accuracy: 0.6841 - val_loss: 0.9706 - val_accuracy: 0.7047
Epoch 134/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9608 - accuracy: 0.7084 - val_loss: 0.9957 - val_accuracy: 0.7150
Epoch 135/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9545 - accuracy: 0.7136 - val_loss: 0.9933 - val_accuracy: 0.7060
Epoch 136/150
213/214 [============================>.] - ETA: 0s - loss: 1.0073 - accuracy: 0.7032
Epoch 00136: ReduceLROnPlateau reducing learning rate to 0.00037735351797891776.
214/214 [==============================] - 14s 64ms/step - loss: 1.0072 - accuracy: 0.7030 - val_loss: 0.9948 - val_accuracy: 0.6958
Epoch 137/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9688 - accuracy: 0.7079 - val_loss: 0.9992 - val_accuracy: 0.6974
Epoch 138/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9437 - accuracy: 0.7176 - val_loss: 0.9617 - val_accuracy: 0.7196
Epoch 139/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9619 - accuracy: 0.7064 - val_loss: 0.9755 - val_accuracy: 0.7014
Epoch 140/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9912 - accuracy: 0.6997 - val_loss: 0.9467 - val_accuracy: 0.7193
Epoch 141/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9409 - accuracy: 0.7218 - val_loss: 0.9998 - val_accuracy: 0.7067
Epoch 142/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9388 - accuracy: 0.7194 - val_loss: 0.9566 - val_accuracy: 0.7147
Epoch 143/150
213/214 [============================>.] - ETA: 0s - loss: 0.9622 - accuracy: 0.7115 ETA: 1s -
Epoch 00143: ReduceLROnPlateau reducing learning rate to 0.00035848583793267607.
214/214 [==============================] - 14s 64ms/step - loss: 0.9620 - accuracy: 0.7117 - val_loss: 1.0212 - val_accuracy: 0.6997
Epoch 144/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9092 - accuracy: 0.7252 - val_loss: 0.9373 - val_accuracy: 0.7189
Epoch 145/150
214/214 [==============================] - 14s 64ms/step - loss: 0.8926 - accuracy: 0.7353 - val_loss: 1.0077 - val_accuracy: 0.6965
Epoch 146/150
214/214 [==============================] - 14s 65ms/step - loss: 1.0001 - accuracy: 0.6891 - val_loss: 0.9865 - val_accuracy: 0.7094
Epoch 147/150
213/214 [============================>.] - ETA: 0s - loss: 0.9724 - accuracy: 0.70 - ETA: 0s - loss: 0.9727 - accuracy: 0.7051
Epoch 00147: ReduceLROnPlateau reducing learning rate to 0.00034056155709549785.
214/214 [==============================] - 14s 66ms/step - loss: 0.9696 - accuracy: 0.7063 - val_loss: 0.9750 - val_accuracy: 0.7013
Epoch 148/150
214/214 [==============================] - 14s 65ms/step - loss: 0.8958 - accuracy: 0.7264 - val_loss: 0.9766 - val_accuracy: 0.7084
Epoch 149/150
214/214 [==============================] - 14s 64ms/step - loss: 0.9480 - accuracy: 0.7115 - val_loss: 0.9455 - val_accuracy: 0.7130
Epoch 150/150
214/214 [==============================] - 14s 65ms/step - loss: 0.8912 - accuracy: 0.7338 - val_loss: 0.9489 - val_accuracy: 0.7172

In [20]:
score = model.evaluate(X_test, y_test, verbose=0)
print("val_loss = {:.3f} and val_acc = {:.3f}".format(score[0], score[1]))


val_loss = 0.964 and val_acc = 0.722

In [21]:
plt.figure(figsize=(15,7))

plt.subplot(1,2,1)
plt.plot(hist.history['accuracy'], label='train')
plt.plot(hist.history['val_accuracy'], label='validation')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1,2,2)
plt.plot(hist.history['loss'], label='train')
plt.plot(hist.history['val_loss'], label='validation')
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()



In [22]:
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [23]:
preds = np.argmax(model.predict(X_test), axis = 1)
y_orig = np.argmax(y_test, axis = 1)
cm = confusion_matrix(preds, y_orig)

In [24]:
keys = OrderedDict(sorted(genres.items(), key=lambda t: t[1])).keys()

plt.figure(figsize=(10,10))
plot_confusion_matrix(cm, keys, normalize=True)


Normalized confusion matrix

Majority Vote


In [25]:
def majority_vote(scores):
    values, counts = np.unique(scores,return_counts=True)
    ind = np.argmax(counts)
    return values[ind]

In [26]:
preds = model.predict(X_test, batch_size=128, verbose=0)

In [27]:
# Each sound was divided into 39 segments in our custom function
scores_songs = np.split(np.argmax(preds, axis=1), 300)
scores_songs = [majority_vote(scores) for scores in scores_songs]

In [28]:
# Same analysis for split
label = np.split(np.argmax(y_test, axis=1), 300)
label = [majority_vote(l) for l in label]

In [29]:
from sklearn.metrics import accuracy_score

print("majority voting system (acc) = {:.3f}".format(accuracy_score(label, scores_songs)))


majority voting system (acc) = 0.820

Compared to the classical approach, we are winning now!

We passed from a 78.8% accuracy to 82%. It is not a great improvement but this is a really simple architecture.

Save the model


In [30]:
# Save the model
model.save('../models/custom_cnn_2d.h5')