In [2]:
### Load necessary libraries ###
import glob
import os
import librosa
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras

In [3]:
### Define helper functions ###
def extract_features(parent_dir, sub_dirs, file_ext="*.wav", 
                     bands=20, frames=41):
    def _windows(data, window_size):
        start = 0
        while start < len(data):
            yield start, start + window_size
            start += (window_size // 2)    

    window_size = 512 * (frames - 1)
    features, labels = [], []
    for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
        segment_mfcc, segment_labels = [], []
        sound_clip, sr = librosa.load(fn)
        label = int(fn.split('/')[2].split('-')[1])
        for (start,end) in _windows(sound_clip,window_size):
            if(len(sound_clip[start:end]) == window_size):
                signal = sound_clip[start:end]
                mfcc = librosa.feature.mfcc(y=signal, sr=sr, 
                        n_mfcc=bands).T.flatten()[:, np.newaxis].T
                segment_mfcc.append(mfcc)
                segment_labels.append(label)
                
        segment_mfcc = np.asarray(segment_mfcc).reshape(
            len(segment_mfcc),frames,bands)
        
        if len(segment_mfcc) > 0: # check for empty segments 
            features.append(segment_mfcc)
            labels.append(segment_labels) 
            
    return features, labels

In [ ]:
parent_dir = 'UrbanSounds8K/audio/'
save_dir = "UrbanSounds8K/processed/"
folds = sub_dirs = np.array(['fold1','fold2','fold3','fold4',
                  'fold5','fold6','fold7','fold8',
                  'fold9','fold10'])
for sub_dir in sub_dirs:
    features, labels = extract_features(parent_dir,sub_dir)
    np.savez("{0}{1}".format(save_dir, sub_dir), features=features, 
             labels=labels)

In [4]:
### Define GRU based recurrent network architecture ###
def get_network():
    input_shape = (41, 20)
    num_classes = 10
    keras.backend.clear_session()
    
    model = keras.models.Sequential()
    model.add(keras.layers.GRU(128, input_shape=input_shape))
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(num_classes, activation = "softmax"))
    model.compile(optimizer=keras.optimizers.Adam(1e-4), 
        loss=keras.losses.SparseCategoricalCrossentropy(), 
        metrics=["accuracy"])
    
    return model

In [5]:
### Train and evaluate via 10-Folds cross-validation ###
accuracies = []
folds = np.array(['fold1','fold2','fold3','fold4',
                  'fold5','fold6','fold7','fold8',
                  'fold9','fold10'])
load_dir = "UrbanSounds8K/processed/"
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(folds):
    x_train, y_train = [], []
    for ind in train_index:
        # read features or segments of an audio file
        train_data = np.load("{0}/{1}.npz".format(load_dir,folds[ind]), 
                       allow_pickle=True)
        # for training stack all the segments so that they are treated as an example/instance
        features = np.concatenate(train_data["features"], axis=0) 
        labels = np.concatenate(train_data["labels"], axis=0)
        x_train.append(features)
        y_train.append(labels)
    # stack x,y pairs of all training folds 
    x_train = np.concatenate(x_train, axis = 0).astype(np.float32)
    y_train = np.concatenate(y_train, axis = 0).astype(np.float32)
    
    # for testing we will make predictions on each segment and average them to 
    # produce signle label for an entire sound clip.
    test_data = np.load("{0}/{1}.npz".format(load_dir,
                   folds[test_index][0]), allow_pickle=True)
    x_test = test_data["features"]
    y_test = test_data["labels"]

    model = get_network()
    model.fit(x_train, y_train, epochs = 3, batch_size = 24, verbose = 0)
    
    # evaluate on test set/fold
    y_true, y_pred = [], []
    for x, y in zip(x_test, y_test):
        # average predictions over segments of a sound clip
        avg_p = np.argmax(np.mean(model.predict(x), axis = 0))
        y_pred.append(avg_p) 
        # pick single label via np.unique for a sound clip
        y_true.append(np.unique(y)[0]) 
    accuracies.append(accuracy_score(y_true, y_pred))   
print("Average 10 Folds Accuracy: {0}".format(np.mean(accuracies)))


Average 10 Folds Accuracy: 0.0

In [ ]: