In [ ]:
### Load necessary libraries ###
import glob
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
from sklearn.model_selection import KFold

import tensorflow as tf
from tensorflow import keras

%matplotlib inline
plt.style.use('ggplot')

In [ ]:
### Define helper functions ###
def load_sound_files(file_paths):
    raw_sounds = []
    for fp in file_paths:
        X,sr = librosa.load(fp)
        raw_sounds.append(X)
    return raw_sounds

def plot_waves(sound_names,raw_sounds):
    i = 1
    fig = plt.figure(figsize=(25,60), dpi=900)
    for n,f in zip(sound_names,raw_sounds):
        plt.subplot(10,1,i)
        librosa.display.waveshow(np.array(f),sr=22050)
        plt.title(n.title())
        i += 1
    plt.suptitle('Figure 1: Waveplot',x=0.5, y=0.915,fontsize=18)
    plt.show()
    
def plot_specgram(sound_names,raw_sounds):
    i = 1
    fig = plt.figure(figsize=(25,60), dpi = 900)
    for n,f in zip(sound_names,raw_sounds):
        plt.subplot(10,1,i)
        specgram(np.array(f), Fs=22050)
        plt.title(n.title())
        i += 1
    plt.suptitle('Figure 2: Spectrogram',x=0.5, y=0.915,fontsize=18)
    plt.show()

def plot_log_power_specgram(sound_names,raw_sounds):
    i = 1
    fig = plt.figure(figsize=(25,60), dpi = 900)
    for n,f in zip(sound_names,raw_sounds):
        plt.subplot(10,1,i)
        D = librosa.power_to_db(np.abs(librosa.stft(f))**2, ref=np.max)
        librosa.display.specshow(D,x_axis='time' ,y_axis='log')
        plt.title(n.title())
        i += 1
    plt.suptitle('Figure 3: Log power spectrogram',x=0.5, y=0.915,fontsize=18)
    plt.show()
    
def extract_feature(file_name):
    # Summarize each clip as the time-average of five spectral feature sets.
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)        # 40 values
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)        # 12 values
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)           # 128 values
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)  # 7 values
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)  # 6 values
    return mfccs,chroma,mel,contrast,tonnetz

def parse_audio_files(parent_dir,sub_dir,file_ext='*.wav'):
    features, labels = np.empty((0,193)), np.empty(0) # 193 = 40 MFCC + 12 chroma + 128 mel + 7 contrast + 6 tonnetz
    for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
        mfccs, chroma, mel, contrast,tonnetz = extract_feature(fn)
        ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
        features = np.vstack([features,ext_features])
        labels = np.append(labels, int(os.path.basename(fn).split('-')[1])) # class ID is the 2nd field of the file name
    return np.array(features, dtype=np.float32), np.array(labels, dtype = np.int8)
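
In [ ]:
### Optional sketch: check the per-feature dimensions returned by extract_feature ###
# Assumes one of the example clips listed below (e.g. "57320-0-0-7.wav") is in the
# working directory; any .wav file would do.
mfccs, chroma, mel, contrast, tonnetz = extract_feature("57320-0-0-7.wav")
for name, feat in zip(["mfccs", "chroma", "mel", "contrast", "tonnetz"],
                      [mfccs, chroma, mel, contrast, tonnetz]):
    print(name, feat.shape)
# Expected: (40,), (12,), (128,), (7,), (6,)  =>  40+12+128+7+6 = 193 features per clip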

In [ ]:
### Plot a few sound clips along with their spectrograms ###
sound_file_paths = ["57320-0-0-7.wav","24074-1-0-3.wav",
                    "15564-2-0-1.wav","31323-3-0-1.wav",
                    "46669-4-0-35.wav","89948-5-0-0.wav",
                    "46656-6-0-0.wav","103074-7-3-2.wav",
                    "106905-8-0-0.wav","108041-9-0-4.wav"]
sound_names = ["air conditioner","car horn","children playing",
               "dog bark","drilling","engine idling",
               "gun shot","jackhammer","siren","street music"]

raw_sounds = load_sound_files(sound_file_paths)
plot_waves(sound_names,raw_sounds)
plot_specgram(sound_names,raw_sounds)
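# Figure 3 (optional): log-power spectrograms via the helper defined above
plot_log_power_specgram(sound_names,raw_sounds)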

In [ ]:
### Pre-process and extract features from the data ###
parent_dir = 'UrbanSounds8K/audio/'
save_dir = "UrbanSounds8K/processed/"
os.makedirs(save_dir, exist_ok=True)  # np.savez will not create a missing directory
sub_dirs = np.array(['fold1','fold2','fold3','fold4',
                  'fold5','fold6','fold7','fold8',
                  'fold9','fold10'])
for sub_dir in sub_dirs:
    features, labels = parse_audio_files(parent_dir,sub_dir)
    np.savez("{0}{1}".format(save_dir, sub_dir), features=features, 
             labels=labels)
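
In [ ]:
### Optional sketch: sanity-check one processed fold ###
# Assumes the cell above has been run; each saved .npz should hold a
# (n_clips, 193) feature matrix and one integer class label per clip.
data = np.load("{0}{1}.npz".format(save_dir, "fold1"))
print(data["features"].shape, data["labels"].shape)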

In [ ]:
### Define feedforward network architecture ###
def get_network():
    input_shape = (193,)
    num_classes = 10
    keras.backend.clear_session()
    
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(256, activation="relu", input_shape=input_shape))
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(64, activation="relu"))
    model.add(keras.layers.Dense(num_classes, activation="softmax"))
    model.compile(optimizer=keras.optimizers.Adam(1e-4), 
        loss=keras.losses.SparseCategoricalCrossentropy(), 
        metrics=["accuracy"])
    
    return model
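
In [ ]:
### Optional: print a summary of the network ###
# get_network() fixes the (193,) input shape, so the model is already built
# and summary() can be called directly.
get_network().summary()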

In [ ]:
### Train and evaluate via 10-fold cross-validation ###
accuracies = []
folds = np.array(['fold1','fold2','fold3','fold4',
                  'fold5','fold6','fold7','fold8',
                  'fold9','fold10'])
load_dir = "UrbanSounds8K/processed/"
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(folds):
    x_train, y_train = [], []
    for ind in train_index:
        data = np.load("{0}/{1}.npz".format(load_dir,folds[ind]))
        x_train.append(data["features"])
        y_train.append(data["labels"])
    x_train = np.concatenate(x_train, axis = 0)
    y_train = np.concatenate(y_train, axis = 0)
    
    data = np.load("{0}/{1}.npz".format(load_dir,folds[test_index][0]))
    x_test = data["features"]
    y_test = data["labels"]
    
    # Optionally standardize x_train and x_test here, using the mean and std of
    # x_train only (a commented-out sketch follows).
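    # Minimal standardization sketch (kept commented out to preserve the original flow);
    # statistics come from x_train only so the test fold is not leaked:
    # mean, std = x_train.mean(axis=0), x_train.std(axis=0) + 1e-8
    # x_train = (x_train - mean) / std
    # x_test = (x_test - mean) / std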
    
    model = get_network()
    model.fit(x_train, y_train, epochs = 50, batch_size = 24, verbose = 0)
    loss, acc = model.evaluate(x_test, y_test, verbose = 0)
    accuracies.append(acc)
    print("Loss: {0} | Accuracy: {1}".format(loss, acc))
    
print("Average 10 Folds Accuracy: {0}".format(np.mean(accuracies)))

In [ ]: