In [2]:
### Load necessary libraries ###
import glob
import os
import librosa
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow import keras
In [3]:
### Define helper functions ###
def extract_features(parent_dir, sub_dirs, file_ext="*.wav",
                     bands=20, frames=41):
    """Extract per-segment MFCC features from .wav files under ``parent_dir``.

    Each clip is cut into half-overlapping windows of ``512 * (frames - 1)``
    samples; every full window yields one (frames, bands) MFCC matrix
    (librosa's default hop of 512 produces exactly ``frames`` columns).

    Parameters
    ----------
    parent_dir : str
        Root audio directory (e.g. ``'UrbanSounds8K/audio/'``).
    sub_dirs : str or iterable of str
        A single fold name (as the existing caller passes) or several.
    file_ext : str
        Glob pattern selecting the audio files.
    bands : int
        Number of MFCC coefficients per frame.
    frames : int
        Number of STFT frames per window.

    Returns
    -------
    (features, labels)
        ``features[i]`` is an array of shape (n_segments_i, frames, bands)
        for one clip; ``labels[i]`` is that clip's class id repeated once
        per segment. Clips shorter than one window are skipped.
    """
    def _windows(data, window_size):
        # Half-overlapping windows; the trailing partial window is
        # discarded by the length check below.
        start = 0
        while start < len(data):
            yield start, start + window_size
            start += (window_size // 2)

    # BUG FIX: the original body iterated over a global ``sub_dir`` that
    # only existed because of the notebook's driver loop. Accept a single
    # directory name as well as an iterable of them.
    if isinstance(sub_dirs, str):
        sub_dirs = [sub_dirs]

    window_size = 512 * (frames - 1)
    features, labels = [], []
    for sub_dir in sub_dirs:
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            segment_mfcc, segment_labels = [], []
            sound_clip, sr = librosa.load(fn)
            # UrbanSound8K filenames are fsID-classID-occurrence-slice.wav.
            # BUG FIX: parse the basename rather than a fixed path component
            # (the original ``fn.split('/')[2]`` picked the fold directory
            # for this layout and breaks on Windows path separators).
            label = int(os.path.splitext(os.path.basename(fn))[0].split('-')[1])
            for (start, end) in _windows(sound_clip, window_size):
                if len(sound_clip[start:end]) == window_size:
                    signal = sound_clip[start:end]
                    # Flatten the (frames, bands) MFCC matrix into a single
                    # row; it is reshaped back below once all segments of
                    # the clip are collected.
                    mfcc = librosa.feature.mfcc(y=signal, sr=sr,
                                                n_mfcc=bands).T.flatten()[:, np.newaxis].T
                    segment_mfcc.append(mfcc)
                    segment_labels.append(label)
            segment_mfcc = np.asarray(segment_mfcc).reshape(
                len(segment_mfcc), frames, bands)
            if len(segment_mfcc) > 0:  # skip clips shorter than one window
                features.append(segment_mfcc)
                labels.append(segment_labels)
    return features, labels
In [ ]:
parent_dir = 'UrbanSounds8K/audio/'
save_dir = "UrbanSounds8K/processed/"
# The ten predefined UrbanSound8K cross-validation folds.
folds = sub_dirs = np.array(['fold{0}'.format(i) for i in range(1, 11)])
for sub_dir in sub_dirs:
    # Extract segment-level features for this fold and cache them to disk
    # so training runs do not have to re-read the raw audio.
    features, labels = extract_features(parent_dir, sub_dir)
    np.savez("{0}{1}".format(save_dir, sub_dir), features=features,
             labels=labels)
In [4]:
### Define GRU based recurrent network architecture ###
def get_network():
    """Build and compile the segment-level GRU classifier.

    The model consumes (41, 20) MFCC segments — 41 frames by 20 bands,
    matching ``extract_features``'s defaults — and emits a softmax over
    the 10 UrbanSound8K classes. Labels are integer class ids, hence the
    sparse categorical cross-entropy loss.
    """
    num_classes = 10
    input_shape = (41, 20)
    # Drop any state left over from a previous model in this kernel.
    keras.backend.clear_session()
    model = keras.models.Sequential([
        keras.layers.GRU(128, input_shape=input_shape),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(num_classes, activation="softmax"),
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-4),
                  loss=keras.losses.SparseCategoricalCrossentropy(),
                  metrics=["accuracy"])
    return model
In [5]:
### Train and evaluate via 10-Folds cross-validation ###
accuracies = []
folds = np.array(['fold1', 'fold2', 'fold3', 'fold4',
                  'fold5', 'fold6', 'fold7', 'fold8',
                  'fold9', 'fold10'])
load_dir = "UrbanSounds8K/processed/"
kf = KFold(n_splits=10)
for train_idx, test_idx in kf.split(folds):
    # Build the training set: every segment of every clip in the nine
    # training folds is treated as an independent training instance.
    train_features, train_labels = [], []
    for fold_name in folds[train_idx]:
        fold_data = np.load("{0}/{1}.npz".format(load_dir, fold_name),
                            allow_pickle=True)
        train_features.append(np.concatenate(fold_data["features"], axis=0))
        train_labels.append(np.concatenate(fold_data["labels"], axis=0))
    x_train = np.concatenate(train_features, axis=0).astype(np.float32)
    y_train = np.concatenate(train_labels, axis=0).astype(np.float32)

    # The held-out fold keeps its per-clip grouping: predictions are made
    # per segment and averaged into a single label for the whole clip.
    test_data = np.load("{0}/{1}.npz".format(load_dir,
                        folds[test_idx][0]), allow_pickle=True)
    x_test = test_data["features"]
    y_test = test_data["labels"]

    model = get_network()
    model.fit(x_train, y_train, epochs=3, batch_size=24, verbose=0)

    y_true, y_pred = [], []
    for segments, segment_labels in zip(x_test, y_test):
        # Average the softmax outputs over a clip's segments, then take
        # the argmax as the clip-level prediction.
        clip_probs = np.mean(model.predict(segments), axis=0)
        y_pred.append(np.argmax(clip_probs))
        # All segments of one clip share a label; np.unique collapses them.
        y_true.append(np.unique(segment_labels)[0])
    accuracies.append(accuracy_score(y_true, y_pred))
print("Average 10 Folds Accuracy: {0}".format(np.mean(accuracies)))
In [ ]: