In [ ]:
### Load necessary libraries ###
import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow import keras
%matplotlib inline
plt.style.use('ggplot')
In [ ]:
### Define helper functions ###
def load_sound_files(file_paths):
    """Load each audio file at librosa's default sample rate (22050 Hz).

    Args:
        file_paths: iterable of paths to audio files readable by librosa.

    Returns:
        A list of 1-D numpy waveform arrays, one per input path (the
        sample rate returned by librosa.load is discarded).
    """
    # librosa.load returns (samples, sample_rate); keep only the samples.
    return [librosa.load(path)[0] for path in file_paths]
def plot_waves(sound_names, raw_sounds):
    """Plot one waveform per (name, sound) pair in a single subplot column.

    Args:
        sound_names: sequence of display titles, one per clip.
        raw_sounds: sequence of 1-D audio arrays (plotted at 22050 Hz).
    """
    # FIX: the subplot row count was hard-coded to 10; derive it from the
    # input so the function works for any number of clips.
    n_clips = len(raw_sounds)
    fig = plt.figure(figsize=(25, 60), dpi=900)
    for i, (name, sound) in enumerate(zip(sound_names, raw_sounds), start=1):
        plt.subplot(n_clips, 1, i)
        # NOTE(review): librosa.display.waveplot was removed in librosa 0.10
        # (replaced by waveshow) — confirm the pinned librosa version.
        librosa.display.waveplot(np.array(sound), sr=22050)
        plt.title(name.title())
    plt.suptitle('Figure 1: Waveplot', x=0.5, y=0.915, fontsize=18)
    plt.show()
def plot_specgram(sound_names, raw_sounds):
    """Plot a matplotlib spectrogram per (name, sound) pair, one subplot each.

    Args:
        sound_names: sequence of display titles, one per clip.
        raw_sounds: sequence of 1-D audio arrays (rendered at Fs=22050).
    """
    # FIX: the subplot row count was hard-coded to 10; derive it from the
    # input so the function works for any number of clips.
    n_clips = len(raw_sounds)
    fig = plt.figure(figsize=(25, 60), dpi=900)
    for i, (name, sound) in enumerate(zip(sound_names, raw_sounds), start=1):
        plt.subplot(n_clips, 1, i)
        specgram(np.array(sound), Fs=22050)
        plt.title(name.title())
    plt.suptitle('Figure 2: Spectrogram', x=0.5, y=0.915, fontsize=18)
    plt.show()
def plot_log_power_specgram(sound_names, raw_sounds):
    """Plot a log-power STFT spectrogram per (name, sound) pair.

    Args:
        sound_names: sequence of display titles, one per clip.
        raw_sounds: sequence of 1-D audio arrays.
    """
    # FIX: the subplot row count was hard-coded to 10; derive it from the
    # input so the function works for any number of clips.
    n_clips = len(raw_sounds)
    fig = plt.figure(figsize=(25, 60), dpi=900)
    for i, (name, sound) in enumerate(zip(sound_names, raw_sounds), start=1):
        plt.subplot(n_clips, 1, i)
        # FIX: librosa.logamplitude was removed from librosa (0.6+);
        # power_to_db is the modern equivalent, and its ref= parameter
        # replaces the old ref_power= argument.
        D = librosa.power_to_db(np.abs(librosa.stft(sound)) ** 2, ref=np.max)
        librosa.display.specshow(D, x_axis='time', y_axis='log')
        plt.title(name.title())
    plt.suptitle('Figure 3: Log power spectrogram', x=0.5, y=0.915, fontsize=18)
    plt.show()
def extract_feature(file_name):
    """Extract five summary feature vectors from one audio file.

    Each feature matrix is averaged over time, yielding fixed-length
    vectors: 40 MFCCs, 12 chroma bins, 128 mel bands, 7 spectral-contrast
    bands, and 6 tonnetz dimensions (193 values total).

    Args:
        file_name: path to an audio file readable by librosa.

    Returns:
        Tuple (mfccs, chroma, mel, contrast, tonnetz) of 1-D numpy arrays.
    """
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    # FIX: pass the signal as a keyword — positional audio arguments were
    # removed from librosa.feature.* in librosa 0.10.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
def parse_audio_files(parent_dir, sub_dir, file_ext='*.wav'):
    """Extract features and class labels for every matching audio file.

    Args:
        parent_dir: dataset root directory.
        sub_dir: fold subdirectory to scan (e.g. 'fold1').
        file_ext: glob pattern for audio files (default '*.wav').

    Returns:
        Tuple (features, labels): float32 array of shape (n_files, 193)
        and int8 array of shape (n_files,).
    """
    features, labels = np.empty((0, 193)), np.empty(0)  # 40+12+128+7+6 = 193 features
    for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
        mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
        ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
        features = np.vstack([features, ext_features])
        # FIX: the original fn.split('/')[2] indexed a fixed path component,
        # which selects the fold directory (not the file name) for the
        # parent_dir used in this notebook, and breaks on Windows separators.
        # UrbanSound8K encodes the class ID as the second '-'-separated field
        # of the file name.
        labels = np.append(labels, int(os.path.basename(fn).split('-')[1]))
    return np.array(features, dtype=np.float32), np.array(labels, dtype=np.int8)
In [ ]:
### Plot few sound clips along with their spectrograms ###
# One example clip per UrbanSound8K class, paired with its display name.
example_clips = [
    ("57320-0-0-7.wav", "air conditioner"),
    ("24074-1-0-3.wav", "car horn"),
    ("15564-2-0-1.wav", "children playing"),
    ("31323-3-0-1.wav", "dog bark"),
    ("46669-4-0-35.wav", "drilling"),
    ("89948-5-0-0.wav", "engine idling"),
    ("46656-6-0-0.wav", "gun shot"),
    ("103074-7-3-2.wav", "jackhammer"),
    ("106905-8-0-0.wav", "siren"),
    ("108041-9-0-4.wav", "street music"),
]
sound_file_paths = [path for path, _ in example_clips]
sound_names = [name for _, name in example_clips]
raw_sounds = load_sound_files(sound_file_paths)
plot_waves(sound_names, raw_sounds)
plot_specgram(sound_names, raw_sounds)
In [ ]:
# Pre-process and extract feature from the data
parent_dir = 'UrbanSounds8K/audio/'
save_dir = "UrbanSounds8K/processed/"
# The ten UrbanSound8K cross-validation folds: fold1 .. fold10.
sub_dirs = np.array(["fold{0}".format(k) for k in range(1, 11)])
for sub_dir in sub_dirs:
    # Cache each fold's feature matrix and label vector to disk so the
    # slow librosa feature extraction only ever runs once per fold.
    features, labels = parse_audio_files(parent_dir, sub_dir)
    np.savez("{0}{1}".format(save_dir, sub_dir),
             features=features, labels=labels)
In [ ]:
### Define feedforward network architecture ###
def get_network():
    """Build and compile the 193-input, 10-class feedforward classifier.

    Returns:
        A compiled keras Sequential model (Dense 256-128-64-10, ReLU
        hidden layers, softmax output, Adam lr=1e-4, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    input_shape = (193,)  # matches the 193 features produced by extract_feature
    num_classes = 10      # one per UrbanSound8K class
    keras.backend.clear_session()  # drop graph state left over from prior folds
    model = keras.models.Sequential()
    # FIX: input_shape belongs on the first layer only. Keras ignores it on
    # later layers, so passing it there misleadingly implied every hidden
    # layer took 193 inputs.
    model.add(keras.layers.Dense(256, activation="relu", input_shape=input_shape))
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(64, activation="relu"))
    model.add(keras.layers.Dense(num_classes, activation="softmax"))
    model.compile(optimizer=keras.optimizers.Adam(1e-4),
                  loss=keras.losses.SparseCategoricalCrossentropy(),
                  metrics=["accuracy"])
    return model
In [ ]:
### Train and evaluate via 10-Folds cross-validation ###
accuracies = []
# The ten pre-computed UrbanSound8K folds: fold1 .. fold10.
folds = np.array(["fold{0}".format(k) for k in range(1, 11)])
load_dir = "UrbanSounds8K/processed/"
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(folds):
    # Assemble the training set by stacking the nine held-in folds.
    feature_parts, label_parts = [], []
    for ind in train_index:
        data = np.load("{0}/{1}.npz".format(load_dir, folds[ind]))
        feature_parts.append(data["features"])
        label_parts.append(data["labels"])
    x_train = np.concatenate(feature_parts, axis=0)
    y_train = np.concatenate(label_parts, axis=0)
    # The single held-out fold is the test set for this split.
    data = np.load("{0}/{1}.npz".format(load_dir, folds[test_index][0]))
    x_test = data["features"]
    y_test = data["labels"]
    # Possibly do mean normalization here on x_train and x_test but using
    # only x_train's mean and std.
    model = get_network()
    model.fit(x_train, y_train, epochs=50, batch_size=24, verbose=0)
    l, a = model.evaluate(x_test, y_test, verbose=0)
    accuracies.append(a)
    print("Loss: {0} | Accuracy: {1}".format(l, a))
print("Average 10 Folds Accuracy: {0}".format(np.mean(accuracies)))
In [ ]: