In [1]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import seaborn as sns
sns.set()
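A note on versions: this notebook targets Python 2 and the TensorFlow 1.x graph API, with an older librosa release. `librosa.display.waveplot` and `librosa.logamplitude` (used below) were removed in later librosa versions, where `librosa.display.waveshow` and `librosa.power_to_db` are the replacements.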
In [2]:
def load_sound(path):
    raw = []
    for fp in path:
        X, _ = librosa.load(fp)
        raw.append(X)
    return raw

def plot_waves(soundnames, raw):
    i = 1
    plt.figure(figsize = (20, 60))
    for n, f in zip(soundnames, raw):
        plt.subplot(10, 1, i)
        librosa.display.waveplot(np.array(f), sr = 22050)
        plt.title(n)
        i += 1
    plt.suptitle('Figure 1: Waveplot', x = 0.5, y = 0.915, fontsize = 18)
    plt.show()
def plot_specgram(soundnames, raw):
    i = 1
    plt.figure(figsize = (25, 60))
    for n, f in zip(soundnames, raw):
        plt.subplot(10, 1, i)
        plt.specgram(np.array(f), Fs = 22050)
        plt.title(n)
        i += 1
    plt.suptitle('Figure 2: Spectrogram', x = 0.5, y = 0.915, fontsize = 18)
    plt.show()
def plot_logpower_specgram(soundnames, raw):
    i = 1
    plt.figure(figsize = (25, 60))
    for n, f in zip(soundnames, raw):
        plt.subplot(10, 1, i)
        D = librosa.logamplitude(np.abs(librosa.stft(f)) ** 2, ref_power = np.max)
        librosa.display.specshow(D, x_axis = 'time', y_axis = 'log')
        plt.title(n)
        i += 1
    plt.suptitle('Figure 3: Log-power spectrogram', x = 0.5, y = 0.915, fontsize = 18)
    plt.show()
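All three helpers hard-code a sampling rate of 22 050 Hz because `librosa.load` resamples to that rate by default; if you load audio with a different `sr`, pass the same value here.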
In [3]:
dataset = pd.read_csv('/home/husein/space/UrbanSound8K/metadata/UrbanSound8K.csv')
dataset = dataset.iloc[np.random.permutation(len(dataset))]
dataset.head()
Out[3]:
(first five rows of the shuffled metadata: slice_file_name, fsID, start, end, salience, fold, classID, class)
In [4]:
unique_sound = np.unique(dataset.iloc[:, -1])
unique_sound
Out[4]:
array(['air_conditioner', 'car_horn', 'children_playing', 'dog_bark',
       'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren',
       'street_music'], dtype=object)
In [5]:
path = '/home/husein/space/UrbanSound8K/audio/fold'
location, names = [], []
for i in xrange(unique_sound.shape[0]):
    sound = dataset.loc[dataset['class'] == unique_sound[i]].iloc[0, :]
    location.append(path + str(sound['fold']) + '/' + str(sound['slice_file_name']))
    names.append(str(unique_sound[i]))
raw = load_sound(location)
In [6]:
plot_waves(names, raw)
In [7]:
plot_specgram(names, raw)
In [8]:
plot_logpower_specgram(names, raw)
In [9]:
def extract_feature(path):
    Y, sample_rate = librosa.load(path)
    stft = np.abs(librosa.stft(Y))
    mfccs = np.mean(librosa.feature.mfcc(y = Y, sr = sample_rate, n_mfcc = 40).T, axis = 0)
    chroma = np.mean(librosa.feature.chroma_stft(S = stft, sr = sample_rate).T, axis = 0)
    mel = np.mean(librosa.feature.melspectrogram(Y, sr = sample_rate).T, axis = 0)
    contrast = np.mean(librosa.feature.spectral_contrast(S = stft, sr = sample_rate).T, axis = 0)
    tonnetz = np.mean(librosa.feature.tonnetz(y = librosa.effects.harmonic(Y), sr = sample_rate).T, axis = 0)
    return mfccs, chroma, mel, contrast, tonnetz

def detail_extract_feature(path):
    Y, sample_rate = librosa.load(path)
    stft = np.abs(librosa.stft(Y))
    print 'sample rate: ' + str(sample_rate)
    print 'sound shape: ' + str(Y.shape)
    print stft.shape
    print librosa.feature.mfcc(y = Y, sr = sample_rate, n_mfcc = 40).shape
    print librosa.feature.chroma_stft(S = stft, sr = sample_rate).shape
    print librosa.feature.melspectrogram(Y, sr = sample_rate).shape
    print librosa.feature.spectral_contrast(S = stft, sr = sample_rate).shape
    print librosa.feature.tonnetz(y = librosa.effects.harmonic(Y), sr = sample_rate).shape
def parse_audio_files(dataset, normalize = True):
    features, labels = np.empty((0, 193)), []
    dataset = dataset.values
    for i in xrange(dataset.shape[0]):
        try:
            p = path + str(dataset[i, -3]) + '/' + str(dataset[i, 0])  # fold / slice_file_name
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(p)
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
            features = np.vstack([features, ext_features])
            labels.append(int(dataset[i, -2]))  # classID
        except Exception:
            print 'skipped: ' + str(dataset[i, :])
            continue
    if normalize:
        features = (features - features.min()) / (features.max() - features.min())
    return features, labels

def one_hot(labels, shape):
    onehot = np.zeros((len(labels), shape))
    for i in xrange(onehot.shape[0]):
        onehot[i, labels[i]] = 1.0
    return onehot
detail_extract_feature(location[0])
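With librosa's defaults, the stacked vector in `parse_audio_files` has 40 MFCCs + 12 chroma bins + 128 mel bands + 7 spectral-contrast bands + 6 tonnetz dimensions = 193 values, which is why the feature matrix is allocated as `np.empty((0, 193))`. A quick sanity check using this cell's own helpers (expected shape shown as a comment, not actual notebook output):

mfccs, chroma, mel, contrast, tonnetz = extract_feature(location[0])
print np.hstack([mfccs, chroma, mel, contrast, tonnetz]).shape  # expected: (193,)

`one_hot` simply turns integer class IDs into one-of-K rows, e.g. `one_hot([2, 0], 3)` gives `[[0, 0, 1], [1, 0, 0]]`.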
In [10]:
class Model:
    def __init__(self, learning_rate, size_input, size_output, delta):
        self.X = tf.placeholder(tf.float32, (None, size_input))
        self.Y = tf.placeholder(tf.float32, (None, size_output))
        w1 = tf.Variable(tf.random_normal([size_input, 784], stddev = 0.5))
        b1 = tf.Variable(tf.random_normal([784], stddev = 0.1))
        w2 = tf.Variable(tf.random_normal([784, 256], stddev = 0.5))
        b2 = tf.Variable(tf.random_normal([256], stddev = 0.1))
        w3 = tf.Variable(tf.random_normal([256, 100], stddev = 0.5))
        b3 = tf.Variable(tf.random_normal([100], stddev = 0.1))
        w4 = tf.Variable(tf.random_normal([100, size_output], stddev = 0.5))
        b4 = tf.Variable(tf.random_normal([size_output], stddev = 0.1))
        # three fully connected layers, each with batch normalization and dropout
        hidden1 = tf.nn.relu(tf.matmul(self.X, w1) + b1)
        hidden1 = tf.layers.batch_normalization(hidden1)
        hidden1 = tf.nn.dropout(hidden1, 0.5)
        hidden2 = tf.nn.relu(tf.matmul(hidden1, w2) + b2)
        hidden2 = tf.layers.batch_normalization(hidden2)
        hidden2 = tf.nn.dropout(hidden2, 0.5)
        hidden3 = tf.nn.relu(tf.matmul(hidden2, w3) + b3)
        hidden3 = tf.layers.batch_normalization(hidden3)
        hidden3 = tf.nn.dropout(hidden3, 0.5)
        self.logits = tf.matmul(hidden3, w4) + b4
        # cross-entropy loss plus L2 weight decay on all four weight matrices
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.cost += delta * (tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3) + tf.nn.l2_loss(w4))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        correct_prediction = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
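One caveat about `Model`: `tf.nn.dropout(..., 0.5)` is applied unconditionally, so dropout is also active when `self.accuracy` is evaluated on the test set, which depresses the reported numbers. A minimal sketch of one common fix under the TF 1.x API used above (only the changed lines shown; `keep_prob` is a name introduced here for illustration):

# inside __init__: keep probability defaults to 1.0 (no dropout) unless fed explicitly
self.keep_prob = tf.placeholder_with_default(1.0, shape = ())
hidden1 = tf.nn.dropout(hidden1, self.keep_prob)
# ... same for hidden2 and hidden3; feed {model.keep_prob: 0.5} only during training

`tf.layers.batch_normalization` takes a similar `training` flag for the same reason.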
In [ ]:
no_classes = np.unique(dataset['classID']).shape[0]
sess = tf.InteractiveSession()
model = Model(0.001, 193, no_classes, 0.00005)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
train_dataset = dataset.iloc[:int(dataset.shape[0] * 0.80), :]
test_dataset = dataset.iloc[int(dataset.shape[0] * 0.80):, :]
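Note that `parse_audio_files`, as used in the training loop below with its default `normalize = True`, rescales each call's features by that call's own min and max, so every training batch and the test set end up on slightly different scales. A sketch of an alternative that reuses training-set statistics; `train_features` and `test_features` are hypothetical names for matrices extracted once with `normalize = False`:

# hypothetical: extract once without per-batch scaling, then share the training min/max
f_min, f_max = train_features.min(), train_features.max()
train_scaled = (train_features - f_min) / (f_max - f_min)
test_scaled = (test_features - f_min) / (f_max - f_min)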
In [ ]:
import time
from sklearn import metrics
EPOCH = 10
BATCH_SIZE = 128
ACCURACY, ACCURACY_TEST, LOST = [], [], []
for i in xrange(EPOCH):
    total_cost, total_accuracy = 0, 0
    last_time = time.time()
    for x in xrange(0, (train_dataset.shape[0] // BATCH_SIZE) * BATCH_SIZE, BATCH_SIZE):
        print 'current batch: ' + str(x)
        features, labels = parse_audio_files(train_dataset.iloc[x : x + BATCH_SIZE, :])
        onehot = one_hot(labels, no_classes)
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : features, model.Y : onehot})
        total_accuracy += sess.run(model.accuracy, feed_dict = {model.X : features, model.Y : onehot})
        total_cost += loss
    diff = time.time() - last_time
    total_accuracy /= (train_dataset.shape[0] // BATCH_SIZE)
    total_cost /= (train_dataset.shape[0] // BATCH_SIZE)
    ACCURACY.append(total_accuracy)
    LOST.append(total_cost)
    print "total accuracy during training: " + str(total_accuracy)
    print "epoch: " + str(i + 1) + ", loss: " + str(total_cost) + ", speed: " + str(diff / (train_dataset.shape[0] // BATCH_SIZE)) + " s / batch"
    features, labels = parse_audio_files(test_dataset)
    onehot = one_hot(labels, no_classes)
    accuracy_test = sess.run(model.accuracy, feed_dict = {model.X : features, model.Y : onehot})
    ACCURACY_TEST.append(accuracy_test)
    print "total accuracy during testing: " + str(accuracy_test)
    saver.save(sess, "./checkpoint/model.ckpt")
    # print a per-class report every epoch, reusing the test features extracted above
    if (i + 1) % 1 == 0:
        logits = sess.run(tf.cast(tf.argmax(model.logits, 1), tf.int32), feed_dict = {model.X : features})
        print(metrics.classification_report(np.array(labels), logits, target_names = np.unique(dataset['class'])))
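Because feature extraction runs inside the training loop, every epoch recomputes the librosa features for every clip, and that dominates the runtime. A common refinement (a sketch, not part of the original notebook) is to extract and cache the features once, then slice the cached arrays per batch:

# extract once, save to disk, and reuse the cached arrays in the loop above
train_features, train_labels = parse_audio_files(train_dataset)
np.save('train_features.npy', train_features)
np.save('train_labels.npy', np.array(train_labels))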
In [ ]: