In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import seaborn as sns
sns.set()
path = '/home/huseinzol05/Documents/UrbanSound8K/audio/fold'
In [2]:
dataset = pd.read_csv('/home/huseinzol05/Documents/UrbanSound8K/metadata/UrbanSound8K.csv')
dataset = dataset.iloc[np.random.permutation(len(dataset))]
dataset.head()
Out[2]:
In [3]:
def extract_feature(path, t):
    Y, sample_rate = librosa.load(path)
    stft = np.abs(librosa.stft(Y))
    mfccs = librosa.feature.mfcc(y = Y, sr = sample_rate, n_mfcc = 40)
    chroma = librosa.feature.chroma_stft(S = stft, sr = sample_rate)
    mel = librosa.feature.melspectrogram(y = Y, sr = sample_rate)
    contrast = librosa.feature.spectral_contrast(S = stft, sr = sample_rate)
    tonnetz = librosa.feature.tonnetz(y = librosa.effects.harmonic(Y), sr = sample_rate)
    # truncate every feature matrix to the first t frames
    return mfccs[:, :t], chroma[:, :t], mel[:, :t], contrast[:, :t], tonnetz[:, :t]
def full_extract_feature(path, t):
    Y, sample_rate = librosa.load(path)
    stft = np.abs(librosa.stft(Y))
    mfccs = librosa.feature.mfcc(y = Y, sr = sample_rate, n_mfcc = 40)
    rmse = librosa.feature.rmse(y = Y) # renamed to librosa.feature.rms in newer librosa
    chroma_stft = librosa.feature.chroma_stft(S = stft, sr = sample_rate)
    # chroma_cqt / chroma_cens expect a constant-Q spectrogram for C, not a linear STFT,
    # so compute them from the raw signal instead
    chroma_cqt = librosa.feature.chroma_cqt(y = Y, sr = sample_rate)
    chroma_cens = librosa.feature.chroma_cens(y = Y, sr = sample_rate)
    mel = librosa.feature.melspectrogram(y = Y, sr = sample_rate)
    contrast = librosa.feature.spectral_contrast(S = stft, sr = sample_rate)
    centroid = librosa.feature.spectral_centroid(S = stft, sr = sample_rate)
    rolloff = librosa.feature.spectral_rolloff(S = stft, sr = sample_rate)
    bandwidth = librosa.feature.spectral_bandwidth(S = stft, sr = sample_rate)
    tonnetz = librosa.feature.tonnetz(y = librosa.effects.harmonic(Y), sr = sample_rate)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y = Y)
    return mfccs[:, :t], rmse[:, :t], chroma_stft[:, :t], chroma_cqt[:, :t], chroma_cens[:, :t], mel[:, :t], contrast[:, :t], centroid[:, :t], rolloff[:, :t], bandwidth[:, :t], tonnetz[:, :t], zero_crossing_rate[:, :t]
def parse_audio_file(dataset, shape, t = 60, full_extract = False, normalize = True):
    features, labels = np.empty((0, t, shape)), []
    dataset = dataset.values # .ix is deprecated; we only need the raw array
    for i in range(dataset.shape[0]):
        try:
            # column -3 is the fold number, column 0 the slice file name
            p = path + str(dataset[i, -3]) + '/' + str(dataset[i, 0])
            if full_extract:
                mfccs, rmse, chroma_stft, chroma_cqt, chroma_cens, mel, contrast, centroid, rolloff, bandwidth, tonnetz, zero_crossing_rate = full_extract_feature(p, t)
                ext_features = np.hstack([mfccs.T, rmse.T, chroma_stft.T, chroma_cqt.T, chroma_cens.T, mel.T, contrast.T, centroid.T, rolloff.T, bandwidth.T, tonnetz.T, zero_crossing_rate.T])
            else:
                mfccs, chroma, mel, contrast, tonnetz = extract_feature(p, t)
                ext_features = np.hstack([mfccs.T, chroma.T, mel.T, contrast.T, tonnetz.T])
            features = np.vstack([features, np.array([ext_features])])
            labels.append(int(dataset[i, -2])) # column -2 is classID
        except Exception as e:
            print(e)
            print('skipped: ' + str(dataset[i, :]))
            continue
    if normalize:
        # min-max normalize over the whole batch
        features = (features - features.min()) / (features.max() - features.min())
    return features, labels
def one_hot(labels, shape):
    onehot = np.zeros((len(labels), shape))
    for i in range(onehot.shape[0]):
        onehot[i, labels[i]] = 1.0
    return onehot
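Before committing to a full pass over 8,000+ clips, it is worth running the extractor on a single file and inspecting the per-feature shapes. A minimal sketch (the fold and file name below are hypothetical placeholders; substitute any clip you have locally):
In [ ]:
sample = path + '1/101415-3-0-2.wav' # hypothetical clip inside fold1
mfccs, chroma, mel, contrast, tonnetz = extract_feature(sample, 30)
for name, f in zip(['mfcc', 'chroma', 'mel', 'contrast', 'tonnetz'],
                   [mfccs, chroma, mel, contrast, tonnetz]):
    print(name, f.shape)
# expected rows with librosa defaults: 40, 12, 128, 7 and 6, each truncated to 30 frames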
In [4]:
class Model:
    def __init__(self, num_layers, size_layer, dimension_input, dimension_output, learning_rate, delta):
        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(size_layer, activation = tf.nn.relu)
        self.rnn_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
        # batches are fed time-major, shaped (time, batch, dimension_input),
        # so dynamic_rnn must be told time_major = True
        self.X = tf.placeholder(tf.float32, [None, None, dimension_input])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        self.outputs, self.last_state = tf.nn.dynamic_rnn(self.rnn_cells, self.X, dtype = tf.float32, time_major = True)
        self.rnn_W = tf.Variable(tf.random_normal((size_layer, dimension_output)))
        self.rnn_B = tf.Variable(tf.random_normal([dimension_output]))
        # classify from the RNN output at the last time step
        self.logits = tf.matmul(self.outputs[-1], self.rnn_W) + self.rnn_B
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        # L2 regularization over all trainable variables
        l2 = sum(delta * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        self.cost += l2
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))
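A quick smoke test of the graph shapes with throwaway dimensions (a sketch, not part of the original run; it resets the default graph afterwards so the real model below builds cleanly):
In [ ]:
tf.reset_default_graph()
m = Model(2, 8, 193, 10, 0.001, 0.0)
dummy = np.zeros((30, 4, 193), dtype = np.float32) # (time, batch, features)
with tf.Session() as s:
    s.run(tf.global_variables_initializer())
    print(s.run(m.logits, feed_dict = {m.X : dummy}).shape) # (4, 10): one row of logits per sequence
tf.reset_default_graph()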
In [5]:
num_layers = 2
size_layer = 256
learning_rate = 0.001
EPOCH = 10
BATCH_SIZE = 128
delta = 0.00005
FULL_EXTRACT = False
period = 30
dimension_output = np.unique(dataset['classID']).shape[0]
dimension_input = 222 if FULL_EXTRACT else 193
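The two input widths follow from the per-frame feature sizes librosa returns by default, stacked by `np.hstack` in `parse_audio_file`. A quick check of the arithmetic (assuming those default sizes):
In [ ]:
partial = 40 + 12 + 128 + 7 + 6 # mfcc + chroma + mel + contrast + tonnetz
full = partial + 1 + 12 + 12 + 1 + 1 + 1 + 1 # + rmse, chroma_cqt, chroma_cens, centroid, rolloff, bandwidth, zcr
print(partial, full) # 193 222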
In [ ]:
sess = tf.InteractiveSession()
model = Model(num_layers, size_layer, dimension_input, dimension_output, learning_rate, delta)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
train_dataset = dataset.iloc[:int(dataset.shape[0] * 0.80), :]
test_dataset = dataset.iloc[int(dataset.shape[0] * 0.80):, :]
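A random 80/20 split is the simplest option, but note that UrbanSound8K's authors recommend evaluating across the predefined folds, since slices cut from the same recording always share a fold and a random split can leak near-duplicates into the test set. A fold-based alternative is a one-liner if you want it (a sketch; it replaces the split above):
In [ ]:
train_dataset = dataset[dataset['fold'] != 10]
test_dataset = dataset[dataset['fold'] == 10]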
In [ ]:
import time
from sklearn import metrics
ACCURACY, ACCURACY_TEST, LOST = [], [], []
for i in range(EPOCH):
    total_cost = 0; total_accuracy = 0; last_time = time.time()
    for x in range(0, (train_dataset.shape[0] // BATCH_SIZE) * BATCH_SIZE, BATCH_SIZE):
        print('current batch: ' + str(x))
        features, labels = parse_audio_file(train_dataset.iloc[x : x + BATCH_SIZE], shape = dimension_input, t = period)
        # reshape (batch, time, dim) -> time-major (time, batch, dim) for the RNN
        batch_x = np.zeros((period, features.shape[0], dimension_input))
        for k in range(features.shape[0]):
            batch_x[:, k, :] = features[k, :, :]
        onehot = one_hot(labels, dimension_output)
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : batch_x, model.Y : onehot})
        total_accuracy += sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : onehot})
        total_cost += loss
    diff = time.time() - last_time
    total_accuracy /= (train_dataset.shape[0] // BATCH_SIZE)
    total_cost /= (train_dataset.shape[0] // BATCH_SIZE)
    ACCURACY.append(total_accuracy)
    LOST.append(total_cost)
    print('total accuracy during training: ' + str(total_accuracy))
    print('epoch: ' + str(i + 1) + ', loss: ' + str(total_cost) + ', speed: ' + str(diff / (train_dataset.shape[0] // BATCH_SIZE)) + ' s / batch')
    features, labels = parse_audio_file(test_dataset, shape = dimension_input, t = period)
    batch_x = np.zeros((period, features.shape[0], dimension_input))
    for k in range(features.shape[0]):
        batch_x[:, k, :] = features[k, :, :]
    onehot = one_hot(labels, dimension_output)
    # feed the time-major batch_x, not the raw (batch, time, dim) features array
    accuracy_test = sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : onehot})
    ACCURACY_TEST.append(accuracy_test)
    print('total accuracy during testing: ' + str(accuracy_test))
    saver.save(sess, "./checkpoint/model.ckpt")
    if (i + 1) % 1 == 0:
        logits = sess.run(tf.cast(tf.argmax(model.logits, 1), tf.int32), feed_dict = {model.X : batch_x})
        print(metrics.classification_report(np.array(labels), logits, target_names = np.unique(dataset['class'])))
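Once training finishes, the collected ACCURACY, ACCURACY_TEST and LOST lists can be plotted with the matplotlib/seaborn setup from the first cell (a sketch):
In [ ]:
epochs = np.arange(1, len(ACCURACY) + 1)
plt.plot(epochs, ACCURACY, label = 'train accuracy')
plt.plot(epochs, ACCURACY_TEST, label = 'test accuracy')
plt.plot(epochs, LOST, label = 'train loss')
plt.xlabel('epoch'); plt.legend(); plt.show()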
In [ ]: