In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import seaborn as sns
sns.set()
path = '/home/huseinzol05/Documents/UrbanSound8K/audio/fold'

In [2]:
dataset = pd.read_csv('/home/huseinzol05/Documents/UrbanSound8K/metadata/UrbanSound8K.csv')
dataset = dataset.iloc[np.random.permutation(len(dataset))]
dataset.head()


Out[2]:
slice_file_name fsID start end salience fold classID class
6302 39854-5-1-4.wav 39854 146.073254 150.073254 1 6 5 engine_idling
1813 145611-6-3-0.wav 145611 17.855555 18.975026 2 5 6 gun_shot
8274 84359-2-0-1.wav 84359 0.500000 4.500000 2 2 2 children_playing
2944 162134-7-5-0.wav 162134 120.266429 122.246864 1 10 7 jackhammer
4426 182739-2-0-17.wav 182739 8.500000 12.500000 1 2 2 children_playing

In [3]:
def extract_feature(path, t):
    Y, sample_rate = librosa.load(path)
    stft = np.abs(librosa.stft(Y))
    mfcss = librosa.feature.mfcc(y = Y, sr = sample_rate, n_mfcc = 40)
    chroma = librosa.feature.chroma_stft(S = stft, sr = sample_rate)
    mel = librosa.feature.melspectrogram(Y, sr = sample_rate)
    contrast = librosa.feature.spectral_contrast(S = stft, sr = sample_rate)
    tonnetz = librosa.feature.tonnetz(y = librosa.effects.harmonic(Y), sr = sample_rate)
    return mfcss[:, :t], chroma[:, :t], mel[:, :t], contrast[:, :t], tonnetz[:, :t]

def full_extract_feature(path, t):
    Y, sample_rate = librosa.load(path)
    stft = np.abs(librosa.stft(Y))
    mfcss = librosa.feature.mfcc(y = Y, sr = sample_rate, n_mfcc = 40)
    rmse = librosa.feature.rmse(y = Y)
    chroma_stft = librosa.feature.chroma_stft(S = stft, sr = sample_rate)
    chroma_cqt = librosa.feature.chroma_cqt(C = stft, sr = sample_rate)
    chroma_cens = librosa.feature.chroma_cens(C = stft, sr = sample_rate)
    mel = librosa.feature.melspectrogram(Y, sr = sample_rate)
    contrast = librosa.feature.spectral_contrast(S = stft, sr = sample_rate)
    centroid = librosa.feature.spectral_centroid(S = stft, sr = sample_rate)
    rolloff = librosa.feature.spectral_rolloff(S = stft, sr = sample_rate)
    bandwidth = librosa.feature.spectral_bandwidth(S = stft, sr = sample_rate)
    tonnetz = librosa.feature.tonnetz(y = librosa.effects.harmonic(Y), sr = sample_rate)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y = Y)
    return mfcss[:, :t], rmse[:, :t], chroma_stft[:, :t], chroma_cqt[:, :t], chroma_cens[:, :t], mel[:, :t], contrast[:, :t], centroid[:, :t], rolloff[:, :t], bandwidth[:, :t], tonnetz[:, :t], zero_crossing_rate[:, :t]

def parse_audio_file(dataset, shape, t = 60, full_extract = False, normalize = True):
    features, labels = np.empty((0, t, shape)), []
    dataset = dataset.ix[:, :].values
    for i in xrange(dataset.shape[0]):
        try:
            p = path + str(dataset[i, -3]) + '/' + str(dataset[i, 0])
            if full_extract:
                mfcss, rmse, chroma_stft, chroma_cqt, chroma_cens, mel, contrast, centroid, rolloff, bandwidth, tonnetz, zero_crossing_rate = full_extract_feature(p, t)
                ext_features = np.hstack([mfcss.T, rmse.T, chroma_stft.T, chroma_cqt.T, chroma_cens.T, mel.T, contrast.T, centroid.T, rolloff.T, bandwidth.T, tonnetz.T, zero_crossing_rate.T])
            else:
                mfcss, chroma, mel, contrast, tonnetz = extract_feature(p, t)
                ext_features = np.hstack([mfcss.T, chroma.T, mel.T, contrast.T, tonnetz.T])
            
            features = np.vstack([features, np.array([ext_features])])
            labels.append(int(dataset[i, -2]))
        except Exception as e:
            print e
            print 'skipped: ' + str(dataset[i, :])
            continue
    
    if normalize:
        features = (features - features.min()) / (features.max() - features.min())
        
    return features, labels
        

def one_hot(labels, shape):
    onehot = np.zeros((len(labels), shape))
    for i in xrange(onehot.shape[0]):
        onehot[i, labels[i]] = 1.0
    return onehot

In [4]:
class Model:
    
    def __init__(self, num_layers, size_layer, dimension_input, dimension_output, learning_rate, delta):
        
        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(size_layer, activation = tf.nn.relu)
        
        self.rnn_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in xrange(num_layers)])
        
        self.X = tf.placeholder(tf.float32, [None, None, dimension_input])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        
        self.outputs, self.last_state = tf.nn.dynamic_rnn(self.rnn_cells, self.X, dtype = tf.float32)
        
        self.rnn_W = tf.Variable(tf.random_normal((size_layer, dimension_output)))
        self.rnn_B = tf.Variable(tf.random_normal([dimension_output]))
        
        self.logits = tf.matmul(self.outputs[-1], self.rnn_W) + self.rnn_B
        
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        
        l2 = sum(delta * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        
        self.cost += l2
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

In [5]:
num_layers = 2
size_layer = 256
learning_rate = 0.001
EPOCH = 10
BATCH_SIZE = 128
delta = 0.00005
FULL_EXTRACT = False
period = 30
dimension_output = np.unique(dataset['classID']).shape[0]
dimension_input = 222 if FULL_EXTRACT else 193

In [ ]:
sess = tf.InteractiveSession()
model = Model(num_layers, size_layer, dimension_input, dimension_output, learning_rate, delta)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())

train_dataset = dataset.iloc[:int(dataset.shape[0] * 0.80), :]
test_dataset = dataset.iloc[int(dataset.shape[0] * 0.80):, :]

In [ ]:
import time
from sklearn import metrics

ACCURACY, ACCURACY_TEST, LOST = [], [], []

for i in xrange(EPOCH):
    total_cost = 0; total_accuracy = 0; last_time = time.time()
    
    for x in xrange(0, (train_dataset.shape[0] // BATCH_SIZE) * BATCH_SIZE, BATCH_SIZE):
        print 'current batch: ' + str(x)
        features, labels = parse_audio_file(train_dataset.iloc[x : x + BATCH_SIZE], shape = dimension_input, t = period)
        batch_x = np.zeros((period, features.shape[0], dimension_input))
        for k in xrange(features.shape[0]):
            batch_x[:, k, :] = features[k, :, :]
        onehot = one_hot(labels, dimension_output)
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : batch_x, model.Y : onehot})
        total_accuracy += sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : onehot})
        total_cost += loss
    
    diff = time.time() - last_time
    total_accuracy /= (train_dataset.shape[0] // BATCH_SIZE)
    total_cost /= (train_dataset.shape[0] // BATCH_SIZE)
    ACCURACY.append(total_accuracy)
    LOST.append(total_cost)
    
    print "total accuracy during training: " + str(total_accuracy)
    print "epoch: " + str(i + 1) + ", loss: " + str(total_cost) + ", speed: " + str(diff / (train_dataset.shape[0] // BATCH_SIZE)) + " s / batch"
    
    features, labels = parse_audio_file(test_dataset, shape = dimension_input, t = period)
    batch_x = np.zeros((period, features.shape[0], dimension_input))
    for k in xrange(features.shape[0]):
        batch_x[:, k, :] = features[k, :, :]
    onehot = one_hot(labels, dimension_output)
    accuracy_test = sess.run(model.accuracy, feed_dict = {model.X : features, model.Y : onehot})
    ACCURACY_TEST.append(accuracy_test)
    print "total accuracy during testing: " + str(accuracy_test)
    
    saver.save(sess, "./checkpoint/model.ckpt")
    if(i + 1) % 1 == 0:
        logits = sess.run(tf.cast(tf.argmax(model.logits, 1), tf.int32), feed_dict = {model.X : features})
        print(metrics.classification_report(np.array(labels), logits, target_names = np.unique(dataset['class'])))


current batch: 0
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['180156-1-8-0.wav' 180156 25.488681 26.086759 2 9 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['43784-3-0-0.wav' 43784 0.934147 1.116784 1 7 3 'dog_bark']
current batch: 128
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['66587-3-1-0.wav' 66587 3.077173 3.4286449999999995 1 5 3 'dog_bark']
current batch: 256
/usr/local/lib/python2.7/dist-packages/librosa/util/utils.py:1632: RuntimeWarning: invalid value encountered in less
  if np.any(X < 0) or np.any(X_ref < 0):
/usr/local/lib/python2.7/dist-packages/librosa/util/utils.py:1645: RuntimeWarning: invalid value encountered in less
  bad_idx = (Z < np.finfo(dtype).tiny)
Audio buffer is not finite everywhere
skipped: ['87275-1-2-0.wav' 87275 1.817251 1.871768 2 1 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['43805-8-1-0.wav' 43805 64.181737 64.461469 1 7 8 'siren']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['147672-3-0-0.wav' 147672 0.644886 0.997283 1 2 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['163460-6-0-0.wav' 163460 0.0 0.672222 1 2 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['175856-1-1-0.wav' 175856 0.9320440000000001 1.230957 1 8 1 'car_horn']
current batch: 384
/usr/local/lib/python2.7/dist-packages/librosa/core/pitch.py:145: UserWarning: Trying to estimate tuning from empty frequency set.
  warnings.warn('Trying to estimate tuning from empty frequency set.')
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['23161-6-1-0.wav' 23161 59.26566700000001 59.581943 2 4 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['187356-1-0-0.wav' 187356 20.664779 21.032153 2 4 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['160094-3-0-0.wav' 160094 0.12069500000000001 0.31345300000000004 1 1 3
 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['89207-3-0-0.wav' 89207 0.028387 0.389303 1 8 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['117536-1-0-0.wav' 117536 31.064967 31.640588 2 4 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['151359-1-1-0.wav' 151359 9.100817999999999 9.255836 1 3 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['143115-1-4-0.wav' 143115 58.001371999999996 58.353505000000006 2 6 1
 'car_horn']
current batch: 512
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['160093-3-0-0.wav' 160093 0.015691 0.135517 1 3 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['26185-1-0-0.wav' 26185 0.007625 0.340057 1 9 1 'car_horn']
current batch: 640
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['118440-4-7-0.wav' 118440 8.195748 8.61258 1 5 4 'drilling']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['118440-4-0-0.wav' 118440 0.187868 0.810181 1 5 4 'drilling']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['160092-3-0-0.wav' 160092 0.013784000000000001 0.142207 1 2 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162433-6-5-0.wav' 162433 2.82883 3.338725 1 8 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162702-1-0-0.wav' 162702 0.0 0.10596199999999999 1 6 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['71309-1-1-0.wav' 71309 38.797915 39.249054 2 8 1 'car_horn']
current batch: 768
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['4918-3-3-0.wav' 4918 4.8672 5.2832 1 8 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['77247-6-0-0.wav' 77247 2.8020080000000003 3.454014 1 5 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['71439-1-1-0.wav' 71439 3.699306 3.906744 1 5 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['156893-7-10-0.wav' 156893 30.791428999999997 31.349245 2 2 7 'jackhammer']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['203128-3-3-0.wav' 203128 10.083655 10.529498 1 2 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['76091-6-2-0.wav' 76091 3.1252869999999997 3.321417 1 2 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['66587-3-3-0.wav' 66587 7.811285000000001 8.299042 1 5 3 'dog_bark']
Audio buffer is not finite everywhere
skipped: ['87275-1-4-0.wav' 87275 2.4096740000000003 2.47873 2 1 1 'car_horn']
current batch: 896
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['151359-1-2-0.wav' 151359 11.322735 11.419621000000001 1 3 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['180156-1-12-0.wav' 180156 37.221565999999996 37.432653 2 9 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['43802-1-0-0.wav' 43802 1.111691 1.2936040000000002 1 6 1 'car_horn']
current batch: 1024
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162433-6-3-0.wav' 162433 1.7764950000000002 2.240282 1 8 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['87275-1-1-0.wav' 87275 1.613718 1.668236 2 1 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['180156-1-11-0.wav' 180156 31.029699 31.645366999999997 2 9 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['166268-3-2-0.wav' 166268 3.495586 3.706087 1 1 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['180156-1-7-0.wav' 180156 23.747218 23.993485999999997 2 9 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['47926-3-1-0.wav' 47926 3.315978 3.7522910000000005 1 4 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162431-6-2-0.wav' 162431 4.449299 5.084912999999999 1 3 6 'gun_shot']
current batch: 1152
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['132073-1-2-0.wav' 132073 4.41645 4.78231 2 2 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['24965-3-1-0.wav' 24965 1.304201 1.8576759999999999 1 7 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['4912-3-0-0.wav' 4912 0.0 0.231748 1 6 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162728-1-0-0.wav' 162728 2.7788060000000003 3.200719 1 7 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['44278-1-1-0.wav' 44278 9.944179 10.282033 2 4 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['159702-6-1-0.wav' 159702 0.597821 1.174665 1 6 6 'gun_shot']
current batch: 1280
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['34621-4-27-0.wav' 34621 26.346196000000003 26.925233000000002 1 2 4
 'drilling']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['11722-3-1-0.wav' 11722 2.02081 2.4595119999999997 1 10 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['24965-3-2-0.wav' 24965 2.3081110000000002 2.911635 1 7 3 'dog_bark']
Audio buffer is not finite everywhere
skipped: ['87275-1-3-0.wav' 87275 2.2025080000000004 2.264294 2 1 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['26255-3-8-0.wav' 26255 39.66359 40.252012 2 10 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162541-1-0-0.wav' 162541 1.271857 1.758951 1 2 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['20285-3-2-0.wav' 20285 18.510488 19.062828 1 5 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['24965-3-0-0.wav' 24965 0.0 0.500483 1 7 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['34621-4-20-0.wav' 34621 21.215282000000002 21.826488 1 2 4 'drilling']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162148-3-0-0.wav' 162148 0.0 0.45278100000000004 1 10 3 'dog_bark']
current batch: 1408
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['102103-3-0-0.wav' 102103 1.97603 2.508892 2 10 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['71439-1-3-0.wav' 71439 58.255433 58.808601 2 5 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['175848-1-0-0.wav' 175848 0.094818 0.632591 1 6 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['159702-6-0-0.wav' 159702 0.0 0.597821 1 6 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['159708-6-2-0.wav' 159708 0.6251680000000001 0.9147790000000001 1 7 6
 'gun_shot']
current batch: 1536
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['159754-8-2-0.wav' 159754 9.989627 10.552889 1 5 8 'siren']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['34621-4-8-0.wav' 34621 12.288458 12.770989 1 2 4 'drilling']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['77509-1-0-0.wav' 77509 2.9673439999999998 3.5573080000000004 2 5 1
 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['115535-3-0-0.wav' 115535 0.0 0.490667 1 4 3 'dog_bark']
current batch: 1664
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['100648-1-0-0.wav' 100648 4.823402 5.471927 2 10 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['45256-1-0-0.wav' 45256 0.0 0.275708 2 5 1 'car_horn']
current batch: 1792
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162540-1-2-0.wav' 162540 10.207327000000001 10.672773 1 1 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['118440-4-8-0.wav' 118440 8.794578 9.270119000000001 1 5 4 'drilling']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['179862-1-0-0.wav' 179862 0.324883 0.70578 1 4 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['24632-6-1-0.wav' 24632 29.012176 29.292025 2 4 6 'gun_shot']
current batch: 1920
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['43802-1-1-0.wav' 43802 2.425507 2.627632 1 6 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['197243-3-1-0.wav' 197243 2.1499610000000002 2.733176 1 5 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['179867-1-0-0.wav' 179867 0.025481999999999998 0.566271 1 1 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162540-1-0-0.wav' 162540 1.2556200000000002 1.618235 1 1 1 'car_horn']
current batch: 2048
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['9032-3-0-0.wav' 9032 0.0 0.350864 1 6 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['162433-6-2-0.wav' 162433 1.309995 1.743948 1 8 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['18594-1-4-0.wav' 18594 45.134957 45.350623 1 3 1 'car_horn']
Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/lib/python2.7/dist-packages/audioread/gstdec.py", line 149, in run
    self.loop.run()
  File "/usr/lib/python2.7/dist-packages/gi/overrides/GLib.py", line 576, in run
    raise KeyboardInterrupt
KeyboardInterrupt

all the input array dimensions except for the concatenation axis must match exactly
skipped: ['125520-1-4-0.wav' 125520 7.3182979999999995 7.885858 1 8 1 'car_horn']

In [ ]: