In [1]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
import librosa
import IPython.display
import librosa.display
import os
import random
from matplotlib.pyplot import specgram
import glob

Speech Features

Tonnetz

A network, or lattice, of tones. librosa's tonnetz feature projects chroma onto six tonal centroid dimensions, following Harte, Sandler, and Gasser's "Detecting Harmonic Change in Musical Audio"; it captures harmonic relationships between pitches and tonal variation in the audio.

STFT

Short-time Fourier transform (STFT). Segments the signal into short, overlapping frames and computes the Fourier transform of each frame.
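
With librosa's defaults (n_fft=2048, hop_length=512, centered frames), a signal of n samples yields 1 + n // 512 frames of 1 + 2048 // 2 = 1025 frequency bins each. A minimal sketch on one second of noise:

import numpy as np
import librosa

y = np.random.randn(22050).astype(np.float32)   # 1 s of noise at librosa's default 22,050 Hz
D = librosa.stft(y, n_fft=2048, hop_length=512)
print(D.shape)
# (1025, 44): 1 + 2048 // 2 frequency bins, 1 + 22050 // 512 frames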

Spectral Contrast

The relative distribution of spectral energy: for each frequency sub-band, the difference between the spectral peak and the spectral valley.

Chromagrams

Chromagrams project spectral energy onto the twelve pitch classes (C, C#, ..., B), discarding octave information.
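
With librosa's defaults, each of these features has a fixed number of rows: 40 MFCCs (as requested below), 12 chroma bins, 128 mel bands, 7 spectral-contrast values (n_bands=6 plus one), and 6 tonnetz dimensions. Averaged over time and stacked, they give the 193-length vectors (40 + 12 + 128 + 7 + 6 = 193) used throughout this notebook. A shape-only sketch on a dummy signal:

import numpy as np
import librosa

y, sr = np.random.randn(22050).astype(np.float32), 22050
S = np.abs(librosa.stft(y))
print(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).shape[0])    # 40
print(librosa.feature.chroma_stft(S=S, sr=sr).shape[0])        # 12
print(librosa.feature.melspectrogram(y=y, sr=sr).shape[0])     # 128
print(librosa.feature.spectral_contrast(S=S, sr=sr).shape[0])  # 7
print(librosa.feature.tonnetz(y=y, sr=sr).shape[0])            # 6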


In [2]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    X = np.nan_to_num(X)  # nan_to_num returns a new array; keep the result
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz


def parse_audio_files(parent_dir, sub_dirs, classes, file_ext='*.ogg'):
    # 193 = 40 MFCCs + 12 chroma + 128 mel + 7 contrast + 6 tonnetz
    features, labels = np.empty((0, 193)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
            features = np.vstack([features, ext_features])
            labels = np.append(labels, classes.get(sub_dir))
    return np.array(features), np.array(labels, dtype=int)


def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    encoded = np.zeros((n_labels, n_unique_labels))
    encoded[np.arange(n_labels), labels] = 1
    return encoded
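
For example (a quick sanity check; note this assumes the labels are consecutive integers starting at 0, which is how the classes dictionary assigns them here):

print(one_hot_encode(np.array([0, 2, 1, 0])))
# [[ 1.  0.  0.]
#  [ 0.  0.  1.]
#  [ 0.  1.  0.]
#  [ 1.  0.  0.]]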

In [3]:
data_dir = '../data/esc-50'
sample_dir = os.path.join(data_dir, 'sample')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

print 'Sound Sample Classes'
print '--------------------'
for d in os.listdir(sample_dir):
    print d


Sound Sample Classes
--------------------
rooster
coughing
insects
laughing

In [4]:
samples_dict = dict()
for d in os.listdir(sample_dir):
    sample_class_dir = os.path.join(sample_dir, d)
    samples_dict[d] = [os.path.join(sample_class_dir, f) for f in os.listdir(sample_class_dir)]

In [6]:
mfccs, chroma, mel, contrast,tonnetz = extract_feature(samples_dict.get('insects')[0])
ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
print len(ext_features)


193

In [7]:
features = np.empty((0,193))
print features.shape


(0, 193)

In [8]:
features = np.vstack([features,ext_features])
print features


[[ -2.71008667e+02   1.11177134e+02  -2.66969894e+01   1.64818867e+01
   -4.28178103e-01  -1.32390633e+01   1.21125811e+00  -5.36048044e+00
   -1.12257893e+01  -1.54257043e+01  -1.81542393e+01  -1.80371958e+01
   -1.07932755e+01  -8.53481359e+00  -6.64451044e+00  -1.93843792e+00
   -1.12454952e+01  -1.68509149e+01  -1.28325805e+01  -1.23412747e+01
   -9.02519197e+00  -5.68524379e+00  -5.22570074e+00  -7.41173323e+00
   -1.38022800e+01  -1.52868561e+01  -1.31671584e+01  -9.81586918e+00
   -6.73705615e+00  -8.49936622e+00  -1.17248014e+01  -1.46298240e+01
   -1.01961189e+01   3.74741798e+00   2.14096003e+01   3.56850792e+01
    3.83353708e+01   2.89771481e+01   1.19066973e+01  -2.89713765e+00
    1.59088475e-01   3.19025165e-01   3.33885749e-01   1.52062355e-01
    1.77843989e-01   2.93502119e-01   8.13136173e-01   8.85314120e-01
    3.31957176e-01   1.98507119e-01   3.94651468e-01   3.02763397e-01
    8.68044150e-02   1.83963744e-02   1.35715236e-02   2.39104148e-02
    1.39743146e-01   6.72779917e+00   4.90004694e+02   2.73282780e+02
    8.93248234e-01   5.59734983e-02   1.70156021e-02   2.59211479e-02
    9.61986968e-01   4.17390167e+01   8.62789041e+01   5.58839101e+00
    4.88813554e-02   6.79680806e-03   4.78441937e-03   1.48819929e-01
    3.20957326e+00   1.22941049e+01   4.12018888e+00   9.93672660e-02
    4.37581389e-03   2.98869747e-03   1.00605985e-02   2.59755824e-01
    7.90625275e-01   6.10696306e-01   7.46318680e-02   5.72895169e-03
    3.96717497e-03   9.20663652e-02   1.41673099e+00   6.02122098e+00
    6.44586128e+00   1.63331443e+00   1.09673649e-01   5.98863026e-03
    1.22476459e-02   7.09265921e-02   2.40077103e-01   3.19510478e-01
    8.61640183e-02   6.65731303e-03   1.16871322e-02   1.31723973e-01
    2.78013311e-01   1.51962472e-01   4.00416660e-02   6.17122649e-03
    7.81794417e-02   1.96918157e-01   1.51670048e-01   3.32580226e-02
    7.61031437e-03   3.29178922e-02   6.04684941e-02   1.53753055e-02
    7.11120307e-03   1.68225158e-02   2.11778417e-02   8.03014126e-03
    1.11053473e-02   3.95009299e-02   3.38501458e-02   1.31685055e-02
    2.22509392e-02   2.41385683e-02   1.13896898e-02   1.60631428e-02
    1.59461956e-02   6.22479372e-03   8.93443731e-03   1.07098259e-02
    6.18478379e-03   9.12808856e-03   7.54452137e-03   9.15637982e-03
    7.81352579e-03   6.35637785e-03   8.36853838e-03   6.21336188e-03
    6.53785908e-03   4.66720774e-03   4.59873942e-03   3.29128604e-03
    2.36636812e-03   1.52167571e-03   1.33269869e-03   1.14961995e-03
    1.01464148e-03   5.93145876e-04   4.99635008e-04   4.98640086e-04
    5.03423621e-04   4.61343924e-04   4.50671235e-04   4.80561314e-04
    3.64230392e-04   3.45524261e-04   3.06339818e-04   2.57827978e-04
    1.84518869e-04   1.36144799e-04   9.49975459e-05   9.42402109e-05
    9.45126118e-05   9.04012059e-05   7.84361819e-05   5.63061283e-05
    3.90798580e-05   2.93396714e-05   2.11918386e-05   1.61612281e-05
    1.21721234e-05   9.67922533e-06   6.89963830e-06   5.43285045e-06
    3.83645753e-06   2.90355287e-06   2.12411428e-06   1.89239558e-06
    1.59997136e-06   1.24610975e-06   5.06919169e-07   1.81620320e-07
    3.73000535e+01   2.72948934e+01   2.49868627e+01   2.53811632e+01
    1.81814493e+01   1.97994342e+01   3.85436245e+01   3.70963525e-02
   -2.24606709e-02   1.88734775e-01  -2.03544718e-01   5.38986268e-02
    1.56129083e-02]]

In [9]:
sample_dir = os.path.join(data_dir, 'sample')
sub_dirs = ['laughing', 'coughing', 'insects', 'rooster']
classes = {'laughing': 0, 'coughing': 1, 'insects': 2, 'rooster': 3}
features, labels = parse_audio_files(sample_dir, sub_dirs, classes)

In [10]:
print features.shape


(156, 193)

In [11]:
one_hot = one_hot_encode(labels)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    one_hot, 
                                                    test_size=0.15, 
                                                    random_state=42)
print X_train.shape
print y_train.shape


(132, 193)
(132, 4)

In [56]:
n_hidden_units_one = 50 
n_hidden_units_two = 50

n_classes = 4
n_dim = X_train.shape[1]

In [57]:
from keras.models import Sequential
from keras.layers import Dense
import numpy

In [58]:
model = Sequential()
model.add(Dense(n_hidden_units_one, input_dim=n_dim, kernel_initializer='uniform', activation='relu'))
model.add(Dense(n_hidden_units_two, kernel_initializer='uniform', activation='relu'))
model.add(Dense(n_classes, kernel_initializer='uniform', activation='softmax'))
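
The parameter count follows directly from the layer sizes: 193*50 + 50 = 9,700 weights and biases in the first layer, 50*50 + 50 = 2,550 in the second, and 50*4 + 4 = 204 in the output, 12,454 in total. model.summary() reports the same figures (the layer names below are Keras defaults and will vary by session):

model.summary()
# dense_1 (Dense): 193 * 50 + 50 = 9700 params
# dense_2 (Dense): 50 * 50 + 50  = 2550 params
# dense_3 (Dense): 50 * 4 + 4    = 204 params
# total trainable params: 12454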

In [59]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [60]:
model.fit(X_train, y_train, epochs=10, batch_size=20)


Epoch 1/10
132/132 [==============================] - 0s - loss: 1.3210 - acc: 0.3636     
Epoch 2/10
132/132 [==============================] - 0s - loss: 1.1869 - acc: 0.5530     
Epoch 3/10
132/132 [==============================] - 0s - loss: 1.0305 - acc: 0.6136     
Epoch 4/10
132/132 [==============================] - 0s - loss: 0.8609 - acc: 0.6970     
Epoch 5/10
132/132 [==============================] - 0s - loss: 0.7235 - acc: 0.7500     
Epoch 6/10
132/132 [==============================] - 0s - loss: 0.6231 - acc: 0.7879     
Epoch 7/10
132/132 [==============================] - 0s - loss: 0.5371 - acc: 0.8258     
Epoch 8/10
132/132 [==============================] - 0s - loss: 0.4258 - acc: 0.8788     
Epoch 9/10
132/132 [==============================] - 0s - loss: 0.3891 - acc: 0.8712     
Epoch 10/10
132/132 [==============================] - 0s - loss: 0.3196 - acc: 0.9015     
Out[60]:
<keras.callbacks.History at 0x7f03e13c67d0>

In [61]:
scores = model.evaluate(X_test, y_test)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))


24/24 [==============================] - 0s
acc: 83.33%

In [14]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
import pandas as pd

features_df = pd.DataFrame(features)
data_scaled = pd.DataFrame(preprocessing.scale(features_df), columns=features_df.columns)
pca = PCA(n_components=5)
pca_results = pca.fit_transform(data_scaled)

print 'Shape of the transformed feature vector:', pca_results.shape
print 'Original training sample:', list(features_df.loc[0].values)
print 'Training sample after PCA:', list(pca_results[0])
print '\n'
# Percentage of variance explained by each component
print 'Explained variance ratio (first five components)'
print '------------------------------------------------'
for idx, r in enumerate(pca.explained_variance_ratio_):
    print 'Principal Component', idx, ':', r


Shape of the transformed feature vector: (156, 5)
Original training sample: [-211.95342873967343, 126.77311337329317, -34.790516564858109, -10.512615152990707, -36.476548766848765, -15.265458386539253, -25.452147024852891, -5.373759791325627, -17.113109621494466, 9.623134344549225, -10.928963826555067, -10.522028894620606, -8.9315416259630567, -4.2521794394544665, -10.441037332644855, -7.5970087415701366, -8.8785380383648729, -1.1939344747472949, -7.0908558885145743, 0.76262883957394723, -6.841079475167918, -6.7085224890121546, -4.9157763803929528, -2.7260106406212623, -6.4026770846192411, -2.3801003698167018, -6.0828079226609626, -1.005822607342773, -4.4659888523414537, -1.1620847727575108, -4.0551530678673311, -0.10143759270967795, -5.5333870628773516, -1.0524975933573175, -3.2479466396041592, -1.5511760810055846, -3.3662660397266997, -2.0985671973177058, -3.8624834878166756, -0.23053952005125813, 0.51834445566621268, 0.50731462792430559, 0.54732341782571448, 0.54456663467689226, 0.55724919803408046, 0.63912478985697185, 0.69991761882011527, 0.71094263997045593, 0.70488867331070892, 0.66897406320086661, 0.58024313023707208, 0.54668903682846348, 0.00030898208816695281, 0.0011037863654963064, 0.016088965129742488, 0.048029955612885047, 0.062542814079642906, 0.13947951636564837, 0.23387052426737462, 0.22624280428373164, 0.19506449493940808, 0.29063113954904291, 0.4549219218919513, 0.73933110146098002, 0.69277632511016096, 0.50986683000985145, 0.60366819441108444, 0.80981155644544678, 1.3543601534067828, 1.320178095013623, 1.5462867580937649, 0.95649853417901898, 1.0331258896554507, 1.8075995803564224, 2.6111118874054506, 2.7215665094646804, 3.3259071652120826, 5.0641070479079797, 6.0979488185737845, 7.1051678508019638, 7.535051336622236, 7.6710790670789937, 2.8148214272133782, 2.7522209223856491, 2.5889673357936207, 1.4791161752630282, 1.1608426587244347, 1.2826584912174799, 1.5683734273342689, 1.0036368136086244, 0.72538112258986454, 0.49079046565747814, 0.39725536292546915, 0.31822184116950575, 0.54006440146321222, 0.48064187985263823, 0.34986304285334202, 0.24974973458894739, 0.19838699669755813, 0.17349815778111813, 0.12901942709396025, 0.11860289924781658, 0.15270080616908899, 0.17868360046591006, 0.19715498444407642, 0.21941677772091334, 0.23211492768070943, 0.25149226279331072, 0.18776629751326335, 0.17411642277448908, 0.17889784925497204, 0.096644062036501277, 0.082507450725597647, 0.066139808612824799, 0.05174036781666911, 0.046590424206075565, 0.038960488665702071, 0.038236938679630399, 0.02557265558608306, 0.021394920531427115, 0.016140895581957311, 0.015475472783039103, 0.011011644560546208, 0.010356935038617588, 0.0099145786292644516, 0.011856377501241988, 0.014797492405052352, 0.013112522923081406, 0.015755074069384874, 0.020782871349228575, 0.022437630686839058, 0.021883271850380963, 0.01657473185440838, 0.018241886561177423, 0.022016965476869915, 0.015009248627246756, 0.013437150754335971, 0.0084712427644161557, 0.0078142587165301747, 0.006294118115443466, 0.0046241263855647703, 0.0046417836148709589, 0.0040455707002778384, 0.0035629338770325389, 0.0035737071288845118, 0.0031781355610891746, 0.0022911490950632291, 0.0019606707835456614, 0.0014439297620097654, 0.0012277669175469535, 0.0010700037495356531, 0.0010706933391297537, 0.0012123657109182897, 0.0011216158135388798, 0.0010196895226450143, 0.0013053526270285333, 0.0017245984804699101, 0.0018016985150939903, 0.0016583987379953005, 0.0014503483148989862, 0.0011736064986719386, 0.0011487815179094805, 0.00098804534315929753, 0.00097006047956999658, 0.00076859565645174838, 
0.00074170568164964358, 0.00055791487710688448, 0.00056102393166497023, 0.00038626005762788242, 0.00034570871649306054, 0.00029332994200811788, 0.00018045878544668763, 0.00014152537898716433, 0.00012862934087420268, 0.00011241898990175759, 0.00010035055321958268, 8.6017515487433528e-05, 5.7183426737683098e-05, 1.5404341109415376e-05, 1.0205875575903759e-06, 23.027444126555952, 12.259515859469532, 15.806141550857673, 17.774268658297565, 18.162222180882441, 18.832451488517545, 38.495340230619959, 0.0039265619510271983, 0.0031208547093051074, 0.016112377021180464, -0.0033134670188252908, -0.0038187426081049991, -0.0027186458454636149]
Training sample after PCA: [-2.6068373184520963, 2.4242388793647009, -0.65683308107782468, 4.2573390545248051, 1.079539326884505]


Explained variance ratio (first five components)
------------------------------------------------
Principal Component 0 : 0.18766387691
Principal Component 1 : 0.0877785919013
Principal Component 2 : 0.071878311726
Principal Component 3 : 0.0639192643987
Principal Component 4 : 0.0450622371579
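
Together the first five components capture only about 45.6% of the variance. A quick check (a sketch, reusing the pca object fitted above):

print(np.cumsum(pca.explained_variance_ratio_))
# ~[0.1877 0.2754 0.3473 0.4112 0.4563]  ->  ~45.6% of total variance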

In [18]:
from ggplot import *

df_pca = features_df.copy()
df_pca['x-pca'] = pca_results[:,0]
df_pca['y-pca'] = pca_results[:,1]
df_pca['label'] = labels
chart = ggplot( df_pca, aes(x='x-pca', y='y-pca', color='label') ) \
        + geom_point(size=75,alpha=0.8) \
        + ggtitle("First and Second Principal Components colored by gender")
chart


Out[18]:
<ggplot: (8730928897673)>

In [22]:
from sklearn.manifold import TSNE

print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=500)
tsne_results = tsne.fit_transform(features_df)


Computing t-SNE embedding
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 156 / 156
[t-SNE] Mean sigma: 46.734307
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.395291
[t-SNE] Error after 400 iterations: 1.395291

In [23]:
df_tsne = features_df.copy()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]
df_tsne['label'] = labels
chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=70,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by gender")
chart


Out[23]:
<ggplot: (8730928702225)>

Running t-SNE a second time, this time on the five PCA components rather than the raw 193-dimensional features:

In [28]:
from sklearn.manifold import TSNE

print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=1000)
tsne_results = tsne.fit_transform(pca_results)


Computing t-SNE embedding
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 156 / 156
[t-SNE] Mean sigma: 3.671794
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.414827
[t-SNE] Error after 325 iterations: 1.414827

In [29]:
df_tsne = features_df.copy()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]
df_tsne['label'] = labels
chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=70,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by gender")
chart


Out[29]:
<ggplot: (8730928573909)>

In [ ]: