In [1]:
    
import glob
import os

import numpy as np
import librosa
from gensim.models.keyedvectors import KeyedVectors
    
    
We use pretrained 300-dimensional word2vec embeddings trained on the Google News corpus.
In [15]:
    
word_embeddings_file = '../data/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(word_embeddings_file, binary=True)
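
The model maps each word (and some multi-word phrases) to a 300-dimensional vector and supports direct lookup as well as nearest-neighbour queries. A quick sanity check on the loaded model:

print(w2v_model['laughing'].shape)                # expect (300,)
print(w2v_model.most_similar('laughing', topn=3))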
    
In [3]:
    
data_dir = '../data/esc-50'
sample_dir = os.path.join(data_dir, 'sample')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
    
In [4]:
    
print('Sound Sample Classes')
print('--------------------')
for d in os.listdir(sample_dir):
    print(d)
    
    
In [5]:
    
samples_dict = dict()
for d in os.listdir(sample_dir):
    sample_class_dir = os.path.join(sample_dir, d)
    samples_dict[d] = [os.path.join(sample_class_dir, f) for f in os.listdir(sample_class_dir)]
for k, v in samples_dict.items():
    print(k, len(v))
    
    
We extract audio features as before, but now also append the word embedding of each class label to every feature vector.
In [6]:
    
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    X = np.nan_to_num(X)  # nan_to_num returns a copy, so the result must be assigned
    stft = np.abs(librosa.stft(X))
    # Mean over time of each feature set: 40 MFCCs, 12 chroma bins, 128 mel bands,
    # 7 spectral-contrast bands and 6 tonnetz dimensions, 193 values in total.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz

def build_feature_matrix(parent_dir, sub_dirs, file_ext='*.ogg'):
    # 193 audio dimensions plus the 300-d word embedding of the class label = 493.
    features = np.empty((0, 493))
    filenames, labels = list(), list()
    for sub_dir in sub_dirs:
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
            w2v_features = get_w2v_features(sub_dir)
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz, w2v_features])
            features = np.vstack([features, ext_features])
            labels.append(sub_dir)
            filenames.append(fn)
    return np.array(features), filenames, np.array(labels)

def get_w2v_features(label):
    # Assumes each class name (e.g. 'crying_baby') exists in the embedding vocabulary.
    return w2v_model[label]

def one_hot_encode(labels):
    # Expects integer-encoded labels.
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot = np.zeros((n_labels, n_unique_labels))
    one_hot[np.arange(n_labels), labels] = 1
    return one_hot
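
As a sanity check on the dimensions: the audio part is 40 MFCC + 12 chroma + 128 mel + 7 spectral-contrast + 6 tonnetz means, i.e. 193 values, and 493 once the 300-d label embedding is appended. A minimal spot check on one file (assuming samples_dict from above has a 'laughing' entry; any sample file works):

mfccs, chroma, mel, contrast, tonnetz = extract_feature(samples_dict['laughing'][0])
audio_vec = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
print(audio_vec.shape)                              # (193,)
print(audio_vec.shape[0] + w2v_model.vector_size)   # 493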
    
In [7]:
    
sample_dir = os.path.join(data_dir, 'sample')
sub_dirs = ['laughing', 'coughing', 'insects', 'rooster', 'crying_baby', 'footsteps']
classes = {'laughing': 0, 'coughing': 1, 'insects': 2, 'rooster': 3, 'crying_baby': 4, 'footsteps': 5}
if not os.path.exists('sample_sound_features.npy'): 
    features, filenames, labels = build_feature_matrix(sample_dir, sub_dirs)
else:
    features = np.load('sample_sound_features.npy')
    with open('sample_files.txt') as fp:
        filenames = [fn.strip() for fn in fp.readlines()]
    with open('sample_labels.txt') as fp:
        labels = [lb.strip() for lb in fp.readlines()]
    
In [8]:
    
print(features.shape)
print(len(filenames))
print(len(labels))
    
    
In [9]:
    
outfile = 'sample_sound_features.npy'
np.save(outfile, features)
with open('sample_files.txt', 'w') as fp:
    fp.write('\n'.join(filenames))
with open('sample_labels.txt', 'w') as fp:
    fp.write('\n'.join(labels))
    
In [10]:
    
from sklearn.decomposition import PCA
from sklearn import preprocessing
import pandas as pd
features_df = pd.DataFrame(features)
data_scaled = pd.DataFrame(preprocessing.scale(features_df), columns=features_df.columns)
pca = PCA(n_components=5)
pca_results = pca.fit_transform(data_scaled)
print('Shape of the transformed feature vector:', pca_results.shape)
print('Original training sample:', list(features_df.loc[0].values))
print('Training sample after PCA:', list(pca_results[0]))
print('\n')
# Percentage of variance explained by each component
print('Explained variance ratio (first five components)')
print('------------------------------------------------')
for idx, r in enumerate(pca.explained_variance_ratio_):
    print('Principal Component', idx, ':', r)
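
Summing the per-component ratios shows how much of the total variance the first five components capture together:

print('Cumulative explained variance:', np.cumsum(pca.explained_variance_ratio_))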
    
    
In [11]:
    
# plotnine provides the same ggplot-style grammar; the original yhat 'ggplot'
# package is unmaintained and no longer works with recent pandas.
from plotnine import ggplot, aes, geom_point, ggtitle
df_pca = features_df.copy()
df_pca['x-pca'] = pca_results[:, 0]
df_pca['y-pca'] = pca_results[:, 1]
df_pca['label'] = labels
chart = (ggplot(df_pca, aes(x='x-pca', y='y-pca', color='label'))
         + geom_point(size=3, alpha=0.8)
         + ggtitle('First and Second Principal Components colored by class label'))
chart
    
    
    
    Out[11]:
In [12]:
    
from sklearn.metrics.pairwise import cosine_similarity
import IPython.display
    
In [13]:
    
queries = ['laugh', 'chuckle', 'joke', 'walking', 'run',
           'jogging', 'cough', 'throat', 'mosquito', 'wasp',
           'bees', 'parrot', 'cockerel', 'crowing']
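
Every retrieval cell below follows the same pattern: embed the query word, prepend 193 zeros in place of the audio dimensions (so only the word-embedding part of the query contributes to the dot product, while each sample's audio part still enters through its norm), and play back the sample whose combined feature vector is most cosine-similar. A compact helper capturing the pattern (the name find_best_match is ours, not from the original notebook):

def find_best_match(query):
    # Zero out the 193 audio dimensions; the query carries only semantic information.
    query_features = np.append(np.zeros(193), get_w2v_features(query))
    sims = cosine_similarity(features, query_features.reshape(1, -1))
    idx = int(np.argmax(sims))
    return filenames[idx], labels[idx]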
    
In [31]:
    
query = 'laugh'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[31]:
In [17]:
    
query = 'chuckling'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True)
    
    
    Out[17]:
In [18]:
    
query = 'joke'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[18]:
In [32]:
    
query = 'walking'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[32]:
In [20]:
    
query = 'run'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[20]:
In [21]:
    
query = 'jogging'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[21]:
In [22]:
    
query = 'cough'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[22]:
In [33]:
    
query = 'throat'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[33]:
In [24]:
    
query = 'mosquito'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[24]:
In [25]:
    
query = 'wasp'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[25]:
In [34]:
    
query = 'bees'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[34]:
In [27]:
    
query = 'parrot'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[27]:
In [28]:
    
query = 'cock'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[28]:
In [29]:
    
query = 'buzzing'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[29]:
In [30]:
    
query = 'wheezing'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print('Matching class:', labels[idx])
IPython.display.Audio(result, embed=True, autoplay=True)
    
    
    Out[30]: