Demo of simple text-query-based audio retrieval

We have a collection of labeled audio files. We augment the audio features with embeddings of the text label associated with each clip. Can we then use this simple "multi-modal representation" to retrieve sounds that are "similar" to a text query?
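
The scheme, as a minimal sketch (hypothetical helper names index_vector, query_vector and retrieve; the actual pipeline is built step by step below):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def index_vector(audio_features, label_embedding):
    # Each indexed sound: [audio features | word embedding of its label]
    return np.hstack([audio_features, label_embedding])

def query_vector(query_embedding, n_audio_dims):
    # A text query: audio part zeroed out, so similarity is driven by the text half
    return np.append(np.zeros(n_audio_dims), query_embedding)

def retrieve(index_matrix, q_vec):
    sims = cosine_similarity(index_matrix, q_vec.reshape(1, -1))
    return int(np.argmax(sims))  # row index of the best-matching sound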


In [1]:
import glob
import os

import librosa
import numpy as np
from gensim.models.keyedvectors import KeyedVectors



We use pretrained word embeddings from Google News.


In [15]:
word_embeddings_file = '../data/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(word_embeddings_file, binary=True)
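
If memory is a concern, gensim's load_word2vec_format accepts a limit argument to read only the most frequent words, which is usually enough for the single-word labels and queries used here (the 500000 below is an arbitrary choice, not part of the original run):

w2v_model = KeyedVectors.load_word2vec_format(word_embeddings_file, binary=True,
                                              limit=500000)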

In [3]:
data_dir = '../data/esc-50'
sample_dir = os.path.join(data_dir, 'sample')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

In [4]:
print 'Sound Sample Classes'
print '--------------------'
for d in os.listdir(sample_dir):
    print d


Sound Sample Classes
--------------------
coughing
footsteps
insects
laughing
rooster

In [5]:
samples_dict = dict()
for d in os.listdir(sample_dir):
    sample_class_dir = os.path.join(sample_dir, d)
    samples_dict[d] = [os.path.join(sample_class_dir, f) for f in os.listdir(sample_class_dir)]
for k, v in samples_dict.iteritems():
    print k, len(v)


insects 40
coughing 40
footsteps 40
laughing 40
rooster 40

Extract audio features as before, but also append the word embedding of each class label to the feature vector.


In [6]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    X = np.nan_to_num(X)  # nan_to_num returns a copy; the result must be assigned
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz


def build_feature_matrix(parent_dir, sub_dirs, classes, file_ext='*.ogg'):
    # 40 MFCC + 12 chroma + 128 mel + 7 contrast + 6 tonnetz = 193 audio dims, plus 300 word2vec dims = 493
    features, labels = np.empty((0, 493)), np.empty(0)
    filenames = list()
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, chroma, mel, contrast,tonnetz = extract_feature(fn)
            w2v_features = get_w2v_features(sub_dir)
            ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz, w2v_features])
            features = np.vstack([features,ext_features])
            labels = np.append(labels, sub_dir)
            filenames.append(fn)
    return np.array(features), filenames, labels


def get_w2v_features(label):
    return w2v_model[label]

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels, n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode
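
Note that get_w2v_features raises a KeyError when the label is missing from the GoogleNews vocabulary (multi-word ESC-50 labels such as 'crying_baby' may or may not exist as a single token). A more defensive variant, sketched here as a hypothetical get_w2v_features_safe, could fall back to averaging the embeddings of the underscore-separated parts:

def get_w2v_features_safe(label):
    # Exact match first (GoogleNews contains some phrases joined by '_')
    if label in w2v_model:
        return w2v_model[label]
    # Otherwise average the vectors of the known sub-tokens, e.g. 'crying_baby' -> 'crying', 'baby'
    parts = [w2v_model[p] for p in label.split('_') if p in w2v_model]
    if parts:
        return np.mean(parts, axis=0)
    return np.zeros(w2v_model.vector_size)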

In [7]:
sample_dir = os.path.join(data_dir, 'sample')
sub_dirs = ['laughing', 'coughing', 'insects', 'rooster', 'crying_baby', 'footsteps']
classes = {'laughing': 0, 'coughing': 1, 'insects': 2, 'rooster': 3, 'crying_baby': 4, 'footsteps': 5}

if not os.path.exists('sample_sound_features.npy'): 
    features, filenames, labels = build_feature_matrix(sample_dir, sub_dirs, classes)
else:
    features = np.load('sample_sound_features.npy')
    with open('sample_files.txt') as fp:
        filenames = [fn.strip() for fn in fp.readlines()]
    with open('sample_labels.txt') as fp:
        labels = [lb.strip() for lb in fp.readlines()]

In [8]:
print features.shape
print len(filenames)
print len(labels)


(196, 493)
196
196

In [9]:
outfile = 'sample_sound_features.npy'
np.save(outfile, features)
with open('sample_files.txt', 'w') as fp:
    fp.write('\n'.join(filenames))
with open('sample_labels.txt', 'w') as fp:
    fp.write('\n'.join(labels))

PCA after adding word embeddings

The explained variance increases compared with using the audio features alone. The plot also shows that the spread along the principal components differs across classes.


In [10]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
import pandas as pd

features_df = pd.DataFrame(features)
data_scaled = pd.DataFrame(preprocessing.scale(features_df), columns=features_df.columns)
pca = PCA(n_components=5)
pca_results = pca.fit_transform(data_scaled)

print 'Shape of the transformed feature vector:', pca_results.shape
print 'Original training sample:', list(features_df.loc[0].values)
print 'Training sample after PCA:', list(pca_results[0])
print '\n'
# Percentage of variance explained for each components
print 'Explained variance ratio (first five components)'
print '------------------------------------------------'
for idx, r in enumerate(pca.explained_variance_ratio_):
    print 'Principal Component', idx, ':', r


Shape of the transformed feature vector: (196, 5)
Original training sample: [-211.95342873967343, 126.77311337329317, -34.790516564858109, ..., 0.1611328125, 0.033203125] (493 values; output truncated)
Training sample after PCA: [-5.3770980297690212, 3.7704776602022045, -4.3937515182962494, -13.532921497259064, 0.54073212906330181]


Explained variance ratio (first five components)
------------------------------------------------
Principal Component 0 : 0.206933327237
Principal Component 1 : 0.185554212761
Principal Component 2 : 0.150936390477
Principal Component 3 : 0.121738803392
Principal Component 4 : 0.0609150075334
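
To back the explained-variance claim above, the cumulative ratio over the first five components can be printed directly (a small addition, not part of the original run):

cumulative = np.cumsum(pca.explained_variance_ratio_)
print 'Cumulative explained variance (5 components):', cumulative[-1]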

In [11]:
from ggplot import *

df_pca = features_df.copy()
df_pca['x-pca'] = pca_results[:,0]
df_pca['y-pca'] = pca_results[:,1]
df_pca['label'] = labels
chart = ggplot( df_pca, aes(x='x-pca', y='y-pca', color='label') ) \
        + geom_point(size=75,alpha=0.8) \
        + ggtitle("First and Second Principal Components colored by class label")
chart


Out[11]:
[Scatter plot of the first two principal components, colored by class label]

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import IPython.display

In [13]:
queries = ['laugh', 'chuckle', 'joke', 'walking', 'run', 
           'jogging', 'cough', 'throat', 'mosquito', 'wasp',
           'bees', 'parrot', 'cockerel', 'crowing']
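
Each retrieval cell below repeats the same four lines. A small helper, shown here as a hypothetical retrieve_audio that also returns the top-k matches instead of only the best one, would capture the pattern once:

def retrieve_audio(query, top_k=1):
    # Zero out the 193 audio dimensions and fill in the query's word embedding
    query_features = np.append(np.zeros(193), get_w2v_features(query))
    sims = cosine_similarity(features, query_features.reshape(1, -1)).ravel()
    best = np.argsort(sims)[::-1][:top_k]
    return [(labels[i], filenames[i], sims[i]) for i in best]

For example, retrieve_audio('laugh', top_k=3) would return the three closest clips with their labels and similarity scores.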

In [31]:
query = 'laugh'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: laughing
Out[31]:

In [17]:
query = 'chuckling'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True)


Matching class: laughing
Out[17]:

In [18]:
query = 'joke'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: laughing
Out[18]:

In [32]:
query = 'walking'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: footsteps
Out[32]:

In [20]:
query = 'run'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: footsteps
Out[20]:

In [21]:
query = 'jogging'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: footsteps
Out[21]:

In [22]:
query = 'cough'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: coughing
Out[22]:

In [33]:
query = 'throat'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: coughing
Out[33]:

In [24]:
query = 'mosquito'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: insects
Out[24]:

In [25]:
query = 'wasp'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: insects
Out[25]:

In [34]:
query = 'bees'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: insects
Out[34]:

In [27]:
query = 'parrot'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: insects
Out[27]:

In [28]:
query = 'cock'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: rooster
Out[28]:

In [29]:
query = 'buzzing'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: insects
Out[29]:

In [30]:
query = 'wheezing'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)


Matching class: coughing
Out[30]:

In [ ]: