In [1]:
import glob
from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering
from sklearn import metrics
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn
import librosa
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import os
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
In [2]:
word_embeddings_file = '../data/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(word_embeddings_file, binary=True)
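With the pretrained GoogleNews model loaded, each in-vocabulary word maps to a 300-dimensional vector. A quick sanity check (a sketch, assuming the binary above downloaded and loaded cleanly):

# Every GoogleNews vector should be 300-dimensional,
# and semantically related words should score high.
print(w2v_model['dog'].shape)                 # expected: (300,)
print(w2v_model.similarity('dog', 'puppy'))   # expected: well above 0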
In [5]:
import random
data_dir = '../data/esc-50/ESC-50-master'
# Sample 10 of the per-class directories (named like 'NNN - Class') at random.
sound_dirs = random.sample([os.path.join(data_dir, d)
                            for d in os.listdir(data_dir)
                            if os.path.isdir(os.path.join(data_dir, d))], 10)
In [6]:
sound_files_dict = dict()
for d in sound_dirs:
    # Directory names follow 'NNN - Class'; keep the text after the dash.
    sound_class = os.path.split(d)[-1].split('-')[-1].strip().lower()
    sound_files_dict[sound_class] = [os.path.join(d, f) for f in os.listdir(d)]
print('Sound Class     Num Samples')
print('---------------------------')
for k, v in sound_files_dict.items():
    print(k, '\t', len(v))
In [7]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    X = np.nan_to_num(X)  # nan_to_num returns a copy; it does not modify X in place
    stft = np.abs(librosa.stft(X))
    # Average each feature over time so every clip yields a fixed-length vector.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz

def build_feature_matrix(dirs, classes):
    # 493 = 193 audio dims + 300 word2vec dims (breakdown below).
    features, labels = np.empty((0, 493)), np.empty(0)
    filenames = list()
    for d, label in zip(dirs, classes):
        print('Started extracting features for class:', label)
        for fn in os.listdir(d):
            fn = os.path.join(d, fn)
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
            w2v_features = get_w2v_features(label)
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz, w2v_features])
            features = np.vstack([features, ext_features])
            labels = np.append(labels, label)
            filenames.append(fn)
        print('Completed extracting features for class:', label)
    return np.array(features), filenames, labels

def get_w2v_features(label):
    # Average the embeddings of every in-vocabulary word in the label.
    words = label.split()
    embeddings = list()
    for w in words:
        if w in w2v_model.vocab:
            embeddings.append(w2v_model[w])
    if not embeddings:  # no label word is in the vocabulary
        return np.zeros(300)
    return np.mean(np.array(embeddings), axis=0)
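The hard-coded width of 493 is the audio block (40 MFCCs + 12 chroma bins + 128 mel bands + 7 spectral-contrast bands + 6 tonnetz dimensions = 193, given librosa's defaults and the n_mfcc=40 setting above) concatenated with the 300-dimensional word2vec block. A minimal bookkeeping sketch:

AUDIO_DIMS = 40 + 12 + 128 + 7 + 6   # MFCC + chroma + mel + contrast + tonnetz = 193
W2V_DIMS = 300                       # GoogleNews embedding size
assert AUDIO_DIMS + W2V_DIMS == 493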
In [8]:
classes = list(sound_files_dict.keys())  # insertion order matches sound_dirs on Python 3.7+
features, filenames, labels = build_feature_matrix(sound_dirs, classes)
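The clustering estimators imported in the first cell are never exercised in this run. As a sketch of what they were presumably imported for, the combined features could be clustered and scored against the true labels; n_clusters=10 mirrors the ten sampled classes and is an assumption:

km = KMeans(n_clusters=10, random_state=0)
cluster_ids = km.fit_predict(features)
# Adjusted Rand index: 1.0 means the clusters match the class labels exactly.
print(metrics.adjusted_rand_score(labels, cluster_ids))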
In [43]:
print(features.shape)
In [44]:
outfile = 'sound_features.npy'
np.save(outfile, features)
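Having cached the matrix, a later session can skip the slow extraction step (filenames and labels would need the same treatment to make the retrieval step below fully reproducible):

features = np.load('sound_features.npy')   # restores the (n_files, 493) matrix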
In [106]:
query = 'wasp'
query_features = np.append(np.zeros(193), get_w2v_features(query))
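Because the audio block of the query is all zeros, the cosine similarity between the query and each row reduces to the normalized dot product over the 300-dimensional word-embedding block alone, so retrieval is driven purely by label semantics rather than acoustics. A quick check of the construction:

assert query_features.shape == (493,)
assert not query_features[:193].any()   # the audio block is zeroed out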
In [107]:
from sklearn.metrics.pairwise import cosine_similarity
import IPython.display
In [108]:
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
In [109]:
result = filenames[idx]
print(result)
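argmax returns only the single best match; a natural extension (a sketch, not part of the original run) is to list the top few hits:

sims = cosine_similarity(features, query_features.reshape(1, -1)).ravel()
for i in np.argsort(sims)[::-1][:5]:     # five most similar clips, best first
    print(filenames[i], sims[i])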
In [110]:
IPython.display.Audio(result, embed=True, autoplay=True)
Out[110]: (embedded audio player for the retrieved clip)