In [1]:
import glob
from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering
from sklearn import metrics
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn
import librosa
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import os
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
In [2]:
word_embeddings_file = '../data/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(word_embeddings_file, binary=True)
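With the pretrained GoogleNews model loaded, each in-vocabulary word maps to a 300-dimensional vector. A quick sanity check (a sketch, assuming the binary above downloaded and loaded cleanly):

# Every GoogleNews vector should be 300-dimensional,
# and semantically related words should score high.
print(w2v_model['dog'].shape)                 # expected: (300,)
print(w2v_model.similarity('dog', 'puppy'))   # expected: well above 0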
In [5]:
import random
data_dir = '../data/esc-50/ESC-50-master'
# Sample 10 of the per-class directories (named like 'NNN - Class') at random.
sound_dirs = random.sample([os.path.join(data_dir, d)
                            for d in os.listdir(data_dir)
                            if os.path.isdir(os.path.join(data_dir, d))], 10)
In [6]:
sound_files_dict = dict()
for d in sound_dirs:
    # Directory names follow 'NNN - Class'; keep the text after the dash.
    sound_class = os.path.split(d)[-1].split('-')[-1].strip().lower()
    sound_files_dict[sound_class] = [os.path.join(d, f) for f in os.listdir(d)]
print('Sound Class     Num Samples')
print('---------------------------')
for k, v in sound_files_dict.items():
    print(k, '\t', len(v))
In [7]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    X = np.nan_to_num(X)  # nan_to_num returns a copy; it does not modify X in place
    stft = np.abs(librosa.stft(X))
    # Average each feature over time so every clip yields a fixed-length vector.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz

def build_feature_matrix(dirs, classes):
    # 493 = 193 audio dims + 300 word2vec dims (breakdown below).
    features, labels = np.empty((0, 493)), np.empty(0)
    filenames = list()
    for d, label in zip(dirs, classes):
        print('Started extracting features for class:', label)
        for fn in os.listdir(d):
            fn = os.path.join(d, fn)
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
            w2v_features = get_w2v_features(label)
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz, w2v_features])
            features = np.vstack([features, ext_features])
            labels = np.append(labels, label)
            filenames.append(fn)
        print('Completed extracting features for class:', label)
    return np.array(features), filenames, labels

def get_w2v_features(label):
    # Average the embeddings of every in-vocabulary word in the label.
    words = label.split()
    embeddings = list()
    for w in words:
        if w in w2v_model.vocab:
            embeddings.append(w2v_model[w])
    if not embeddings:  # no label word is in the vocabulary
        return np.zeros(300)
    return np.mean(np.array(embeddings), axis=0)
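The hard-coded width of 493 is the audio block (40 MFCCs + 12 chroma bins + 128 mel bands + 7 spectral-contrast bands + 6 tonnetz dimensions = 193, given librosa's defaults and the n_mfcc=40 setting above) concatenated with the 300-dimensional word2vec block. A minimal bookkeeping sketch:

AUDIO_DIMS = 40 + 12 + 128 + 7 + 6   # MFCC + chroma + mel + contrast + tonnetz = 193
W2V_DIMS = 300                       # GoogleNews embedding size
assert AUDIO_DIMS + W2V_DIMS == 493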
In [8]:
classes = list(sound_files_dict.keys())  # insertion order matches sound_dirs on Python 3.7+
features, filenames, labels = build_feature_matrix(sound_dirs, classes)
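The clustering estimators imported in the first cell are never exercised in this run. As a sketch of what they were presumably imported for, the combined features could be clustered and scored against the true labels; n_clusters=10 mirrors the ten sampled classes and is an assumption:

km = KMeans(n_clusters=10, random_state=0)
cluster_ids = km.fit_predict(features)
# Adjusted Rand index: 1.0 means the clusters match the class labels exactly.
print(metrics.adjusted_rand_score(labels, cluster_ids))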
In [43]:
print(features.shape)
In [44]:
outfile = 'sound_features.npy'
np.save(outfile, features)
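Having cached the matrix, a later session can skip the slow extraction step (filenames and labels would need the same treatment to make the retrieval step below fully reproducible):

features = np.load('sound_features.npy')   # restores the (n_files, 493) matrix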
In [106]:
query = 'wasp'
query_features = np.append(np.zeros(193), get_w2v_features(query))
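Because the audio block of the query is all zeros, the cosine similarity between the query and each row reduces to the normalized dot product over the 300-dimensional word-embedding block alone, so retrieval is driven purely by label semantics rather than acoustics. A quick check of the construction:

assert query_features.shape == (493,)
assert not query_features[:193].any()   # the audio block is zeroed out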
In [107]:
from sklearn.metrics.pairwise import cosine_similarity
import IPython.display
In [108]:
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
In [109]:
result = filenames[idx]
print(result)
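argmax returns only the single best match; a natural extension (a sketch, not part of the original run) is to list the top few hits:

sims = cosine_similarity(features, query_features.reshape(1, -1)).ravel()
for i in np.argsort(sims)[::-1][:5]:     # five most similar clips, best first
    print(filenames[i], sims[i])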
In [110]:
IPython.display.Audio(result, embed=True, autoplay=True)
Out[110]: (embedded audio player for the retrieved clip)