In [1]:
import glob
import os
import numpy as np
import librosa
from gensim.models.keyedvectors import KeyedVectors
We use pretrained word2vec embeddings trained on Google News; each word (or phrase) maps to a 300-dimensional vector.
In [15]:
word_embeddings_file = '../data/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(word_embeddings_file, binary=True)
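As a quick sanity check on the embedding space (illustrative; any in-vocabulary words will do), semantically related words should score higher than unrelated ones:
In [ ]:
# Cosine similarity between word vectors: related words score higher.
print w2v_model.similarity('laugh', 'chuckle')   # semantically close
print w2v_model.similarity('laugh', 'mosquito')  # semantically distant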
In [3]:
data_dir = '../data/esc-50'
sample_dir = os.path.join(data_dir, 'sample')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
In [4]:
print 'Sound Sample Classes'
print '--------------------'
for d in os.listdir(sample_dir):
    print d
In [5]:
samples_dict = dict()
for d in os.listdir(sample_dir):
    sample_class_dir = os.path.join(sample_dir, d)
    samples_dict[d] = [os.path.join(sample_class_dir, f) for f in os.listdir(sample_class_dir)]
for k, v in samples_dict.iteritems():
    print k, len(v)
Extract the audio features as before, but also append the 300-dimensional word embedding of each clip's class label to its feature vector.
In [6]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    X = np.nan_to_num(X)  # nan_to_num returns a new array; the result must be assigned
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
def build_feature_matrix(parent_dir, sub_dirs, file_ext='*.ogg'):
    # 193 audio dimensions + 300 word2vec dimensions = 493 per clip
    features, labels = np.empty((0, 493)), np.empty(0)
    filenames = list()
    for sub_dir in sub_dirs:
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
            w2v_features = get_w2v_features(sub_dir)
            ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz, w2v_features])
            features = np.vstack([features, ext_features])
            labels = np.append(labels, sub_dir)
            filenames.append(fn)
    return features, filenames, labels
def get_w2v_features(label):
    # Look up the 300-dim embedding; raises KeyError if the label
    # is not in the GoogleNews vocabulary.
    return w2v_model[label]
def one_hot_encode(labels):
    # Expects integer class indices (e.g. via the `classes` mapping defined below).
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    encoded = np.zeros((n_labels, n_unique_labels))
    encoded[np.arange(n_labels), labels] = 1
    return encoded
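A quick check of the dimensions (a sketch, assuming the sample files above are available): the audio portion should come to 40 (MFCC) + 12 (chroma) + 128 (mel) + 7 (contrast) + 6 (tonnetz) = 193 values, and the embedding portion to 300, i.e. 493 in total:
In [ ]:
# Verify the combined feature dimensionality on one sample clip.
mfccs, chroma, mel, contrast, tonnetz = extract_feature(samples_dict['rooster'][0])
print np.hstack([mfccs, chroma, mel, contrast, tonnetz]).shape  # expected: (193,)
print get_w2v_features('rooster').shape                         # expected: (300,)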
In [7]:
sample_dir = os.path.join(data_dir, 'sample')
sub_dirs = ['laughing', 'coughing', 'insects', 'rooster', 'crying_baby', 'footsteps']
classes = {'laughing': 0, 'coughing': 1, 'insects': 2, 'rooster': 3, 'crying_baby': 4, 'footsteps': 5}
if not os.path.exists('sample_sound_features.npy'):
    features, filenames, labels = build_feature_matrix(sample_dir, sub_dirs)
else:
    features = np.load('sample_sound_features.npy')
    with open('sample_files.txt') as fp:
        filenames = [fn.strip() for fn in fp.readlines()]
    with open('sample_labels.txt') as fp:
        labels = [lb.strip() for lb in fp.readlines()]
In [8]:
print features.shape
print len(filenames)
print len(labels)
In [9]:
outfile = 'sample_sound_features.npy'
np.save(outfile, features)
with open('sample_files.txt', 'w') as fp:
    fp.write('\n'.join(filenames))
with open('sample_labels.txt', 'w') as fp:
    fp.write('\n'.join(labels))
In [10]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
import pandas as pd
features_df = pd.DataFrame(features)
data_scaled = pd.DataFrame(preprocessing.scale(features_df), columns=features_df.columns)
pca = PCA(n_components=5)
pca_results = pca.fit_transform(data_scaled)
print 'Shape of the transformed feature vector:', pca_results.shape
print 'Original training sample:', list(features_df.loc[0].values)
print 'Training sample after PCA:', list(pca_results[0])
print '\n'
# Percentage of variance explained by each component
print 'Explained variance ratio (first five components)'
print '------------------------------------------------'
for idx, r in enumerate(pca.explained_variance_ratio_):
    print 'Principal Component', idx, ':', r
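It can also be useful to see how much variance the five components capture together (illustrative):
In [ ]:
# Running total of the explained variance across the five components.
print 'Cumulative explained variance:', np.cumsum(pca.explained_variance_ratio_)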
In [11]:
from ggplot import *
df_pca = features_df.copy()
df_pca['x-pca'] = pca_results[:,0]
df_pca['y-pca'] = pca_results[:,1]
df_pca['label'] = labels
chart = ggplot(df_pca, aes(x='x-pca', y='y-pca', color='label')) \
    + geom_point(size=75, alpha=0.8) \
    + ggtitle("First and Second Principal Components colored by class label")
chart
Out[11]:
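The ggplot package used above is no longer maintained; if it is unavailable, a plain matplotlib sketch (matplotlib is an assumption here, not used in the original) produces the same scatter plot:
In [ ]:
import matplotlib.pyplot as plt
# One scatter series per class so each label gets its own color.
for lbl in sorted(set(labels)):
    mask = np.array(labels) == lbl
    plt.scatter(pca_results[mask, 0], pca_results[mask, 1], alpha=0.8, label=lbl)
plt.legend()
plt.title('First and Second Principal Components colored by class label')
plt.show()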
In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import IPython.display
In [13]:
queries = ['laugh', 'chuckling', 'joke', 'walking', 'run',
           'jogging', 'cough', 'throat', 'mosquito', 'wasp',
           'bees', 'parrot', 'cock', 'buzzing', 'wheezing']
In [31]:
query = 'laugh'
# Zero-pad the 193 audio dimensions so the match is driven by the
# 300-dim word-embedding portion of each stored feature vector.
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[31]:
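Every query below repeats the same four steps; a small helper (a sketch; `semantic_search` is our name, not from the original notebook) expresses the pattern once:
In [ ]:
def semantic_search(query):
    # Zero-pad the audio dimensions so the match is driven by the
    # word-embedding portion of each stored feature vector.
    query_features = np.append(np.zeros(193), get_w2v_features(query))
    idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1)))
    print 'Matching class:', labels[idx]
    return filenames[idx]

# e.g. IPython.display.Audio(semantic_search('laugh'), embed=True)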
In [17]:
query = 'chuckling'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True)
Out[17]:
In [18]:
query = 'joke'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[18]:
In [32]:
query = 'walking'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[32]:
In [20]:
query = 'run'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[20]:
In [21]:
query = 'jogging'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[21]:
In [22]:
query = 'cough'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[22]:
In [33]:
query = 'throat'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[33]:
In [24]:
query = 'mosquito'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[24]:
In [25]:
query = 'wasp'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[25]:
In [34]:
query = 'bees'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[34]:
In [27]:
query = 'parrot'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[27]:
In [28]:
query = 'cock'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[28]:
In [29]:
query = 'buzzing'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[29]:
In [30]:
query = 'wheezing'
query_features = np.append(np.zeros(193), get_w2v_features(query))
idx = np.argmax(cosine_similarity(features, query_features.reshape(1, -1), dense_output=True))
result = filenames[idx]
print 'Matching class:', labels[idx]
IPython.display.Audio(result, embed=True, autoplay=True)
Out[30]:
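One caveat: every query word must be in the GoogleNews vocabulary, or `get_w2v_features` raises a KeyError. A defensive variant (a sketch; the zero-vector fallback is our choice, not from the original) could guard against that:
In [ ]:
def get_w2v_features_safe(label):
    # Hypothetical fallback: return a zero vector for out-of-vocabulary
    # words instead of raising a KeyError.
    if label in w2v_model:
        return w2v_model[label]
    return np.zeros(300)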
In [ ]: