The filterbanks that we use are similar to the ones we use in speech: 25ms sliding windows, 10ms timesteps, 40 Mel-based log filterbanks. Look at the "MFCC_pipeline.png" in this Monkey Sounds folder to get an idea: we stop before the last cepstral transformation (just before the MFCC step) and use N=40 filterbanks.
In [5]:
from pylab import imshow, show
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 15, 10
import numpy as np
from scipy.io import wavfile
wavfname = "Blue monkey/FA.wav" # I used "sox FA.WAV FA_soxed.wav" because there was a non-data warning by scipy.io.wavfile
srate, sound = wavfile.read(wavfname)
print sound.shape
print srate
if len(sound.shape) == 2: # in mono sound is a list
sound = sound[:, 0] + sound[:, 1]
print sound
from spectral import Mel # you need https://github.com/mwv/spectral
nfbanks = 40
wfbanks = 0.025 # 25ms
rfbanks = 100 # 10ms
fbanks = Mel(nfilt=nfbanks, # nb of filters in mel bank
alpha=0.97, # pre-emphasis
fs=srate, # sampling rate
frate=rfbanks, # frame rate
wlen=wfbanks, # window length
nfft=1024, # length of dft => 512 is not enough and produces a glitch
mel_deltas=False, # speed
mel_deltasdeltas=False # acceleration
)
fbank = fbanks.transform(sound)[0] # first dimension is for
# deltas & deltasdeltas
print fbank.shape
imshow(fbank[:100].T, interpolation='nearest')
Out[5]:
In [7]:
import csv
fname = "FA"
all_monkeys = []
with open('Monkey Sounds.csv', 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter=';', quotechar='|')
keys = reader.next()
print keys
all_monkeys = [dict(zip(keys, row)) for row in reader]
#print sounds
# Selecting blue Monkeys for the rest from now on:
blue = filter(lambda l: l['Species'] == 'Blue monkey', all_monkeys)
t = [filter(lambda (k, v): k in ['Word', 'Filename', 'StartTime (s)', 'EndTime (s)'], d.iteritems()) for d in blue]
#t = [zip(*l)[1] for l in t]
t = [map(lambda (k, v): float(v.replace(',', '.')) if '(s)' in k else v, l) for l in t]
print t
In [75]:
fbanks_arrays = {}
durations = [] # We will also look at the durations of the calls in the same loop pass
for s in t:
durations.append(s[2] - s[1])
if not s[-1] in fbanks_arrays:
fname = s[-1] + ".wav"
srate, wav = wavfile.read(fname)
# rebuilding the Mel object in case the sampling rate "srate" changes accross files
fbanks = Mel(nfilt=nfbanks, alpha=0.97, fs=srate, frate=rfbanks, wlen=wfbanks,
nfft=1024, mel_deltas=False, mel_deltasdeltas=False)
fbanks_arrays[s[-1]] = fbanks.transform(wav)[0]
dur_mean = np.mean(durations)
dur_std = np.std(durations)
print dur_mean
print dur_std
print dur_mean + 3*dur_std
In [82]:
y = []
x = []
for s in t:
y.append(s[0])
# let's take 420ms of sound for the call to cover most cases, this has to be changed / revaluated
middle = s[1] + (s[2]-s[1])/2
start = int((middle - 0.21) * rfbanks)
end = int((middle + 0.21) * rfbanks)
print start, end
assert fbanks_arrays[s[-1]][start:end].shape[0] == 42
x.append(fbanks_arrays[s[-1]][start:end])
y = np.asarray(y)
x = np.asarray(x)
#print y
#print x
print y.shape
print x.shape
In [100]:
# Mean filterbank "template" over all 'Hack' calls (frames on x, Mel bands on y)
imshow(np.mean(x[y=='Hack'], axis=0).T, interpolation='nearest')
Out[100]:
In [101]:
# Mean filterbank "template" over all 'Pyow' calls, for visual comparison
imshow(np.mean(x[y=='Pyow'], axis=0).T, interpolation='nearest')
Out[101]:
In [102]:
from sklearn import svm
clf = svm.SVC()
X = x.reshape(x.shape[0], x.shape[1]*x.shape[2]) # reshaping to (n_samples, n_features)
clf.fit(X, y) # THIS IS WHERE THE MAGIC HAPPENS!
# If the magic is not strong enough, get bigger guns (features engineering and selection,
# less eyeballing/more data-driven duration selection, HMMs, deep neural networks...)
print clf
print y, clf.predict(X) # too few examples (by far!), but seems linearly separable
In [103]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
xx = pca.fit_transform(X)
print pca
print pca.components_
print pca.explained_variance_ratio_
print xx
xplt = zip(*xx)
colors = {'Pyow': 'r', 'Hack': 'b'}
plt.scatter(xplt[0], xplt[1], color=map(lambda lab: colors[lab], y))
DO_ICA = False
if DO_ICA:
from sklearn.decomposition import FastICA
ica = FastICA(n_components=2)
xx = ica.fit_transform(X)
print ica
print ica.components_
print xx
xplt = zip(*xx)
plt.scatter(xplt[0], xplt[1])
In [16]:
t = [filter(lambda (k, v): k in ['Species', 'Word', 'Filename', 'StartTime (s)', 'EndTime (s)'], d.iteritems()) for d in all_monkeys]
t = [map(lambda (k, v): float(v.replace(',', '.')) if '(s)' in k else v, l) for l in t]
print t
fbanks_arrays = {}
durations = [] # We will also look at the durations of the calls in the same loop pass
for s in t:
durations.append(s[2] - s[1])
if not s[-2] in fbanks_arrays:
fname = s[-1] + '/' + s[-2] + ".wav"
print fname
srate, wav = wavfile.read(fname)
if len(wav.shape) > 1 and wav.shape[1] == 2:
wav = (wav[:, 0] + wav[:, 1]) / 2.
# rebuilding the Mel object in case the sampling rate "srate" changes accross files
fbanks = Mel(nfilt=nfbanks, alpha=0.97, fs=srate, frate=rfbanks, wlen=wfbanks,
nfft=1024, mel_deltas=False, mel_deltasdeltas=False)
fbanks_arrays[s[-2]] = fbanks.transform(wav)[0]
dur_mean = np.mean(durations)
dur_std = np.std(durations)
print dur_mean
print dur_std
print dur_mean + 3*dur_std
In [19]:
y = []
x = []
for s in t:
y.append(s[0])
# let's take 420ms of sound for the call to cover most cases, this has to be changed / revaluated
middle = s[1] + (s[2]-s[1])/2
start = int((middle - 0.21) * rfbanks)
end = int((middle + 0.21) * rfbanks)
print start, end
assert fbanks_arrays[s[-2]][start:end].shape[0] == 42
x.append(fbanks_arrays[s[-2]][start:end])
y = np.asarray(y)
x = np.asarray(x)
#print y
#print x
print y.shape
print x.shape
In [21]:
# Mean filterbank "template" over all 'Snort' calls across species
imshow(np.mean(x[y=='Snort'], axis=0).T, interpolation='nearest')
Out[21]:
In [24]:
from sklearn import svm
clf = svm.SVC()
X = x.reshape(x.shape[0], x.shape[1]*x.shape[2]) # reshaping to (n_samples, n_features)
clf.fit(X, y) # THIS IS WHERE THE MAGIC HAPPENS!
# If the magic is not strong enough, get bigger guns (features engineering and selection,
# less eyeballing/more data-driven duration selection, HMMs, deep neural networks...)
print clf
print y
print clf.predict(X) # too few examples (by far!), but seems linearly separable
In [30]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
xx = pca.fit_transform(X)
print pca
print pca.components_
print pca.explained_variance_ratio_
print xx
xplt = zip(*xx)
colors = {'Snort': 'g', 'Roar': 'c', 'Pyow': 'r', 'Hack': 'b', 'Chirp': 'm', 'Cheep': 'k'}
plt.scatter(xplt[0], xplt[1], color=map(lambda lab: colors[lab], y))
Out[30]:
In [29]:
from sklearn.decomposition import FastICA
ica = FastICA(n_components=2)
xx = ica.fit_transform(X)
print ica
print ica.components_
print xx
xplt = zip(*xx)
plt.scatter(xplt[0], xplt[1], color=map(lambda lab: colors[lab], y))
Out[29]:
In [42]:
# Decision regions of a linear SVM in the current 2-D projection.
# Sort the call names so the word -> integer mapping is deterministic:
# colors.keys() order depends on dict hashing.
call_names = sorted(colors.keys())
yy = map(lambda call_name: call_names.index(call_name), y)
svc = svm.SVC(kernel='linear').fit(xx, yy)
x_min, x_max = xx[:, 0].min() - 1, xx[:, 0].max() + 1
y_min, y_max = xx[:, 1].min() - 1, xx[:, 1].max() + 1
h = 0.1  # grid step for the decision-surface raster
x_grid, y_grid = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
Z = svc.predict(np.c_[x_grid.ravel(), y_grid.ravel()])
Z = Z.reshape(x_grid.shape)
# "pl" was never imported anywhere in this notebook; pyplot is bound to plt
plt.contourf(x_grid, y_grid, Z, cmap=plt.cm.Paired)
plt.axis('off')
plt.scatter(xx[:, 0], xx[:, 1], c=yy, cmap=plt.cm.Paired)
Out[42]:
In [43]:
# Same decision-region plot with a polynomial kernel.
# (Renamed the local: it was called svm_rbf despite kernel='poly'.)
svm_poly = svm.SVC(kernel='poly').fit(xx, yy)
Z = svm_poly.predict(np.c_[x_grid.ravel(), y_grid.ravel()])
Z = Z.reshape(x_grid.shape)
# fix the undefined "pl" alias: pyplot is imported as plt in this notebook
plt.contourf(x_grid, y_grid, Z, cmap=plt.cm.Paired)
plt.axis('off')
plt.scatter(xx[:, 0], xx[:, 1], c=yy, cmap=plt.cm.Paired)
Out[43]:
In [ ]: