In [1]:
%matplotlib inline
import seaborn as sns
from birdsonganalysis.distribs import get_distribs
from birdsonganalysis import all_song_features
from birdsonganalysis import utils
from scipy.io import wavfile
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
from collections import defaultdict
In [2]:
sr, samba = wavfile.read('../songs/samba.wav')
sr, simple = wavfile.read('../songs/simple.wav')
sr, bells = wavfile.read('../songs/bells.wav')
sr, flashcam = wavfile.read('../songs/flashcam.wav')
sr, lg109 = wavfile.read('../songs/LG109.wav')
sr, lg193 = wavfile.read('../songs/LG193.wav')
sr, pu7 = wavfile.read('../songs/Pu7.wav')
sr, pu40 = wavfile.read('../songs/Pu40.wav')
sr, pu72 = wavfile.read('../songs/Pu72.wav')
songs = [samba, simple, bells, flashcam, lg109, lg193, pu7, pu40, pu72]
In [6]:
f = defaultdict(lambda: np.array([], dtype=float))
for song in songs:
    feats = all_song_features(song, sr, freq_range=256, fft_step=40, fft_size=1024)
    # Remove the silence: keep only frames whose amplitude is above the
    # 20th percentile of the song (the amplitude feature itself is kept whole).
    for key in feats:
        if key != 'amplitude':
            feats[key] = feats[key][feats['amplitude'] > np.percentile(feats['amplitude'], 20)]
    # Pool the features of all the songs together.
    for fname in feats:
        f[fname] = np.concatenate((f[fname], feats[fname]))
In [7]:
def mad(arr):
    """Median Absolute Deviation: a "robust" version of the standard deviation.

    It indicates the variability of the sample.
    https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    arr = np.ma.array(arr).compressed()  # drop masked values (might be faster without masked arrays)
    med = np.median(arr)
    return np.median(np.abs(arr - med))
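For intuition, the MAD barely moves when an outlier joins the sample, while the standard deviation explodes (a quick illustrative check, not part of the pipeline):
In [ ]:
x = np.array([1., 2., 3., 4., 5.])
print(mad(x), np.std(x, ddof=1))           # 1.0 vs ~1.58
x_out = np.append(x, 100.)                 # one outlier
print(mad(x_out), np.std(x_out, ddof=1))   # 1.5 vs ~39.6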
In [11]:
from pprint import pprint, pformat
dmed = {}
dmad = {}
for key in f:
    dmed[key] = np.median(f[key])
    dmad[key] = mad(f[key])
print('med = {}'.format(pformat(dmed, indent=4)))
print()
print('mad = {}'.format(pformat(dmad, indent=4)))
utils.set_med_mad(dmed, dmad)
Let's put these values in utils.py
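These medians and MADs serve to bring the features to comparable scales. A minimal sketch of the idea, assuming a simple robust z-score (the actual normalisation in birdsonganalysis's utils.py may differ in detail):
In [ ]:
def robust_zscore(values, fname):
    """Hypothetical helper, for illustration only: (x - median) / MAD.
    The real normalisation lives in birdsonganalysis.utils."""
    return (values - dmed[fname]) / dmad[fname]

robust_zscore(f['pitch'], 'pitch')  # pitch values on a robust z-scale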
Let's compute all the possible global and local errors between these songs, comparing every pair (each of the 9 songs against every other and against itself, hence 9×10/2 = 45 comparisons).
In [6]:
# Takes an hour or more; could be parallelised at some point
allG, allL = get_distribs(songs)
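Since the pairwise comparisons are independent, the work could be spread over processes. A rough sketch of the parallel structure, assuming get_distribs accepts any list of songs (the per-song self-comparisons would then be recomputed in every pair, so this is illustrative only):
In [ ]:
from itertools import combinations
from multiprocessing import Pool

def compare_pair(pair):
    # Hypothetical worker: run get_distribs on just two songs and return
    # their (global, local) error distributions.
    a, b = pair
    return get_distribs([songs[a], songs[b]])

with Pool() as pool:
    results = pool.map(compare_pair, list(combinations(range(len(songs)), 2)))
allG = np.concatenate([G for G, L in results])
allL = np.concatenate([L for G, L in results])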
In [7]:
sns.distplot(allG)
Out[7]:
Looks lognormal
In [8]:
logallG = np.log(allG + 0.01)  # add 0.01 to avoid -inf on zero errors
sns.distplot(logallG)
Out[8]:
In [9]:
np.mean(logallG)
Out[9]:
In [10]:
np.std(logallG, ddof=1)
Out[10]:
In [11]:
x = np.linspace(-4, 8, 1000)
sns.distplot(logallG)
plt.plot(x, norm.pdf(x, np.mean(logallG), np.std(logallG, ddof=1)))
Out[11]:
The fit looks decent in log.
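To go beyond eyeballing, the fit can be checked with a Q-Q plot against a normal distribution (with samples this large, a formal normality test would reject even negligible deviations, so the plot is the more informative diagnostic):
In [ ]:
from scipy import stats
stats.probplot(logallG, dist='norm', plot=plt)  # points near the line = good fit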
In [22]:
a = []
Gsorted = np.sort(allG)
for i in range(1, 101):
    a.append(np.percentile(Gsorted, i))
pprint(a, indent=4)
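As an aside, np.percentile accepts an array of percentile values (and does not need sorted input), so the loop above collapses to a single call:
In [ ]:
a = np.percentile(allG, np.arange(1, 101))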
In [13]:
sns.distplot(allL)
Out[13]:
Looks lognormal
In [14]:
logallL = np.log(allL + 0.01)  # add 0.01 to avoid -inf on zero errors
sns.distplot(logallL)
Out[14]:
In [15]:
np.mean(logallL)
Out[15]:
In [16]:
np.std(logallL, ddof=1)
Out[16]:
In [17]:
sns.distplot(logallL)
imin, imax = np.min(logallL), np.max(logallL)
x = np.linspace(imin, imax, 1000)
plt.plot(x, norm.pdf(x, np.mean(logallL), np.std(logallL, ddof=1)))
Out[17]:
The fit looks decent in log. We will use these values (the mean and standard deviation of the log errors) in similarity.py.
In [18]:
x = np.linspace(imin, imax, 1000)
plt.plot(x, norm.cdf(x, np.mean(logallL), np.std(logallL, ddof=1)))
plt.plot([-4, 10], [0.01, 0.01])  # horizontal line at the 1% level of the CDF
Out[18]:
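The error value sitting at that 1% level can be read off directly with the inverse CDF (a quick sketch; whether similarity.py uses exactly this cutoff is a separate question):
In [ ]:
mu, sigma = np.mean(logallL), np.std(logallL, ddof=1)
log_threshold = norm.ppf(0.01, mu, sigma)     # 1st percentile in log space
raw_threshold = np.exp(log_threshold) - 0.01  # undo the log(x + 0.01)
print(log_threshold, raw_threshold)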
In [21]:
b = []
Lsorted = np.sort(allL)
for i in range(1, 101):
    b.append(np.percentile(Lsorted, i))
pprint(b, indent=4)