In [34]:
repo_pth = '../../'
#resource_pth = '../../opt/rfcx-data/'
resource_pth = '../../../resources/'
In [35]:
import os
import sys
from datetime import datetime
import numpy as np
import pandas
import seaborn
from matplotlib import pyplot as plt
import sqlite3
import pickle
add_paths = [repo_pth+'rfcx-worker-analysis/modules/domain_modules', repo_pth+'notebook-display']
for p in add_paths:
if p not in sys.path:
sys.path.append(p)
import load_sound
import spectral_analysis
import fingerprinting
import sound_classification
from IPython.html.widgets import interactive, Checkbox, interact
from IPython.display import display, HTML
from IPython.html import widgets
from IPython.core.display import clear_output
import nbio
# cropped portion of sound file:
# seek_sec -15, +20
In [36]:
reload(load_sound)
reload(spectral_analysis)
reload(fingerprinting)
reload(sound_classification)
reload(nbio)
show = nbio.show
read_sound = load_sound.read_sound
write_sound = load_sound.write_sound
Sound = load_sound.Sound
Spectrum = spectral_analysis.Spectrum
Profile = fingerprinting.Profile
SoundClassifier = sound_classification.SoundClassifier
def play(snd):
nbio.play(snd.data, snd.samplerate)
def read_sound(fp):
fn = fp.split('/')[-1]
name = fn.split('.')[0]
gdate, time = name.split('T')
time = time.replace('-',':')
gid, y, m, d = gdate.split('-')
date = '-'.join([y,m,d])
dt = 'T'.join([date,time])
meta_data = {'guardian_id':gid, 'start_time':dt}
return load_sound.read_sound(fp, meta_data)
In [37]:
event_fn = resource_pth+'events2.tsv'
data_pth = resource_pth+'wav/'
in_dir = sorted(os.listdir(data_pth))
print '%s files found in %s' % (len(set(in_dir)), data_pth)
In [38]:
df = pandas.io.parsers.read_csv(
event_fn,
sep ='\t',
#delim_whitespace=True,
parse_dates = ['time'],
infer_datetime_format=True,
).groupby('has_file').get_group(True).copy()
In [39]:
fips = pickle.load(open('current_classifier_fips.pkl'))
df['fips'] = fips
print len(fips)
In [40]:
df2 = df.copy()
df2['has_fip'] = [f is not None for f in df2['fips']]
df2 = df2.groupby('has_fip').get_group(True).copy()
df2['not_gsm'] = [f['classification']!='GSM_Noise' for f in df2['fips']]
df2 = df2.groupby('not_gsm').get_group(True).copy()
#df2['has_alert'] = [type(e)!=type(None) for e in df2['vectors']]
#df2 = df2.groupby('has_alert').get_group(True)
df2['checked'] = df2['valid']!=4
df2 = df2.groupby('checked').get_group(1).copy()
df2['isnt_bad'] = [f!='22720ac238e1' for f in df2['guardian']]
df2 = df2.groupby('isnt_bad').get_group(True).copy()
print len(df2)
In [41]:
set(df2['guardian'])
Out[41]:
In [42]:
fips[0].keys()
Out[42]:
In [43]:
df2.iloc[0]
Out[43]:
In [44]:
df2['vectors'] = [fip['fingerprint'] for fip in df2['fips']]
df2['harmonic_intvl'] = [fip['harmonic_intvl'] for fip in df2['fips']]
df2['volume_power'] = [fip['volume_power'] for fip in df2['fips']]
df2['harmonic_power'] = [fip['harmonic_power'] for fip in df2['fips']]
df2['duration'] = [fip['time_interval'][1]-fip['time_interval'][0] for fip in df2['fips']]
df2['offset'] = [abs(15-fip['time_interval'][0]) for fip in df2['fips']]
df2['classification'] = [fip['classification'] for fip in df2['fips']]
#df['time_interval'] = interval
mv = [fip['moving_volume_fit'] for fip in df2['fips']]
lsq,vals = zip(*mv)
a,b,c = zip(*vals)
df2['fit_lsq'] = lsq
df2['fit_a'] = a
df2['fit_b'] = b
df2['fit_c'] = c
df2['harmonic_intvl_mean'] = [np.mean(e) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_std'] = [np.std(e) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_dmean'] = [np.mean(np.abs(np.diff(e))) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_dstd'] = [np.std(np.abs(np.diff(e))) for e in df2['harmonic_intvl']]
df2['volume_mean'] = [np.mean(np.abs(e)) for e in df2['volume_power']]
df2['volume_max'] = [np.max(e) for e in df2['volume_power']]
df2['volume_std'] = [np.std(e) for e in df2['volume_power']]
df2['harmonic_mean'] = [np.mean(e) for e in df2['harmonic_power']]
df2['harmonic_max'] = [np.max(e) for e in df2['harmonic_power']]
df2['harmonic_std'] = [np.std(e) for e in df2['harmonic_power']]
In [45]:
df3 = df2.reindex(np.random.permutation(df2.index))
#df3 = df2
df3['valid'].dtype
Out[45]:
In [46]:
from sklearn.svm import SVC, SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from nolearn.dbn import DBN
#help(RandomForestClassifier)
In [48]:
t_idx = 'vectors'
vector_len = len(df3[t_idx][0])
r0 = np.empty((len(df3[t_idx]),vector_len))
for i,e in enumerate(df3[t_idx]):
r0[i,:]=e
#print 'train', vehic_train.shape
features = [
'harmonic_intvl_mean','harmonic_intvl_dmean',
'harmonic_intvl_std', 'harmonic_intvl_dstd',
'duration',
'fit_a','fit_b','fit_c',
'fit_lsq',
'harmonic_mean','harmonic_max', 'harmonic_std',
'volume_mean', 'volume_std',
]
cols = [np.asarray(df3[s])[:,None] for s in features] +[r0]
vehic_train = np.hstack(cols)
vehic_train = r0
print 'train', vehic_train.shape, vehic_train.dtype
vehic_target = np.asarray(df3['valid'])
print 'target', vehic_target.shape, vehic_target.dtype
clf = DBN(
[-1, 30, -1],
learn_rates=0.3,
learn_rate_decays=0.9,
epochs=20,
#verbose=1,
)
#clf = linear_model.SGDClassifier()
#clf = SVC(kernel='poly')
#clf = MultinomialNB().fit(vehic_train, vehic_target)
#clf = RandomForestClassifier(n_estimators=100, class_weight ={0:1, 1:10})
split = int(.75* len(df3))
# train the classifier
X_train = vehic_train[:split]
X_test = vehic_train[split:]
y_train = vehic_target[:split]
y_test = vehic_train[split:]
clf.fit(X_train, y_train)
# run the classifier
y_pred = clf.predict(X_test)
vehic_predict = y_pred
from sklearn.metrics import classification_report
#from sklearn.metrics import zero_one_score
#print "Accuracy:", zero_one_score(y_test, y_pred)
print "Classification report:"
#print classification_report(y_test, y_pred)
# test the classifier
from sklearn import metrics
print metrics.confusion_matrix(vehic_target[split:], vehic_predict)
print 'predict', vehic_predict.sum()
print 'real', vehic_target[split:].sum()
try:
for f,v in zip(features+range(vector_len), clf.feature_importances_):
print f,v
except:
pass
print r0.shape
In [92]:
dfx = df3.sort('harmonic_mean')
arr = np.asarray(dfx['valid'])
ham = [1]
nbio.show(np.convolve(arr,ham))
In [93]:
for i in range(5):
ev = dfx.iloc[-i]
fip = ev['fips']
snd = read_sound(fip['meta_data']['filepath']).crop(ev.start_seek, ev.stop_seek)
spc = Spectrum(snd)
pro = Profile(spc)
pro.get_harmonic_sound_bounds()
nbio.show(pro.profile_plot())
play(snd)
print ev.location
In [73]:
print np.mean(df_1['harmonic_mean'])
print np.mean(df_0['harmonic_mean'])
In [40]:
for i,ev in df_0.iterrows():
if ev['harmonic_mean']>15:
fip = ev['fips']
snd = read_sound(fip['meta_data']['filepath']).crop(ev.start_seek, ev.stop_seek)
play(snd)
#nbio.show(fip['harmonic_power'])
plt.clf()
plt.plot(fip['harmonic_power'],'.')
nbio.show(plt.gcf())
break
In [74]:
i = 1930
ev = df3.loc[i]
print ev
loc = ev['location']
seek = ev['seek_sec']
vol = ev['volume']
a = ev['fit_factor']
b = ev['fit_fac_b']
c = ev['fit_fac_c']
i0,i1 = ev['time_interval']
snd = read_sound(data_pth+loc, {}).crop(seek-15+i0,seek-15+i1)
x = np.arange(len(vol))
v2 = b/(a**2+(x-c)**2)
play(read_sound(data_pth+loc, {}))
print c
nbio.show(vol)
nbio.show(v2)
In [113]:
ax = seaborn.factorplot('valid', "volume_max",
data=df3, kind="box",
)
for e in ax.axes:
for ee in e:
ee.set_ylim(0,1000)
nbio.show(seaborn.plt.gcf())
In [72]:
dfp = pandas.pivot_table(valid_df, values="valid",
index=["weekday"], columns=["hour"],
aggfunc=np.sum).reindex(index='Mon Tue Wed Thu Fri Sat Sun'.split(),
columns=range(24)
)
In [73]:
type(dfp)
Out[73]:
In [74]:
sns.plt.clf()
sns.heatmap(dfp)
nbio.show(sns.plt.gcf())
In [ ]: