In [1]:
# Location of the repository root, given as a relative path.
repo_pth = "../../"

# Directory holding the input data used by this notebook.
# Alternative location kept for reference: '../../opt/rfcx-data/'
resource_pth = "../../../resources/"
In [46]:
import os
import sys
from datetime import datetime
import numpy as np
import pandas
import seaborn
from matplotlib import pyplot as plt
import sqlite3
import pickle
# Directories (relative to repo_pth) that hold the project modules imported
# below; each is appended to sys.path only if it is not already present.
add_paths = [
    repo_pth + 'rfcx-worker-analysis/modules/domain_modules',
    repo_pth + 'notebook-display',
]
for p in add_paths:
    if p in sys.path:
        continue
    sys.path.append(p)
import load_sound
import spectral_analysis
import fingerprinting
import sound_classification
from IPython.html.widgets import interactive, Checkbox, interact
from IPython.display import display, HTML
from IPython.html import widgets
from IPython.core.display import clear_output
import nbio
In [47]:
# Reload the project modules so that this kernel session uses their current
# on-disk source.
for _module in (load_sound, spectral_analysis, fingerprinting,
                sound_classification, nbio):
    reload(_module)

# Notebook-level shorthands for frequently used project names.
show = nbio.show
# NOTE: this binding is replaced by the `def read_sound` later in this cell.
read_sound = load_sound.read_sound
write_sound = load_sound.write_sound
Sound = load_sound.Sound
Spectrum = spectral_analysis.Spectrum
Profile = fingerprinting.Profile
SoundClassifier = sound_classification.SoundClassifier
def play(snd):
    """Play a sound object via ``nbio.play``.

    ``snd`` must expose ``data`` and ``samplerate`` attributes; both are
    forwarded unchanged. Nothing is returned.
    """
    samples, rate = snd.data, snd.samplerate
    nbio.play(samples, rate)
def read_sound(fp):
    """Load a recording and attach metadata parsed from its file name.

    The file name is the text after the last '/' in ``fp``, cut at its first
    '.'. It must have the form ``<gid>-<y>-<m>-<d>T<time>``, where ``<time>``
    uses '-' in place of ':'. A name that does not split into exactly these
    parts raises ``ValueError``.

    Returns whatever ``load_sound.read_sound`` returns when called with ``fp``
    and a metadata dict with the keys ``'guardian_id'`` and ``'start_time'``.
    """
    # rpartition/partition give the same pieces as split('/')[-1] and
    # split('.')[0].
    stem = fp.rpartition('/')[2].partition('.')[0]
    day_part, clock_part = stem.split('T')
    clock_part = clock_part.replace('-', ':')
    gid, year, month, day = day_part.split('-')
    start_time = '%s-%s-%sT%s' % (year, month, day, clock_part)
    return load_sound.read_sound(
        fp, {'guardian_id': gid, 'start_time': start_time})
In [48]:
# Event table and directory of WAV recordings, both under resource_pth.
event_fn = resource_pth + 'events2.tsv'
data_pth = resource_pth + 'wav/'

# Sorted listing of the WAV directory.
in_dir = os.listdir(data_pth)
in_dir.sort()

# Report the number of distinct directory entries.
n_unique = len(set(in_dir))
print('%s files found in %s' % (n_unique, data_pth))
In [49]:
# Parse the tab-separated event table; the 'time' column is converted to
# datetimes.
events_all = pandas.io.parsers.read_csv(
    event_fn,
    sep='\t',
    parse_dates=['time'],
    infer_datetime_format=True,
)

# Keep an independent copy of the rows whose 'has_file' value is True.
# get_group raises KeyError when no such row exists.
df = events_all.groupby('has_file').get_group(True).copy()
In [50]:
# Load the pickled `fips` sequence from the current working directory.
# The file is opened in binary mode (required for pickle data) and the `with`
# block closes the handle as soon as loading finishes, instead of leaving it
# to the garbage collector.
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open('current_classifier_fips.pkl', 'rb') as fip_file:
    fips = pickle.load(fip_file)

# Store the loaded entries as a new 'fips' column on df.
df['fips'] = fips
print(len(fips))
In [51]:
def _keep_group(frame, flag_col, key=True):
    """Return a copy of the rows of `frame` in group `key` of `flag_col`.

    Raises KeyError (from get_group) when that group is empty.
    """
    return frame.groupby(flag_col).get_group(key).copy()


# Work on a copy so that df itself is left untouched by the filtering.
df2 = df.copy()

# Drop events that have no fingerprint entry.
df2['has_fip'] = [fip is not None for fip in df2['fips']]
df2 = _keep_group(df2, 'has_fip')

# Drop events classified as 'GSM_Noise'.
df2['not_gsm'] = [fip['classification'] != 'GSM_Noise' for fip in df2['fips']]
df2 = _keep_group(df2, 'not_gsm')

# Drop events whose 'valid' value is 4.
df2['checked'] = df2['valid'] != 4
df2 = _keep_group(df2, 'checked', 1)

# Drop every event recorded by guardian '22720ac238e1'.
df2['isnt_bad'] = [gid != '22720ac238e1' for gid in df2['guardian']]
df2 = _keep_group(df2, 'isnt_bad')

print(len(df2))
In [53]:
# Flatten the per-event dicts stored in df2['fips'] into plain feature columns
# on df2.
df2['vectors'] = [fip['fingerprint'] for fip in df2['fips']]
df2['harmonic_intvl'] = [fip['harmonic_intvl'] for fip in df2['fips']]
df2['volume_power'] = [fip['volume_power'] for fip in df2['fips']]
df2['harmonic_power'] = [fip['harmonic_power'] for fip in df2['fips']]
# Length of the detected interval: end minus start of 'time_interval'.
df2['duration'] = [fip['time_interval'][1] - fip['time_interval'][0] for fip in df2['fips']]
# Absolute distance of the interval start from 15
# (presumably seconds into the clip -- TODO confirm the unit and the reason for 15).
df2['offset'] = [abs(15 - fip['time_interval'][0]) for fip in df2['fips']]
df2['classification'] = [fip['classification'] for fip in df2['fips']]

# Each 'moving_volume_fit' entry unpacks as (lsq, (a, b, c)); transpose the
# per-event pairs into one sequence per component.
mv = [fip['moving_volume_fit'] for fip in df2['fips']]
lsq, vals = zip(*mv)
a, b, c = zip(*vals)
df2['fit_lsq'] = lsq
df2['fit_a'] = a
df2['fit_b'] = b
df2['fit_c'] = c

# Summary statistics of the harmonic interval series and of the absolute
# differences between its consecutive values.
df2['harmonic_intvl_mean'] = [np.mean(e) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_std'] = [np.std(e) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_dmean'] = [np.mean(np.abs(np.diff(e))) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_dstd'] = [np.std(np.abs(np.diff(e))) for e in df2['harmonic_intvl']]

# Summary statistics of the volume series (the mean is taken over absolute
# values, max and std over the raw values).
df2['volume_mean'] = [np.mean(np.abs(e)) for e in df2['volume_power']]
df2['volume_max'] = [np.max(e) for e in df2['volume_power']]
df2['volume_std'] = [np.std(e) for e in df2['volume_power']]

# Summary statistics of the harmonic power series.
df2['harmonic_mean'] = [np.mean(e) for e in df2['harmonic_power']]
df2['harmonic_max'] = [np.max(e) for e in df2['harmonic_power']]
df2['harmonic_std'] = [np.std(e) for e in df2['harmonic_power']]

t_idx = 'vectors'
# Positional access (.iloc) is required here: df2 still carries the index
# labels of the unfiltered event table, so the label 0 is not guaranteed to
# exist once rows have been filtered out, and df2[t_idx][0] would then raise
# KeyError.
vector_len = len(df2[t_idx].iloc[0])

# Stack the per-event vectors into one (n_events, vector_len) float array.
vectors = np.empty((len(df2[t_idx]), vector_len))
for i, e in enumerate(df2[t_idx]):
    vectors[i, :] = e

# Row sums plus one column per vector component: 'h0', 'h1', ...
df2['hsum'] = vectors.sum(1)
for i in range(vector_len):
    df2['h' + str(i)] = vectors[:, i]
In [54]:
# df3 is bound to the same DataFrame object as df2 (no copy is made), so a
# change made through either name is visible through both.
df3 = df2
In [55]:
# Column names of the individual vector components: 'h0', 'h1', ...
v_features = ['h%d' % idx for idx in range(vector_len)]

# Column names of the scalar summary features used in the plots below.
features = [
    # harmonic interval statistics
    'harmonic_intvl_mean',
    'harmonic_intvl_dmean',
    'harmonic_intvl_std',
    'harmonic_intvl_dstd',
    # interval length
    'duration',
    # moving-volume fit components
    'fit_a',
    'fit_b',
    'fit_c',
    'fit_lsq',
    # harmonic power statistics
    'harmonic_mean',
    'harmonic_max',
    'harmonic_std',
    # volume statistics
    'volume_mean',
    'volume_std',
]
In [56]:
def _titled_corrplot(frame, title):
    """Clear the current figure, draw a correlation plot of `frame` on it,
    set `title` on the resulting axes and return those axes."""
    plt.clf()
    axes = seaborn.corrplot(frame, diag_names=False)
    axes.set_title(title)
    return axes


# Split the events by the value of their 'valid' column.
df_grouped = df3.groupby('valid')
df_0, df_1 = df_grouped.get_group(0), df_grouped.get_group(1)

# Vector components against 'valid'.
ax = _titled_corrplot(df2[v_features + ['valid']], 'vectors')
nbio.show(plt.gcf())

# Summary features against 'valid'.
ax = _titled_corrplot(df2[features + ['valid']], 'all')
nbio.show(plt.gcf())

# Summary features among the valid == 0 events only.
ax = _titled_corrplot(df_0[features], 'not vehicle')
nbio.show(plt.gcf())

# Summary features among the valid == 1 events only.
ax = _titled_corrplot(df_1[features], 'vehicle')
nbio.show(plt.gcf())
In [57]:
# One box plot per vector component, grouped by the 'valid' column. The y-axis
# is clipped to the 1st..99th percentile of the component, and those two
# limits are printed before the figure is shown.
for e in v_features:
    column = df3[e]
    plt.clf()
    seaborn.boxplot(column, df3['valid'])
    ax = plt.gca()
    lo, hi = np.percentile(column, (1, 99))
    print('%s %s' % (lo, hi))
    ax.set_ylim(lo, hi)
    nbio.show(plt.gcf())
In [ ]: