In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random
In [2]:
import sys
sys.path.append('..')
Uncomment the relevant pipeline in ../seizure_detection.py, then from the project root run:
cd ..
./doall data
or
./doall td
./doall tt
In [3]:
# Primary feature-set identifier; must match a pipeline already run via ./doall.
# Encoding: 'wNN' = number of windows per segment, 'b...' tokens are band
# boundaries (presumably in Hz -- confirm against ../seizure_detection.py).
FEATURES = 'gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70'
In [4]:
# Secondary feature set used as an artifact mask in process():
# per-window 'maxdiff' values over the same 60 windows.
# NOTE(review): presumably one value per channel per window -- verify
# against the generating pipeline.
FEATURES1 = 'gen-8_maxdiff-60'
In [5]:
# Recover the band and window counts from the FEATURES naming convention:
# each 'b...' token is a band edge (one fewer band than edges, hence the
# final decrement) and the 'wNN' token carries the window count.
nbands = 0
nwindows = 0
for token in FEATURES.split('-'):
    if token.startswith('w'):
        nwindows = int(token[1:])
    elif token.startswith('b'):
        nbands += 1
nbands -= 1
nbands, nwindows
Out[5]:
In [6]:
# Number of contiguous sub-units each segment's windows are split into when
# computing percentile features (see process()).
NUNITS = 2
In [7]:
from common.data import CachedDataLoader
# Disk-backed loader for precomputed feature matrices under ../data-cache
# (populated by the ./doall runs described at the top of the notebook).
cached_data_loader = CachedDataLoader('../data-cache')
In [8]:
def read_data(target, data_type, features=FEATURES):
fname = 'data_%s_%s_%s'%(data_type,target,features)
print fname
return cached_data_loader.load(fname,None)
In [9]:
from sklearn.ensemble import RandomForestClassifier
# One shared forest, refit per subject in the training loop below:
# no bootstrap, depth capped at 10, all cores (n_jobs=-1).
# NOTE(review): min_samples_split=1 is only accepted by old scikit-learn
# releases (modern versions require >= 2); in effect it behaves like 2.
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False,max_depth=10,
                             n_jobs=-1)#
In [10]:
# Open the Kaggle submission file; the header row must be exactly
# 'clip,preictal'.  Closed in the final cell.
fpout = open('../submissions/141112-predict.2.csv','w')
print >>fpout,'clip,preictal'
In [11]:
def process(X, X1, percentile=[0.05,0.95], nunits=NUNITS, nwindows=nwindows):
    """Collapse per-window band powers into per-unit percentile features.

    X          -- (N, Nf) band-power matrix; Nf = nchannels*nbands*nwindows
                  (layout inferred from the reshapes below -- TODO confirm
                  against the generating pipeline)
    X1         -- companion matrix from FEATURES1; presumably one max-diff
                  value per channel per window, used to flag artifact windows
    percentile -- which sorted-window quantiles to keep per unit/feature
    nunits     -- sub-units per segment; nwindows must be divisible by it
    nwindows   -- windows per segment

    Returns a (N, nunits*len(percentile)*nchannels*nbands) array.

    NOTE(review): the artifact-repair assignment below writes through a
    reshape view of X, so X appears to be modified in place -- callers
    should not reuse the raw X afterwards.  Python 2 semantics are relied
    on (`print` statement, integer `/` division).
    """
    N, Nf = X.shape
    print '# samples',N,'# power points', Nf
    # Python 2 integer division: Nf is expected to be an exact multiple.
    nchannels = Nf / (nbands*nwindows)
    print '# channels', nchannels
    newX = []
    for i in range(N):
        nw = nwindows//nunits
        windows = X[i,:].reshape((nunits,nw,-1))
        mask = X1[i,:].reshape((nunits,nw,-1)) # max value for each channel
        for j in range(nunits):
            for k in range(nchannels):
                # Threshold of 5000 flags artifact windows -- magic number,
                # presumably tuned on this dataset.
                m = mask[j,:,k] > 5000 # find large windows
                if np.any(m):
                    # print 'FIX', sum(m)
                    if not np.all(m): # make sure we had at least one good window so we can re use its values
                        # replace the bands of a large windows with the mean of the bands in all other windows
                        windows[j,m,k*nbands:(k+1)*nbands] = np.mean(windows[j,~m,k*nbands:(k+1)*nbands], axis=0)
        # Sort windows within each unit so fixed indices become quantiles.
        sorted_windows = np.sort(windows, axis=1)
        features = np.concatenate([sorted_windows[:,int(p*nw),:] for p in percentile], axis=-1)
        newX.append(features.ravel())
    newX = np.array(newX)
    return newX
In [23]:
def bunch(X, nwindows=None):
    """Smooth features along the window axis of a flattened feature matrix.

    Each window is replaced by the elementwise product of itself and its two
    successors; the last two positions are padded by repeating the final two
    products so the output keeps the input's shape.

    X        -- (N, nwindows * nfeatures) matrix
    nwindows -- windows per sample; defaults to the notebook-level
                `nwindows` (looked up lazily at call time instead of being
                bound at definition time, so this cell no longer depends on
                the order the configuration cells were executed in)

    Returns an array with the same shape as X.
    """
    if nwindows is None:
        nwindows = globals()['nwindows']
    N, Nf = X.shape
    W = X.reshape((N, nwindows, -1))
    # Product of each run of three consecutive windows: shape (N, nwindows-2, f).
    smoothed = W[:, :-2, :] * W[:, 1:-1, :] * W[:, 2:, :]
    # Repeat the last two products to restore the original window count.
    padded = np.concatenate((smoothed, smoothed[:, -2:, :]), axis=1)
    return padded.reshape((N, -1))
In [26]:
# Train one model per subject and append its test-set predictions to the
# submission file.  Per subject: X0 = raw band-power features, X1 = the
# 3-window-product smoothed version (bunch), Xm = maxdiff features used by
# process() to repair artifact windows.
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
    pdata = read_data(target, 'preictal') # positive examples
    ndata = read_data(target, 'interictal') # negative examples
    X0 = np.concatenate((pdata.X, ndata.X))
    # NOTE(review): bunch() must run before process() -- process() appears
    # to patch its input in place through a reshape view.
    X1 = bunch(X0,nwindows)
    pdata1 = read_data(target, 'preictal', FEATURES1) # positive examples
    ndata1 = read_data(target, 'interictal', FEATURES1) # negative examples
    Xm = np.concatenate((pdata1.X, ndata1.X))
    X0 = process(X0, Xm)
    X1 = process(X1, Xm)
    X = np.hstack((X0, X1))
    # Labels: preictal rows come first in the concatenation above.
    y = np.zeros(X.shape[0])
    y[:pdata.X.shape[0]] = 1
    # shuffle (Python 2: range() returns a list, so shuffle works in place;
    # NOTE(review): no seed is set, so runs are not reproducible)
    idxs=range(len(y))
    random.shuffle(idxs)
    X = X[idxs,:]
    y = y[idxs]
    clf.fit(X,y)
    # predict on the held-out test segments with the same feature pipeline
    tdata = read_data(target, 'test') # test examples
    Xt0 = tdata.X
    Xt1 = bunch(Xt0,nwindows)
    tdata1 = read_data(target, 'test', FEATURES1) # test examples
    Xtm = tdata1.X
    Xt = np.hstack((process(Xt0, Xtm), process(Xt1, Xtm)))
    y_proba = clf.predict_proba(Xt)[:,1]
    # write results, one row per test clip in Kaggle's expected naming scheme
    for i,p in enumerate(y_proba):
        print >>fpout,'%s_test_segment_%04d.mat,%.15f' % (target, i+1, p)
In [27]:
# Flush and close the submission file so it is complete on disk.
fpout.close()
In [ ]: