In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random

In [2]:
import sys

# Put the parent directory on the import path (presumably where the `common`
# package imported in a later cell lives). The membership guard keeps the cell
# idempotent: re-running it no longer appends a duplicate '..' entry each time.
if '..' not in sys.path:
    sys.path.append('..')

Read precomputed features

Uncomment the relevant pipeline in ../seizure_detection.py and run

cd ..
./doall data

or

./doall td
./doall tt

In [3]:
# Name of the primary cached feature set. read_data() embeds it in the cache
# key, and a later cell parses its dash-separated 'b...' and 'w...' tokens to
# derive nbands and nwindows.
FEATURES = 'gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70'

In [4]:
# Name of the secondary cached feature set. The prediction loop loads it as X1
# and process() thresholds it (> 5000) to decide which windows get their band
# values replaced.
FEATURES1 = 'gen-8_maxdiff-60'

In [5]:
# Walk the dash-separated tokens of FEATURES once:
#   * every token starting with 'b' is counted (presumably a band edge -- TODO confirm),
#   * a token starting with 'w' carries the window count as its numeric suffix.
band_edges = 0
nwindows = 0
for token in FEATURES.split('-'):
    prefix, rest = token[0], token[1:]
    if prefix == 'w':
        nwindows = int(rest)
    elif prefix == 'b':
        band_edges += 1

# One fewer band than counted 'b' tokens.
nbands = band_edges - 1
nbands, nwindows


Out[5]:
(5, 60)

In [6]:
# Default for process()'s `nunits` argument: each sample's nwindows windows are
# split (in storage order) into this many groups, and the percentile features
# are taken separately inside each group.
NUNITS = 2

In [7]:
from common.data import CachedDataLoader
# Shared loader constructed with '../data-cache' (presumably the cache
# directory -- TODO confirm); read_data() calls its load() method.
cached_data_loader = CachedDataLoader('../data-cache')

In [8]:
def read_data(target, data_type, features=FEATURES):
    """Fetch one precomputed feature set from the shared cache.

    The cache key is ``data_<data_type>_<target>_<features>``; it is printed
    before loading so the notebook output shows which entries were read.

    Parameters
    ----------
    target : subject identifier, e.g. 'Dog_1'.
    data_type : data split name; this notebook uses 'preictal',
        'interictal' and 'test'.
    features : feature-set name, defaults to FEATURES.

    Returns
    -------
    Whatever ``cached_data_loader.load(key, None)`` returns. Callers in this
    notebook read its ``.X`` attribute.
    """
    cache_key = 'data_%s_%s_%s' % (data_type, target, features)
    print(cache_key)
    # The second argument is passed as None -- NOTE(review): presumably "no
    # fallback computation, cache only"; confirm against CachedDataLoader.
    return cached_data_loader.load(cache_key, None)

Predict


In [9]:
from sklearn.ensemble import RandomForestClassifier
# Random forest shared by all targets; the prediction loop refits it for each one.
# min_samples_split used to be 1, which current scikit-learn rejects (an integer
# value must be >= 2). A node holding a single sample can never be split, so 2
# is the smallest meaningful setting and should not change the fitted trees.
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=2, bootstrap=False, max_depth=10,
                             n_jobs=-1)

In [10]:
# Open the submission CSV for writing and emit its header row. The handle stays
# open across cells: the prediction loop appends rows and a later cell closes it.
fpout = open('../submissions/141107-predict.1.csv', 'w')
fpout.write('clip,preictal' + '\n')

In [11]:
def process(X, X1, percentile=(0.05, 0.95), nunits=NUNITS):
    """Turn per-window band features into per-unit percentile features.

    Each row of ``X`` is reshaped to ``(nunits, nw, -1)`` with
    ``nw = nwindows // nunits``. Windows flagged by ``X1`` are repaired, the
    windows of every unit are sorted independently per feature column, and the
    values at the requested percentiles are kept.

    Parameters
    ----------
    X : 2-D array, one row per sample. The row length is taken to be
        ``nchannels * nbands * nwindows`` (``nbands`` and ``nwindows`` are
        module-level values).
    X1 : 2-D array with the same number of rows; each row is reshaped to
        ``(nunits, nw, -1)`` and its last axis is indexed by channel.
        Presumably the per-window, per-channel maximum -- TODO confirm against
        the feature pipeline.
    percentile : iterable of fractions; ``int(p * nw)`` selects the position
        in the sorted windows.
    nunits : number of groups the windows of one sample are split into.

    Returns
    -------
    2-D array with one row per sample; each row is the flattened
    ``(nunits, len(percentile) * features_per_window)`` block.

    Notes
    -----
    Unlike the previous version, ``X`` is left untouched: the repair step
    works on a copy instead of writing through a reshaped view of the input.
    """
    N, Nf = X.shape
    print('# samples %s # power points %s' % (N, Nf))
    # Floor division keeps nchannels an int for range() even under true division.
    nchannels = Nf // (nbands * nwindows)
    print('# channels %s' % nchannels)

    nw = nwindows // nunits  # windows per unit; loop-invariant
    # Clamp so that a fraction of 1.0 selects the last window instead of
    # indexing one past the end.
    picks = [min(int(p * nw), nw - 1) for p in percentile]

    newX = []
    for i in range(N):
        # Copy: reshape() of a row is a view, and the repair below assigns into it.
        windows = X[i, :].reshape((nunits, nw, -1)).copy()
        mask = X1[i, :].reshape((nunits, nw, -1))
        for j in range(nunits):
            for k in range(nchannels):
                m = mask[j, :, k] > 5000  # windows considered too large for this channel
                # Repair only when some, but not all, windows are flagged: at
                # least one good window is needed to supply replacement values.
                if np.any(m) and not np.all(m):
                    bands = slice(k * nbands, (k + 1) * nbands)
                    # Overwrite the flagged windows' bands with the mean of the
                    # same bands over the unflagged windows of this unit.
                    windows[j, m, bands] = np.mean(windows[j, ~m, bands], axis=0)
        sorted_windows = np.sort(windows, axis=1)
        features = np.concatenate([sorted_windows[:, idx, :] for idx in picks], axis=-1)
        newX.append(features.ravel())

    return np.array(newX)

In [12]:
# For every subject: build the training matrix, fit the forest, score the test
# clips and append one CSV row per clip to the submission file.
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
    # Primary features: positives (preictal) stacked on top of negatives (interictal).
    pdata = read_data(target, 'preictal')
    ndata = read_data(target, 'interictal')
    X = np.concatenate((pdata.X, ndata.X))

    # Secondary features in the same row order; process() uses them as the mask source.
    pdata1 = read_data(target, 'preictal', FEATURES1)
    ndata1 = read_data(target, 'interictal', FEATURES1)
    X1 = np.concatenate((pdata1.X, ndata1.X))

    X = process(X, X1)

    # Labels follow the stacking order: the leading preictal rows are 1, the rest 0.
    n_positive = pdata.X.shape[0]
    y = np.zeros(X.shape[0])
    y[:n_positive] = 1

    # Shuffle samples and labels with one shared random order.
    order = list(range(len(y)))
    random.shuffle(order)
    X, y = X[order, :], y[order]

    clf.fit(X, y)

    # Score the test clips with the freshly fitted model.
    tdata = read_data(target, 'test')
    tdata1 = read_data(target, 'test', FEATURES1)
    Xt = process(tdata.X, tdata1.X)

    y_proba = clf.predict_proba(Xt)[:, 1]  # second probability column

    # One row per test clip; clips are numbered from 1 in the order they were loaded.
    for segment_no, proba in enumerate(y_proba, 1):
        fpout.write('%s_test_segment_%04d.mat,%.15f' % (target, segment_no, proba) + '\n')


data_preictal_Dog_1_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_interictal_Dog_1_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_preictal_Dog_1_gen-8_maxdiff-60
data_interictal_Dog_1_gen-8_maxdiff-60
# samples 664 # power points 4800
# channels 16
data_test_Dog_1_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_test_Dog_1_gen-8_maxdiff-60
# samples 502 # power points 4800
# channels 16
data_preictal_Dog_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_interictal_Dog_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_preictal_Dog_2_gen-8_maxdiff-60
data_interictal_Dog_2_gen-8_maxdiff-60
# samples 822 # power points 4800
# channels 16
data_test_Dog_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_test_Dog_2_gen-8_maxdiff-60
# samples 1000 # power points 4800
# channels 16
data_preictal_Dog_3_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_interictal_Dog_3_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_preictal_Dog_3_gen-8_maxdiff-60
data_interictal_Dog_3_gen-8_maxdiff-60
# samples 1992 # power points 4800
# channels 16
data_test_Dog_3_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_test_Dog_3_gen-8_maxdiff-60
# samples 907 # power points 4800
# channels 16
data_preictal_Dog_4_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_interictal_Dog_4_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_preictal_Dog_4_gen-8_maxdiff-60
data_interictal_Dog_4_gen-8_maxdiff-60
# samples 1541 # power points 4800
# channels 16
data_test_Dog_4_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_test_Dog_4_gen-8_maxdiff-60
# samples 990 # power points 4800
# channels 16
data_preictal_Dog_5_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_interictal_Dog_5_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_preictal_Dog_5_gen-8_maxdiff-60
data_interictal_Dog_5_gen-8_maxdiff-60
# samples 680 # power points 4500
# channels 15
data_test_Dog_5_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_test_Dog_5_gen-8_maxdiff-60
# samples 191 # power points 4500
# channels 15
data_preictal_Patient_1_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_interictal_Patient_1_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_preictal_Patient_1_gen-8_maxdiff-60
data_interictal_Patient_1_gen-8_maxdiff-60
# samples 188 # power points 4500
# channels 15
data_test_Patient_1_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_test_Patient_1_gen-8_maxdiff-60
# samples 195 # power points 4500
# channels 15
data_preictal_Patient_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_interictal_Patient_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_preictal_Patient_2_gen-8_maxdiff-60
data_interictal_Patient_2_gen-8_maxdiff-60
# samples 180 # power points 7200
# channels 24
data_test_Patient_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_test_Patient_2_gen-8_maxdiff-60
# samples 150 # power points 7200
# channels 24

In [13]:
# Close the submission file opened in an earlier cell so all buffered rows reach disk.
fpout.close()

In [13]: