Run a Random Forest on all data: the training set plus the test set labelled with the best test result so far


In [24]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random

In [25]:
import sys 
# Add the parent directory to the module search path; the `common` package
# imported further down is presumably located there.
sys.path.append('..')

Read precomputed features

Uncomment the relevant pipeline in ../seizure_detection.py and run

cd ..
./doall data

or

./doall td
./doall tt

In [26]:
# Identifier of the precomputed feature set; read_data() embeds it in the
# name of every cache entry it loads.
FEATURES = 'gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'

In [27]:
from common.data import CachedDataLoader
# Shared loader that read_data() uses to fetch cache entries by name.
# NOTE(review): the argument is presumably the cache directory — confirm
# against common.data.CachedDataLoader.
cached_data_loader = CachedDataLoader('../data-cache')

In [28]:
def read_data(target, data_type, features):
    """Load one cached data set through the module-level ``cached_data_loader``.

    The cache entry name is ``data_<data_type>_<target>_<features>``; it is
    printed before loading so progress shows up in the cell output.

    target    -- subject name (the notebook passes e.g. 'Dog_1')
    data_type -- kind of data (the notebook passes 'preictal',
                 'interictal' or 'test')
    features  -- feature-set identifier (the notebook passes FEATURES)

    Returns whatever ``cached_data_loader.load(name, None)`` returns.
    """
    name_parts = (data_type, target, features)
    cache_name = 'data_%s_%s_%s' % name_parts
    print(cache_name)
    return cached_data_loader.load(cache_name, None)

In [29]:
# Best submission so far, as a Series of predicted probabilities indexed by
# clip file name (squeeze=True collapses the single remaining column).
best = pd.read_csv(
    '../submissions/141029-predict.10.csv',
    index_col='clip',
    squeeze=True,
)

In [30]:
def prb2logit(x):
    """Map probabilities to log-odds: log(x / (1 - x)).

    Works on scalars and element-wise on NumPy arrays / pandas Series.
    Inputs of exactly 0 or 1 give -inf / +inf, so callers clip first.
    """
    odds = x / (1. - x)
    return np.log(odds)
def logit2prb(x):
    """Map log-odds back to probabilities: 1 / (1 + exp(-x)).

    Inverse of prb2logit; works on scalars and element-wise on arrays.
    """
    denominator = 1 + np.exp(-x)
    return 1. / denominator

In [31]:
# Histogram of the best submission's predictions on the logit scale;
# probabilities are clipped to [0.02, 0.95] first so the logit stays finite.
prb2logit(np.clip(best,0.02,0.95)).hist(bins=50)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x11562fcd0>

Predict


In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Random-forest regressor; the prediction loop refits it for every subject.
# - min_samples_split=2: the previous value 1 is rejected by scikit-learn
#   >= 0.18, and a one-sample node can never be split anyway, so this is the
#   same stopping rule.
# - random_state is pinned so that re-running the notebook reproduces the
#   same submission file.
clf = RandomForestRegressor(n_estimators=3000, min_samples_split=2, bootstrap=False, max_depth=10,
                            n_jobs=-1, random_state=0)  # , max_features=15

In [34]:
# Open the new submission file and write the CSV header. The handle stays
# open across cells: the prediction loop appends one row per test clip and a
# later cell closes it.
fpout = open('../submissions/141101-predict.4.csv','w')
print >>fpout,'clip,preictal'

In [35]:
# Refit per subject on all data: the labelled training clips plus the test
# clips, whose regression targets are taken from the best earlier submission
# (`best`). The fitted forest then re-predicts those same test clips.
# prb2logit / logit2prb are defined once in an earlier cell and are
# intentionally not redefined here.
SMOOTH = 0.        # weight of the training-set prior mixed into the test targets
TRAIN_LOGIT = 5.   # training labels 1/0 become +/-TRAIN_LOGIT on the logit scale
TEST_MIN = 0.05    # `best` probabilities are clipped to [TEST_MIN, TEST_MAX]
TEST_MAX = 0.9     # before the logit transform so the targets stay finite

for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
    pdata0 = read_data(target, 'preictal', FEATURES) # positive examples
    ndata0 = read_data(target, 'interictal', FEATURES) # negative examples
    X0 = np.concatenate((pdata0.X, ndata0.X))
    y0 = np.zeros(X0.shape[0])
    y0[:pdata0.X.shape[0]] = 1  # positives were concatenated first
    y0logit = (y0*2.-1.)*TRAIN_LOGIT # turn training labels from 1/0 to +/-TRAIN_LOGIT

    # targets for the test clips, looked up by clip name in `best`
    tdata = read_data(target, 'test', FEATURES) # test examples
    Xt = tdata.X
    Nt = Xt.shape[0]
    # assumes row i of tdata.X corresponds to test segment i+1 — TODO confirm
    yt = np.array([best['%s_test_segment_%04d.mat' % (target, i+1)] for i in range(Nt)])
    yt = prb2logit(np.clip(yt, TEST_MIN, TEST_MAX))
    if SMOOTH:
        # Evaluate the prior only when it is actually mixed in: for a subject
        # with a single class its logit is infinite, and 0 * inf would turn
        # every target into NaN even though SMOOTH is 0.
        yt = yt*(1.-SMOOTH) + SMOOTH*prb2logit(y0.mean())

    # fit on training + test rows together
    X = np.concatenate((X0,Xt))
    y = np.concatenate((y0logit, yt))
    clf.fit(X,y)

    # predict the test clips and map the logits back to probabilities
    y_proba_logit = clf.predict(Xt)
    y_proba = logit2prb(y_proba_logit)

    # write results
    for i,p in enumerate(y_proba):
        print >>fpout,'%s_test_segment_%04d.mat,%.15f' % (target, i+1, p)


data_preictal_Dog_1_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_interictal_Dog_1_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_test_Dog_1_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_preictal_Dog_2_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_interictal_Dog_2_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_test_Dog_2_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_preictal_Dog_3_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_interictal_Dog_3_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_test_Dog_3_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_preictal_Dog_4_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_interictal_Dog_4_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_test_Dog_4_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_preictal_Dog_5_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_interictal_Dog_5_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_test_Dog_5_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_preictal_Patient_1_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_interictal_Patient_1_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_test_Patient_1_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_preictal_Patient_2_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_interictal_Patient_2_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9
data_test_Patient_2_gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9

In [36]:
# Close the submission file so every buffered row is flushed to disk
# before the file is inspected.
fpout.close()

In [37]:
!head ../submissions/141101-predict.4.csv


clip,preictal
Dog_1_test_segment_0001.mat,0.659886022674020
Dog_1_test_segment_0002.mat,0.318131558270606
Dog_1_test_segment_0003.mat,0.128956856228520
Dog_1_test_segment_0004.mat,0.456104807924915
Dog_1_test_segment_0005.mat,0.295515867515557
Dog_1_test_segment_0006.mat,0.432390997320269
Dog_1_test_segment_0007.mat,0.171492493505366
Dog_1_test_segment_0008.mat,0.456104807924915
Dog_1_test_segment_0009.mat,0.187827592788726

In [ ]: