In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os

In [2]:
import sys
sys.path.append('..')

Read precomputed features


In [3]:
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')
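
CachedDataLoader comes from the repo's common/data.py. The '.hkl' file names and the "Loaded ... in 0s" log lines below suggest hickle-backed memoization; a minimal sketch of that idea (class and method names here are illustrative, not the repo's actual code):

In [ ]:
import os
import time
import hickle  # assumption: the .hkl cache files are hickle dumps

class SimpleCachedLoader(object):  # hypothetical stand-in, not the repo class
    def __init__(self, cache_dir):
        self.cache_dir = cache_dir

    def load(self, name, compute_fn):
        path = os.path.join(self.cache_dir, name + '.hkl')
        t0 = time.time()
        if os.path.exists(path):
            data = hickle.load(path)   # cache hit: read precomputed features
        else:
            data = compute_fn()        # cache miss: compute, then store
            hickle.dump(data, path)
        print 'Loaded %s in %ds' % (path, time.time() - t0)
        return data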

In [24]:
target = 'Dog_4'

In [25]:
pdata = cached_data_loader.load('data_preictal_%s_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600'%target,None)


Loaded ../data-cache/data_preictal_Dog_4_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600.hkl in 0s

In [26]:
ndata = cached_data_loader.load('data_interictal_%s_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600'%target,None)


Loaded ../data-cache/data_interictal_Dog_4_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600.hkl in 0s

In [27]:
pdata.X.shape, ndata.X.shape


Out[27]:
((257, 3072), (804, 3072))

In [28]:
X = np.concatenate((pdata.X, ndata.X))

In [29]:
y = np.zeros(X.shape[0])
y[:pdata.X.shape[0]] = 1

In [39]:
import random
idxs = list(range(len(y)))  # list() so the in-place shuffle also works under Python 3
random.shuffle(idxs)
X = X[idxs,:]
y = y[idxs]
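
A seedable NumPy-only version of the same shuffle (illustrative alternative to the cell above; the seed value is arbitrary):

In [ ]:
rng = np.random.RandomState(42)  # fixed seed makes the shuffle reproducible
perm = rng.permutation(len(y))
X = X[perm,:]
y = y[perm]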

Predict


In [40]:
from sklearn.ensemble import RandomForestClassifier
# note: min_samples_split=1 is accepted by the older scikit-learn used here;
# current versions require a value >= 2
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10,
                             n_jobs=-1)

In [41]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score

skf = StratifiedKFold(y, n_folds=3)
y_all_proba = np.zeros(y.shape)
y_all_count = np.zeros(y.shape)

for train, test in skf:
    clf.fit(X[train,:],y[train])
    y_proba = clf.predict_proba(X[test,:])[:,1]
    auc = roc_auc_score(y[test], y_proba)
    y_all_proba[test] += y_proba
    y_all_count[test] += 1
    print auc


0.970583130857
0.96528982992
0.982221246708
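
Each sample lands in exactly one test fold, so y_all_count ends up all ones and the division below is just a safety net. For reference, newer scikit-learn (>= 0.18, where cross_validation became model_selection) can produce the same out-of-fold probabilities in one call; a sketch:

In [ ]:
from sklearn.model_selection import StratifiedKFold as SKF, cross_val_predict
y_oof = cross_val_predict(clf, X, y, cv=SKF(n_splits=3), method='predict_proba')[:,1]
print roc_auc_score(y, y_oof)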

In [42]:
y_all_proba /= y_all_count

In [43]:
roc_auc_score(y, y_all_proba)


Out[43]:
0.97301914551754853

In [44]:
len(y_all_proba)


Out[44]:
1061

In [45]:
def calchist(y_r, y_v, bins=1000):
    """Per-bin fraction of positive samples -- an empirical calibration histogram."""
    N = len(y_r)
    assert len(y_v) == N
    # clip so a prediction of exactly 1.0 does not index past the last bin
    bin_idx = np.minimum((bins * y_v).astype(int), bins - 1)
    tbins = np.bincount(bin_idx, minlength=bins).astype(float)             # all samples per bin
    pbins = np.bincount(bin_idx[y_r > 0.5], minlength=bins).astype(float)  # positives per bin
    return pbins / tbins  # NaN where a bin is empty
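
Empty probability bins give 0/0 = NaN, which matplotlib skips when plotting; to silence the expected runtime warnings the call can be wrapped like this (illustrative):

In [ ]:
with np.errstate(invalid='ignore', divide='ignore'):
    hist = calchist(y, y_all_proba, bins=50)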

In [61]:
hist = calchist(y,y_all_proba,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_all_proba,bins=50)
pl.gcf().set_size_inches(12.,5.);

[figure: per-bin positive fraction (left) and histogram of raw out-of-fold probabilities (right)]
In [62]:
from sklearn.isotonic import IsotonicRegression as IR
ir = IR(out_of_bounds='clip')  # clip scores outside the range seen during fit
ir.fit(y_all_proba,y)
y_calibrated = ir.transform(y_all_proba)
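
IsotonicRegression learns a monotone step function from scores to calibrated probabilities; fitting the calibrator on the same out-of-fold predictions it then transforms is mildly optimistic. scikit-learn also ships a reliability curve that does roughly what calchist does (sketch, assuming sklearn >= 0.16 where calibration_curve was added):

In [ ]:
from sklearn.calibration import calibration_curve
frac_pos, mean_pred = calibration_curve(y, y_calibrated, n_bins=10)
pl.plot(mean_pred, frac_pos, 'o-')
pl.plot([0, 1], [0, 1], '--')  # perfectly calibrated reference line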

In [64]:
hist = calchist(y,y_calibrated,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_calibrated,bins=50)
pl.gcf().set_size_inches(12.,5.);

[figure: per-bin positive fraction (left) and histogram of isotonic-calibrated probabilities (right)]
In [67]:
from sklearn.linear_model import LogisticRegression as LR

lr = LR()
lr.fit(y_all_proba.reshape(-1, 1), y)     # LR needs X to be 2-dimensional
y_calibrated = lr.predict_proba(y_all_proba.reshape(-1, 1))[:,1]
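
This is Platt scaling: a logistic sigmoid p = 1/(1 + exp(-(a*s + b))) fitted to the raw scores s. The two fitted parameters can be read off the model (illustrative):

In [ ]:
a, b = lr.coef_[0, 0], lr.intercept_[0]
print 'sigmoid: p = 1/(1 + exp(-(%.3f*s + %.3f)))' % (a, b)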

In [68]:
hist = calchist(y,y_calibrated,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_calibrated,bins=50)
pl.gcf().set_size_inches(12.,5.);

[figure: per-bin positive fraction (left) and histogram of Platt-calibrated probabilities (right)]
In [ ]: