In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os

In [2]:
import sys
sys.path.append('..')

Read precomputed features


In [3]:
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')
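
CachedDataLoader comes from the repo's common/data.py. The '.hkl' file names and the "Loaded ... in 0s" log lines below suggest hickle-backed memoization; a minimal sketch of that idea (class and method names here are illustrative, not the repo's actual code):

In [ ]:
import os
import time
import hickle  # assumption: the .hkl cache files are hickle dumps

class SimpleCachedLoader(object):  # hypothetical stand-in, not the repo class
    def __init__(self, cache_dir):
        self.cache_dir = cache_dir

    def load(self, name, compute_fn):
        path = os.path.join(self.cache_dir, name + '.hkl')
        t0 = time.time()
        if os.path.exists(path):
            data = hickle.load(path)   # cache hit: read precomputed features
        else:
            data = compute_fn()        # cache miss: compute, then store
            hickle.dump(data, path)
        print 'Loaded %s in %ds' % (path, time.time() - t0)
        return data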

In [24]:
target = 'Dog_4'

In [25]:
pdata = cached_data_loader.load('data_preictal_%s_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600'%target,None)


Loaded ../data-cache/data_preictal_Dog_4_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600.hkl in 0s

In [26]:
ndata = cached_data_loader.load('data_interictal_%s_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600'%target,None)


Loaded ../data-cache/data_interictal_Dog_4_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600.hkl in 0s

In [27]:
pdata.X.shape, ndata.X.shape


Out[27]:
((257, 3072), (804, 3072))

In [28]:
X = np.concatenate((pdata.X, ndata.X))

In [29]:
y = np.zeros(X.shape[0])
y[:pdata.X.shape[0]] = 1

In [39]:
import random
idxs = list(range(len(y)))  # list() so the in-place shuffle also works under Python 3
random.shuffle(idxs)
X = X[idxs,:]
y = y[idxs]
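
A seedable NumPy-only version of the same shuffle (illustrative alternative to the cell above; the seed value is arbitrary):

In [ ]:
rng = np.random.RandomState(42)  # fixed seed makes the shuffle reproducible
perm = rng.permutation(len(y))
X = X[perm,:]
y = y[perm]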

Predict


In [40]:
from sklearn.ensemble import RandomForestClassifier
# note: min_samples_split=1 is accepted by the older scikit-learn used here;
# current versions require a value >= 2
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10,
                             n_jobs=-1)

In [41]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score

skf = StratifiedKFold(y, n_folds=3)
y_all_proba = np.zeros(y.shape)
y_all_count = np.zeros(y.shape)

for train, test in skf:
    clf.fit(X[train,:],y[train])
    y_proba = clf.predict_proba(X[test,:])[:,1]
    auc = roc_auc_score(y[test], y_proba)
    y_all_proba[test] += y_proba
    y_all_count[test] += 1
    print auc


0.970583130857
0.96528982992
0.982221246708
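
Each sample lands in exactly one test fold, so y_all_count ends up all ones and the division below is just a safety net. For reference, newer scikit-learn (>= 0.18, where cross_validation became model_selection) can produce the same out-of-fold probabilities in one call; a sketch:

In [ ]:
from sklearn.model_selection import StratifiedKFold as SKF, cross_val_predict
y_oof = cross_val_predict(clf, X, y, cv=SKF(n_splits=3), method='predict_proba')[:,1]
print roc_auc_score(y, y_oof)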

In [42]:
y_all_proba /= y_all_count

In [43]:
roc_auc_score(y, y_all_proba)


Out[43]:
0.97301914551754853

In [44]:
len(y_all_proba)


Out[44]:
1061

In [45]:
def calchist(y_r, y_v, bins=1000):
    """Per-bin fraction of positive samples -- an empirical calibration histogram."""
    N = len(y_r)
    assert len(y_v) == N
    # clip so a prediction of exactly 1.0 does not index past the last bin
    bin_idx = np.minimum((bins * y_v).astype(int), bins - 1)
    tbins = np.bincount(bin_idx, minlength=bins).astype(float)             # all samples per bin
    pbins = np.bincount(bin_idx[y_r > 0.5], minlength=bins).astype(float)  # positives per bin
    return pbins / tbins  # NaN where a bin is empty
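
Empty probability bins give 0/0 = NaN, which matplotlib skips when plotting; to silence the expected runtime warnings the call can be wrapped like this (illustrative):

In [ ]:
with np.errstate(invalid='ignore', divide='ignore'):
    hist = calchist(y, y_all_proba, bins=50)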

In [61]:
hist = calchist(y,y_all_proba,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_all_proba,bins=50)
pl.gcf().set_size_inches(12.,5.);

[figure: per-bin positive fraction (left) and histogram of raw out-of-fold probabilities (right)]
In [62]:
from sklearn.isotonic import IsotonicRegression as IR
ir = IR(out_of_bounds='clip')  # clip scores outside the range seen during fit
ir.fit(y_all_proba,y)
y_calibrated = ir.transform(y_all_proba)
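
IsotonicRegression learns a monotone step function from scores to calibrated probabilities; fitting the calibrator on the same out-of-fold predictions it then transforms is mildly optimistic. scikit-learn also ships a reliability curve that does roughly what calchist does (sketch, assuming sklearn >= 0.16 where calibration_curve was added):

In [ ]:
from sklearn.calibration import calibration_curve
frac_pos, mean_pred = calibration_curve(y, y_calibrated, n_bins=10)
pl.plot(mean_pred, frac_pos, 'o-')
pl.plot([0, 1], [0, 1], '--')  # perfectly calibrated reference line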

In [64]:
hist = calchist(y,y_calibrated,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_calibrated,bins=50)
pl.gcf().set_size_inches(12.,5.);

[figure: per-bin positive fraction (left) and histogram of isotonic-calibrated probabilities (right)]
In [67]:
from sklearn.linear_model import LogisticRegression as LR

lr = LR()
lr.fit(y_all_proba.reshape(-1, 1), y)     # LR needs X to be 2-dimensional
y_calibrated = lr.predict_proba(y_all_proba.reshape(-1, 1))[:,1]
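
This is Platt scaling: a logistic sigmoid p = 1/(1 + exp(-(a*s + b))) fitted to the raw scores s. The two fitted parameters can be read off the model (illustrative):

In [ ]:
a, b = lr.coef_[0, 0], lr.intercept_[0]
print 'sigmoid: p = 1/(1 + exp(-(%.3f*s + %.3f)))' % (a, b)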

In [68]:
hist = calchist(y,y_calibrated,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_calibrated,bins=50)
pl.gcf().set_size_inches(12.,5.);

[figure: per-bin positive fraction (left) and histogram of Platt-calibrated probabilities (right)]
In [ ]: