In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import pickle  # cPickle was merged into pickle in Python 3
import pandas as pd
import numpy as np
import os
In [2]:
import sys
sys.path.append('..')
In [3]:
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')
In [24]:
target = 'Dog_4'
In [25]:
# preictal (pre-seizure) feature windows for the target subject
pdata = cached_data_loader.load('data_preictal_%s_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600'%target,None)
In [26]:
# interictal (baseline) feature windows for the same subject
ndata = cached_data_loader.load('data_interictal_%s_gen2_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600'%target,None)
In [27]:
pdata.X.shape, ndata.X.shape
Out[27]:
In [28]:
X = np.concatenate((pdata.X, ndata.X))  # preictal rows first, then interictal
In [29]:
y = np.zeros(X.shape[0])
y[:pdata.X.shape[0]] = 1  # preictal rows (the first block of X) get label 1
In [39]:
import random
idxs = list(range(len(y)))  # range() is lazy in Python 3, so materialize it before shuffling
random.shuffle(idxs)
X = X[idxs, :]
y = y[idxs]
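An equivalent, reproducible way to do this shuffle is a seeded NumPy permutation; the seed below is an arbitrary addition for illustration, not part of the original run:
In [ ]:
rng = np.random.RandomState(42)  # arbitrary seed, for reproducibility only
perm = rng.permutation(len(y))   # same effect as random.shuffle on an index list
X = X[perm, :]
y = y[perm]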
In [40]:
from sklearn.ensemble import RandomForestClassifier
# min_samples_split must be >= 2 in current scikit-learn; a one-sample node
# cannot be split anyway, so this does not change the trees
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=2, max_depth=10,
                             n_jobs=-1)
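As a cheaper alternative to the explicit cross-validation loop below, a random forest can score every training sample out-of-bag in a single fit. This is a sketch of that option, not what the original run used:
In [ ]:
from sklearn.metrics import roc_auc_score
clf_oob = RandomForestClassifier(n_estimators=3000, max_depth=10,
                                 oob_score=True, n_jobs=-1)
clf_oob.fit(X, y)
y_oob = clf_oob.oob_decision_function_[:, 1]  # P(preictal) from trees that never saw each sample
print(roc_auc_score(y, y_oob))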
In [41]:
from sklearn.model_selection import StratifiedKFold  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.metrics import roc_auc_score
skf = StratifiedKFold(n_splits=3)
y_all_proba = np.zeros(y.shape)  # out-of-fold probability for each sample
y_all_count = np.zeros(y.shape)  # how many times each sample was scored
for train, test in skf.split(X, y):
    clf.fit(X[train, :], y[train])
    y_proba = clf.predict_proba(X[test, :])[:, 1]
    auc = roc_auc_score(y[test], y_proba)
    y_all_proba[test] += y_proba
    y_all_count[test] += 1
    print(auc)  # per-fold AUC
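With a single pass of 3-fold CV each sample lands in exactly one test fold, so every count should be one and the division in the next cell is just a safeguard; a quick assertion (added here, not in the original) makes that explicit:
In [ ]:
assert (y_all_count == 1).all()  # every sample was scored exactly once out-of-fold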
In [42]:
y_all_proba /= y_all_count  # average in case any sample was scored more than once
In [43]:
roc_auc_score(y, y_all_proba)
Out[43]:
In [44]:
len(y_all_proba)
Out[44]:
In [45]:
def calchist(y_r, y_v, bins=1000):
    # reliability histogram: fraction of true positives in each predicted-probability bin
    N = len(y_r)
    assert len(y_v) == N
    bin_idx = np.minimum((bins * y_v).astype(int), bins - 1)  # clip so y_v == 1.0 falls in the last bin
    tbins = np.bincount(bin_idx, minlength=bins).astype(float)             # samples per bin
    pbins = np.bincount(bin_idx[y_r > 0.5], minlength=bins).astype(float)  # positives per bin
    return pbins / tbins  # empty bins give NaN, which matplotlib skips when plotting
In [61]:
hist = calchist(y,y_all_proba,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_all_proba,bins=50)
pl.gcf().set_size_inches(12.,5.);
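scikit-learn's calibration_curve computes essentially the same reliability statistic and can serve as a cross-check on calchist; its binning conventions differ slightly (empty bins are dropped), so expect similar rather than identical numbers:
In [ ]:
from sklearn.calibration import calibration_curve
prob_true, prob_pred = calibration_curve(y, y_all_proba, n_bins=50)
pl.plot(prob_pred, prob_true, marker='.')
pl.plot([0, 1], [0, 1], '--')  # perfect-calibration reference line
pl.xlabel('mean predicted probability')
pl.ylabel('fraction of preictal samples');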
In [62]:
from sklearn.isotonic import IsotonicRegression as IR
ir = IR(out_of_bounds='clip')  # clip scores outside the fitted range instead of raising
ir.fit(y_all_proba, y)  # learn a monotone map from out-of-fold scores to observed labels
y_calibrated = ir.transform(y_all_proba)
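To see the monotone step function isotonic regression has learned, one can push a grid of probe scores through the fitted model (purely illustrative, not part of the original run):
In [ ]:
probe = np.linspace(0, 1, 101)
pl.plot(probe, ir.transform(probe))  # learned mapping from raw score to calibrated probability
pl.xlabel('raw out-of-fold probability')
pl.ylabel('calibrated probability');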
In [64]:
hist = calchist(y,y_calibrated,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_calibrated,bins=50)
pl.gcf().set_size_inches(12.,5.);
In [67]:
from sklearn.linear_model import LogisticRegression as LR
lr = LR()
lr.fit(y_all_proba.reshape(-1, 1), y)  # LR needs X to be 2-dimensional
y_calibrated = lr.predict_proba(y_all_proba.reshape(-1, 1))[:, 1]
In [68]:
hist = calchist(y,y_calibrated,bins=50)
pl.subplot(121)
pl.plot(hist)
pl.subplot(122)
pl.hist(y_calibrated,bins=50)
pl.gcf().set_size_inches(12.,5.);
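A Brier score comparison quantifies whether either calibrator actually moved the probabilities closer to the observed outcomes; this is an added check, and since the isotonic map is monotone it leaves AUC unchanged, so the Brier score is the metric a calibrator can improve:
In [ ]:
from sklearn.metrics import brier_score_loss
print('raw      ', brier_score_loss(y, y_all_proba))
print('isotonic ', brier_score_loss(y, ir.transform(y_all_proba)))
print('platt    ', brier_score_loss(y, y_calibrated))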
In [ ]: