In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random
from collections import defaultdict
In [2]:
import sys
sys.path.append('..')  # make the project root importable (for common.data below)
Uncomment the relevant pipeline in ../seizure_detection.py and run:
cd ..
./doall data
In [3]:
FEATURES1 = 'gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'
In [4]:
FEATURES2 = 'gen-8_medianwindow-timecorr-usf-w60-0.1-0.5-0.9'
In [5]:
from common.data import CachedDataLoader
# serves feature files previously computed into ../data-cache (by the ./doall run above)
cached_data_loader = CachedDataLoader('../data-cache')
In [6]:
def read_data(target, data_type, features):
    """Return cached features for one subject / segment type / feature pipeline.

    target: subject name, e.g. 'Dog_1'
    data_type: 'preictal' or 'interictal'
    features: feature-pipeline name used to build the cache key
    """
    cache_key = 'data_%s_%s_%s' % (data_type, target, features)
    return cached_data_loader.load(cache_key, None)
In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression as LR
# NOTE(review): min_samples_split=1 and the sklearn.cross_validation module are only
# accepted by old scikit-learn releases (modern versions require min_samples_split >= 2
# and moved cross_validation to model_selection).
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False,max_depth=10,
n_jobs=-1) #
In [8]:
with_weights = False  # if True, pass per-row sample weights (positive latencies) to clf.fit
suffix = 'max_depth10'  # tag embedded in the output pickle filename below
Compute AUC for each target separately
In [9]:
target2iter2ys = {}
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
# positive examples
pdata1 = read_data(target, 'preictal', FEATURES1)
Np, NF1 = pdata1.X.shape
pdata2 = read_data(target, 'preictal', FEATURES2)
Xp = np.hstack((pdata1.X, pdata2.X))
assert np.all(pdata1.latencies == pdata2.latencies)
latencies = pdata1.latencies
# split the positive examples into segments, each from the same event
# in each CV-split we will take all examples from the same segment to either train or validate
segments = []
start = 0
last_l = 0
for i,l in enumerate(latencies):
if l<last_l:
segments.append(np.arange(start,i))
start = i
last_l = l
segments.append(np.arange(start,i+1))
Ns = len(segments)
# negative examples
ndata1 = read_data(target, 'interictal', FEATURES1)
ndata2 = read_data(target, 'interictal', FEATURES2)
Nn = ndata1.X.shape[0]
Xn = np.hstack((ndata1.X, ndata2.X))
npratio = float(Nn)/Np
print target,1/(1+npratio),Ns,Np,Nn
iter2ys = defaultdict(list) # {niter: Ns *[(ytest,y_proba)]
for s in segments:
for niter in range(3):
# each time, take one segment for testing and randomly pick negative examples
Xtestp = Xp[s,:]
weightstest = latencies[s] # latency for first segment is 1
Ntrainp = len(s)
Ntrainn = int(Ntrainp*npratio)
n = np.array(random.sample(xrange(Nn),Ntrainn))
Xtestn = Xn[n,:]
Xtrainp = Xp[-s,:]
weightsp = latencies[-s] # latency for first segment is 1
Xtrainn = Xn[-n,:]
weightsn = np.ones(Xtrainn.shape[0])
Xtrain = np.concatenate((Xtrainp,Xtrainn))
weights = np.concatenate((weightsp,weightsn))
ytrain = np.concatenate((np.ones(Ntrainp),np.zeros(Ntrainn)))
perm = np.random.permutation(len(ytrain))
ytrain = ytrain[perm]
Xtrain = Xtrain[perm,:]
weights = weights[perm]
Xtest = np.concatenate((Xtestp,Xtestn))
ytest = np.concatenate((np.ones(Xtestp.shape[0]),np.zeros(Xtestn.shape[0])))
if with_weights:
clf.fit(Xtrain, ytrain, sample_weight=weights)
else:
clf.fit(Xtrain, ytrain)
y_proba = clf.predict_proba(Xtest)[:,1]
iter2ys[niter].append((ytest, y_proba))
auc = roc_auc_score(ytest, y_proba)
print '%.3f'%auc,Ntrainp,np.mean(weightstest)
target2iter2ys[target] = iter2ys
print
In [10]:
Xp.shape,Xn.shape
Out[10]:
In [11]:
# Persist the per-target CV results; the scoring section below reads this file back.
# NOTE(review): ':' in the filename is not valid on Windows filesystems.
fname = '../data-cache/140924-CV-combine2.%s:%s:%s.pkl' % (suffix, FEATURES1, FEATURES2)
with open(fname, 'wb') as out:
    pickle.dump(target2iter2ys, out, -1)  # -1 = highest pickle protocol
In [12]:
fname
Out[12]:
Generate a single AUC score
In [13]:
def p(a, b):
    """Render two score fractions scaled to permille as '<1000a> E<1000b>'."""
    scaled = (1000 * a, 1000 * b)
    return '%d E%d' % scaled
for f in [
'max_depth10:gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9:gen-8_medianwindow-timecorr-usf-w60-0.1-0.5-0.9',
]:
all_ytest = all_y_proba =None
all_aucs = []
with open('../data-cache/140924-CV-combine2.%s.pkl'%f,'rb') as fp:
target2iter2ys = pickle.load(fp)
for target, iter2ys in target2iter2ys.iteritems():
target_ytest = target_y_proba =None
target_aucs = []
print target,
for ys in iter2ys.itervalues():
ytest = y_proba =None
aucs = []
for y in ys:
yt, yp = y
ytest = yt if ytest is None else np.concatenate((ytest,yt))
y_proba = yp if y_proba is None else np.concatenate((y_proba,yp))
aucs.append(roc_auc_score(yt, yp))
print p(roc_auc_score(ytest, y_proba), np.mean(aucs)),
target_aucs += aucs
target_ytest = ytest if target_ytest is None else np.concatenate((target_ytest,ytest))
target_y_proba = y_proba if target_y_proba is None else np.concatenate((target_y_proba,y_proba))
print target,p(roc_auc_score(target_ytest, target_y_proba),np.mean(target_aucs))
all_aucs += target_aucs
all_ytest = target_ytest if all_ytest is None else np.concatenate((all_ytest,target_ytest))
all_y_proba = target_y_proba if all_y_proba is None else np.concatenate((all_y_proba,target_y_proba))
# if target == 'Dog_3':
# pl.hist(target_aucs,alpha=0.5)
print f,p(roc_auc_score(all_ytest, all_y_proba),np.mean(all_aucs))
print
In [13]: