In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random
from collections import defaultdict
In [2]:
import sys
sys.path.append('..')
uncomment the relevant pipeline in ../seizure_detection.py
and run
cd ..
./doall data
In [3]:
# Name of the pre-computed feature set to load from the data cache
# (produced by the pipeline selected in ../seizure_detection.py).
FEATURES = 'gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'
In [4]:
from common.data import CachedDataLoader
# Serves pre-computed feature matrices from the repository's cache directory.
cached_data_loader = CachedDataLoader('../data-cache')
In [5]:
def read_data(target, data_type):
    """Load the cached feature data for one subject.

    target -- subject name, e.g. 'Dog_1'
    data_type -- 'preictal' or 'interictal'
    Returns whatever object the cache stored under the composed key
    (an object with .X and .latencies, judging by the usage below).
    """
    cache_key = 'data_%s_%s_%s' % (data_type, target, FEATURES)
    return cached_data_loader.load(cache_key, None)
In [8]:
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20
# (replaced by sklearn.model_selection); this notebook targets an old sklearn.
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression as LR
# Shared classifier reused (re-fit) for every CV split below.
# NOTE(review): min_samples_split=1 is only accepted by old sklearn versions
# (later versions require >= 2); bootstrap=False trains each tree on the
# full training set.
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False,max_depth=5,
                             n_jobs=-1) #
In [9]:
# If True, weight positive training examples by their within-event latency
# when fitting the classifier (see the CV loop below).
with_weights = False
# Prefix used when naming the results pickle written after CV.
suffix = 'fft.'
Compute AUC for each target separately
In [10]:
target2iter2ys = {}
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
# positive examples
pdata = read_data(target, 'preictal')
Np, NF = pdata.X.shape
t = NF//3
assert 3*t == NF
mask = range(t-5,t) + range(2*t-5,2*t) + range(3*t-5,3*t)
# split the positive examples into segments, each from the same event
# in each CV-split we will take all examples from the same segment to either train or validate
segments = []
start = 0
last_l = 0
for i,l in enumerate(pdata.latencies):
if l<last_l:
segments.append(np.arange(start,i))
start = i
last_l = l
segments.append(np.arange(start,i+1))
Ns = len(segments)
# negative examples
ndata = read_data(target, 'interictal')
Nn = ndata.X.shape[0]
npratio = float(Nn)/Np
print target,1/(1+npratio),Ns,Np,Nn
iter2ys = defaultdict(list) # {niter: Ns *[(ytest,y_proba)]
for s in segments:
for niter in range(3):
# each time, take one segment for testing and randomly pick negative examples
Xtestp = pdata.X[s,:]
weightstest = pdata.latencies[s] # latency for first segment is 1
Ntrainp = len(s)
Ntrainn = int(Ntrainp*npratio)
n = np.array(random.sample(xrange(Nn),Ntrainn))
Xtestn = ndata.X[n,:]
Xtrainp = pdata.X[-s,:]
weightsp = pdata.latencies[-s] # latency for first segment is 1
Xtrainn = ndata.X[-n,:]
weightsn = np.ones(Xtrainn.shape[0])
Xtrain = np.concatenate((Xtrainp,Xtrainn))
weights = np.concatenate((weightsp,weightsn))
ytrain = np.concatenate((np.ones(Ntrainp),np.zeros(Ntrainn)))
perm = np.random.permutation(len(ytrain))
ytrain = ytrain[perm]
Xtrain = Xtrain[perm,:]
weights = weights[perm]
Xtest = np.concatenate((Xtestp,Xtestn))
ytest = np.concatenate((np.ones(Xtestp.shape[0]),np.zeros(Xtestn.shape[0])))
if with_weights:
clf.fit(Xtrain[:,mask], ytrain, sample_weight=weights)
else:
clf.fit(Xtrain[:,mask], ytrain)
y_proba = clf.predict_proba(Xtest[:,mask])[:,1]
iter2ys[niter].append((ytest, y_proba))
auc = roc_auc_score(ytest, y_proba)
print '%.3f'%auc,Ntrainp,np.mean(weightstest)
target2iter2ys[target] = iter2ys
print
In [11]:
# Persist the CV predictions so they can be scored/compared in later cells.
fname = '../data-cache/140907-CV.%s%s.pkl'%(suffix, FEATURES)
with open(fname,'wb') as fp:
    pickle.dump(target2iter2ys,fp,-1)  # -1: use the highest pickle protocol
In [12]:
fname
Out[12]:
Generate a single AUC score
In [23]:
def p(a,b):
    """Render two scores (e.g. pooled AUC and mean AUC) in thousandths,
    formatted as '<a*1000> E<b*1000>' with truncation to integers."""
    pooled = int(1000 * a)
    expected = int(1000 * b)
    return '%d E%d' % (pooled, expected)
# Aggregate the saved CV results for several experiment variants and print,
# per iteration / per target / overall, both the pooled AUC (all predictions
# concatenated) and the mean of per-segment AUCs, via p() above.
for f in [
    'max_depth2.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'max_depth5.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'max_depth5.p15.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'max_depth5.p5.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'max_depth5.p59.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'max_depth10.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'n3.gen8_medianwindow1-fft-with-time-freq-corr-1-48-r400-usf-w600',
#     'n3.1.gen8_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600',
#     'n3.w.gen8_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600',
#     'n3.w.gen16_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600',
]:
    # accumulators over every target in this results file
    all_ytest = all_y_proba =None
    all_aucs = []
    with open('../data-cache/140907-CV.%s.pkl'%f,'rb') as fp:
        target2iter2ys = pickle.load(fp)
    # NOTE: .iteritems()/.itervalues() below are Python 2 only
    for target, iter2ys in target2iter2ys.iteritems():
        # accumulators over every CV iteration of this target
        target_ytest = target_y_proba =None
        target_aucs = []
        print target,
        for ys in iter2ys.itervalues():
            # pool every held-out segment within this CV iteration
            ytest = y_proba =None
            aucs = []
            for y in ys:
                yt, yp = y
                ytest = yt if ytest is None else np.concatenate((ytest,yt))
                y_proba = yp if y_proba is None else np.concatenate((y_proba,yp))
                aucs.append(roc_auc_score(yt, yp))
            # iteration-level: pooled AUC next to mean per-segment AUC
            print p(roc_auc_score(ytest, y_proba), np.mean(aucs)),
            target_aucs += aucs
            target_ytest = ytest if target_ytest is None else np.concatenate((target_ytest,ytest))
            target_y_proba = y_proba if target_y_proba is None else np.concatenate((target_y_proba,y_proba))
        # target-level summary
        print target,p(roc_auc_score(target_ytest, target_y_proba),np.mean(target_aucs))
        all_aucs += target_aucs
        all_ytest = target_ytest if all_ytest is None else np.concatenate((all_ytest,target_ytest))
        all_y_proba = target_y_proba if all_y_proba is None else np.concatenate((all_y_proba,target_y_proba))
#         if target == 'Dog_3':
#             pl.hist(target_aucs,alpha=0.5)
    # file-level summary across all targets
    print f,p(roc_auc_score(all_ytest, all_y_proba),np.mean(all_aucs))
    print
In [12]:
NF
Out[12]:
In [ ]: