With 140925-metainfo I found that the negative examples (interictal) also come in sequences. I fixed gen_ictal
so that it also generates inter-sequence segments for the negative examples.
In [2]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random
from collections import defaultdict
In [3]:
import sys
# Put the parent directory on the module search path
# (presumably where the `common` package imported further down lives -- confirm).
sys.path.append('..')
Uncomment the relevant pipeline in ../seizure_detection.py
and run
cd ..
./doall data
In [1]:
# Identifier of the feature set. It is interpolated into the cache key built by
# read_data() and into the file name of the pickled CV results.
FEATURES = 'gen-8_medianwindow-bands2-usf-w10-hammingP2-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'
In [4]:
from common.data import CachedDataLoader
# Loader constructed with the '../data-cache' path; read_data() calls its .load().
# NOTE(review): presumably this is the directory holding the cached feature files -- confirm.
cached_data_loader = CachedDataLoader('../data-cache')
In [5]:
def read_data(target, data_type):
    """Load the cached data of one target / data type for the current FEATURES.

    Parameters
    ----------
    target : str
        Subject name interpolated into the cache key (e.g. 'Dog_1').
    data_type : str
        Data class interpolated into the cache key (e.g. 'preictal').

    Returns
    -------
    Whatever ``cached_data_loader.load`` returns for the key
    ``data_<data_type>_<target>_<FEATURES>``.
    """
    cache_key = 'data_%s_%s_%s' % (data_type, target, FEATURES)
    # The second argument is passed as None.
    # NOTE(review): presumably "no function to compute the entry on a cache
    # miss" -- confirm against CachedDataLoader.load.
    return cached_data_loader.load(cache_key, None)
In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression as LR
# Random-forest classifier that is re-fitted for every cross-validation fold.
# min_samples_split is 2, the smallest integer scikit-learn >= 0.18 accepts
# (the former value 1 is rejected when fit() is called).  A node holding a
# single sample can never be split, so the grown trees are the same as with 1.
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=2, bootstrap=False, max_depth=5,
                             n_jobs=-1)  # min_samples_leaf=4
In [7]:
# When True the classifier is fitted with per-sample weights built from
# PWEIGHT and LWEIGHT; when False it is fitted unweighted.
with_weights = False
# Base weight of every positive training row (negative rows get weight 1).
PWEIGHT = 6.
# Additional weight per unit of (latency - 1), added to positive training rows.
LWEIGHT = 0.
# Tag placed in front of FEATURES in the name of the pickled CV results file.
suffix = 'max_depth5'
Split the examples into segments, each coming from the same event. In each CV split, all examples from the same segment go either to training or to validation.
In [8]:
def getsegments(pdata):
    """Group consecutive rows of ``pdata`` into segments using their latencies.

    A new segment starts at every row whose latency is smaller than the
    latency of the row before it (the first row is compared against 0).

    Parameters
    ----------
    pdata : object
        Must expose ``latencies``, an iterable with one value per row.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``object``.  Element ``k`` is the integer index
        array (``np.arange(start, stop)``) of the rows in segment ``k``.
        The array is empty when there are no latencies.  It is always 1-D,
        even when all segments happen to have the same length, so ``len()``,
        iteration and fancy indexing (``segments[idx]``) behave uniformly.
    """
    segments = []
    start = 0
    last_l = 0
    n_rows = 0
    for i, l in enumerate(pdata.latencies):
        if l < last_l:
            # Latency dropped: the rows [start, i) form a finished segment.
            segments.append(np.arange(start, i))
            start = i
        last_l = l
        n_rows = i + 1
    if n_rows:
        # Close the segment that runs up to the last row.
        segments.append(np.arange(start, n_rows))

    # Fill an object array element by element.  np.array(segments) would
    # raise on ragged input in current NumPy and would collapse equal-length
    # segments into a 2-D integer array.
    result = np.empty(len(segments), dtype=object)
    for k, seg in enumerate(segments):
        result[k] = seg
    return result
Compute AUC for each target separately
In [9]:
import itertools
# Cross-validation, run independently for every target.  Each fold holds out
# one positive (preictal) sequence plus a random sample of negative
# (interictal) sequences, trains on all remaining rows and scores the held-out
# rows.  Every fold is repeated 3 times with a fresh negative sample.
#
# Result: target2iter2ys[target][niter] is a list holding one
# (ytest, y_proba) pair per held-out positive sequence.
target2iter2ys = {}
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
    # positive examples
    pdata = read_data(target, 'preictal')
    Np, NF = pdata.X.shape
    psegments = getsegments(pdata)
    Nps = len(psegments)
    # negative examples
    ndata = read_data(target, 'interictal')
    Nn, NF = ndata.X.shape
    nsegments = getsegments(ndata)
    Nns = len(nsegments)

    # Fraction of positives, first counted in rows and then in sequences.
    npratio = float(Nn)/Np
    print('%s %s %s %s' % (target, 1/(1+npratio), Np, Nn))
    npsratio = float(Nns)/Nps
    print('%s %s %s %s' % (target, 1/(1+npsratio), Nps, Nns))

    # One positive sequence is held out per fold; hold out negative sequences
    # in the same negative-to-positive proportion.
    Ntrainps = 1
    Ntrainns = int(Ntrainps*npsratio)

    platencies = np.asarray(pdata.latencies)
    iter2ys = defaultdict(list)  # {niter: Nps * [(ytest, y_proba)]}
    for s in psegments:
        # Training rows are the complement of the held-out rows, expressed as
        # a boolean mask.  Negating an integer index array (X[-s]) does NOT
        # exclude those rows: it indexes from the end of the array and even
        # keeps row 0 in both the training and the test set.
        ptrain = np.ones(Np, dtype=bool)
        ptrain[s] = False
        Xtestp = pdata.X[s,:]
        Xtrainp = pdata.X[ptrain,:]
        weightstest = platencies[s]  # latency for first segment is 1
        Ntrainp = len(s)  # number of held-out positive rows (printed below)
        for niter in range(3):
            # Negatives are held out by whole sequence rather than by row.
            ns = np.array(random.sample(range(Nns), Ntrainns), dtype=int)
            # .ravel does not work - elements of nsegments are not of equal length
            n = np.array(list(itertools.chain(*nsegments[ns])), dtype=int)
            ntrain = np.ones(Nn, dtype=bool)
            ntrain[n] = False
            Xtestn = ndata.X[n,:]
            Xtrainn = ndata.X[ntrain,:]

            Xtrain = np.concatenate((Xtrainp, Xtrainn))
            ytrain = np.concatenate((np.ones(Xtrainp.shape[0]), np.zeros(Xtrainn.shape[0])))
            # Shuffle so that positives and negatives are interleaved.
            perm = np.random.permutation(len(ytrain))
            ytrain = ytrain[perm]
            Xtrain = Xtrain[perm,:]

            Xtest = np.concatenate((Xtestp, Xtestn))
            ytest = np.concatenate((np.ones(Xtestp.shape[0]), np.zeros(Xtestn.shape[0])))

            if with_weights:
                weightsp = PWEIGHT*np.ones(Xtrainp.shape[0])
                # latency for first segment is 1, so it gets no extra weight
                weightsp += LWEIGHT * (platencies[ptrain]-1.)
                weightsn = np.ones(Xtrainn.shape[0])
                weights = np.concatenate((weightsp, weightsn))
                weights = weights[perm]  # keep weights aligned with the shuffled rows
                clf.fit(Xtrain, ytrain, sample_weight=weights)
            else:
                clf.fit(Xtrain, ytrain)

            y_proba = clf.predict_proba(Xtest)[:,1]
            iter2ys[niter].append((ytest, y_proba))
            auc = roc_auc_score(ytest, y_proba)
            print('%.3f %s %s' % (auc, Ntrainp, np.mean(weightstest)))
    target2iter2ys[target] = iter2ys
    print('')
In [10]:
# Store the collected CV predictions; the file name combines `suffix` and
# FEATURES so that runs with different settings do not overwrite each other.
fname = '../data-cache/140926-CV.%s%s.pkl' % (suffix, FEATURES)
with open(fname, 'wb') as results_file:
    # HIGHEST_PROTOCOL is what the protocol value -1 selects.
    pickle.dump(target2iter2ys, results_file, pickle.HIGHEST_PROTOCOL)
In [11]:
# Show the path of the results file in the cell output.
fname
Out[11]:
Generate a single AUC score
In [12]:
def p(a,b):
    """Format two scores as ``'<a*1000> E<b*1000>'``, truncated to integers.

    In the scoring loop ``a`` is the AUC of the pooled predictions and ``b``
    the mean of the per-fold AUCs.
    """
    scaled_a = 1000*a
    scaled_b = 1000*b
    return '%d E%d' % (scaled_a, scaled_b)
# Score every stored CV run.  For each results file one line is written per
# target: the target name, one "pooled E mean" pair per repetition, then the
# target name again with the pair over all repetitions.  A final line gives
# the file tag with the pair over all targets, followed by a blank line.
# "pooled" is the AUC of the concatenated predictions, "mean" the average of
# the per-fold AUCs (see p()).
def _extend(acc, new):
    """Return ``new`` when nothing is accumulated yet, else ``acc`` followed by ``new``."""
    if acc is None:
        return new
    return np.concatenate((acc, new))

emit = sys.stdout.write
for f in [
    'max_depth5gen-8_medianwindow-bands2-usf-w10-hammingP2-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'max_depth10gen-8_medianwindow-bandsI2-usf-w60-b0.2-b4-b8-b12-b30-b50-b75-b100-b117-0.1-0.5-0.9',
    'max_depth5gen-8_medianwindow-bandsI2-usf-w60-b0.2-b4-b8-b12-b30-b50-b75-b100-b117-0.1-0.5-0.9',
    'max_depth5.min_samples_leaf4.gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
    'max_depth5.gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
]:
    all_ytest = None
    all_y_proba = None
    all_aucs = []
    with open('../data-cache/140926-CV.%s.pkl'%f,'rb') as fp:
        target2iter2ys = pickle.load(fp)

    for target, iter2ys in target2iter2ys.items():
        target_ytest = None
        target_y_proba = None
        target_aucs = []
        emit('%s' % (target,))
        for ys in iter2ys.values():
            # One repetition: score every fold, then pool its predictions.
            aucs = [roc_auc_score(yt, yp) for yt, yp in ys]
            ytest = None
            y_proba = None
            for yt, yp in ys:
                ytest = _extend(ytest, yt)
                y_proba = _extend(y_proba, yp)
            emit(' ' + p(roc_auc_score(ytest, y_proba), np.mean(aucs)))
            target_aucs += aucs
            target_ytest = _extend(target_ytest, ytest)
            target_y_proba = _extend(target_y_proba, y_proba)
        emit(' %s %s\n' % (target, p(roc_auc_score(target_ytest, target_y_proba), np.mean(target_aucs))))
        all_aucs += target_aucs
        all_ytest = _extend(all_ytest, target_ytest)
        all_y_proba = _extend(all_y_proba, target_y_proba)
        # if target == 'Dog_3':
        #     pl.hist(target_aucs,alpha=0.5)
    emit('%s %s\n' % (f, p(roc_auc_score(all_ytest, all_y_proba), np.mean(all_aucs))))
    emit('\n')
In [ ]: