In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random
from collections import defaultdict

In [2]:
import sys
# Make the parent directory importable; presumably this is where the
# project-local `common` package (imported further down) lives.
sys.path.append('..')

Read precomputed features

Uncomment the relevant pipeline in ../seizure_detection.py and run:

cd ..
./doall data

In [3]:
# Name of the first precomputed feature set; used as part of the cache key in
# read_data() and in the results file name.
FEATURES1 = 'gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'

In [4]:
# Name of the second precomputed feature set; its columns are stacked next to
# those of FEATURES1 when the design matrices are built.
FEATURES2 = 'gen-8_medianwindow-timecorr-usf-w60-0.1-0.5-0.9'

In [5]:
from common.data import CachedDataLoader
# Shared loader used by read_data(); presumably '../data-cache' is the directory
# holding the feature files produced by the pipeline run described above.
cached_data_loader = CachedDataLoader('../data-cache')

In [6]:
def read_data(target, data_type, features):
    """Load one cached feature set through the shared ``cached_data_loader``.

    target    -- subject name, e.g. 'Dog_1'
    data_type -- example class, e.g. 'preictal' or 'interictal'
    features  -- feature-set name, e.g. FEATURES1

    Returns whatever the loader yields for the key
    'data_<data_type>_<target>_<features>'; callers in this notebook read
    ``.X`` (and, for preictal data, ``.latencies``) from it.
    """
    cache_key = 'data_%s_%s_%s' % (data_type, target, features)
    # The second argument is passed as None, exactly as before
    # (NOTE(review): presumably "no function to compute on a cache miss" -- confirm).
    return cached_data_loader.load(cache_key, None)

Predict


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression as LR

# Random forest reused for every cross-validation fit below: 3000 trees,
# depth capped at 10, bootstrap disabled, all cores (n_jobs=-1).
# min_samples_split must be an integer >= 2 in scikit-learn >= 0.18; 2 grows
# exactly the same trees as the former value 1, because a node holding a
# single sample can never be split.
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=2, bootstrap=False, max_depth=10,
                             n_jobs=-1)

In [8]:
# When True, clf.fit() in the cross-validation loop is given per-sample weights
# (latencies for positive examples, 1 for negative examples).
with_weights = False
# Tag embedded in the name of the pickle file the CV results are written to.
suffix = 'max_depth10'

Compute AUC for each target separately


In [9]:
target2iter2ys = {}
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
    # positive examples
    pdata1 = read_data(target, 'preictal', FEATURES1)
    Np, NF1 = pdata1.X.shape

    pdata2 = read_data(target, 'preictal', FEATURES2)
    Xp = np.hstack((pdata1.X, pdata2.X))
    
    assert np.all(pdata1.latencies == pdata2.latencies)
    latencies = pdata1.latencies

    # split the positive examples into segments, each from the same event
    # in each CV-split we will take all examples from the same segment to either train or validate
    segments = []
    start = 0
    last_l = 0
    for i,l in enumerate(latencies):
        if l<last_l:
            segments.append(np.arange(start,i))
            start = i
        last_l = l
    segments.append(np.arange(start,i+1))
    Ns = len(segments)

    # negative examples
    ndata1 = read_data(target, 'interictal', FEATURES1)
    ndata2 = read_data(target, 'interictal', FEATURES2)
    Nn = ndata1.X.shape[0]
    Xn = np.hstack((ndata1.X, ndata2.X))

    
    npratio = float(Nn)/Np
    print target,1/(1+npratio),Ns,Np,Nn

    iter2ys = defaultdict(list) # {niter: Ns *[(ytest,y_proba)]
    for s in segments:
        for niter in range(3):
            # each time, take one segment for testing and randomly pick negative examples
            Xtestp = Xp[s,:]
            weightstest = latencies[s] # latency for first segment is 1

            Ntrainp = len(s)
            Ntrainn = int(Ntrainp*npratio)
            n = np.array(random.sample(xrange(Nn),Ntrainn))
            Xtestn = Xn[n,:]

            Xtrainp = Xp[-s,:]
            weightsp = latencies[-s] # latency for first segment is 1
            Xtrainn = Xn[-n,:]
            weightsn = np.ones(Xtrainn.shape[0]) 

            Xtrain = np.concatenate((Xtrainp,Xtrainn))
            weights = np.concatenate((weightsp,weightsn))
            ytrain = np.concatenate((np.ones(Ntrainp),np.zeros(Ntrainn)))
            perm = np.random.permutation(len(ytrain))
            ytrain = ytrain[perm]
            Xtrain = Xtrain[perm,:]
            weights = weights[perm]

            Xtest = np.concatenate((Xtestp,Xtestn))
            ytest = np.concatenate((np.ones(Xtestp.shape[0]),np.zeros(Xtestn.shape[0])))

            if with_weights:
                clf.fit(Xtrain, ytrain, sample_weight=weights)
            else:
                clf.fit(Xtrain, ytrain)

            y_proba = clf.predict_proba(Xtest)[:,1]
            iter2ys[niter].append((ytest, y_proba))
            
            auc = roc_auc_score(ytest, y_proba)
            print '%.3f'%auc,Ntrainp,np.mean(weightstest)
    target2iter2ys[target] = iter2ys
    print


Dog_1 0.277108433735 4 184 480
0.769 46 3.5
0.791 46 3.5
0.784 46 3.5
0.434 46 3.5
0.473 46 3.5
0.539 46 3.5
0.674 46 3.5
0.643 46 3.5
0.569 46 3.5
0.573 46 3.5
0.495 46 3.5
0.392 46 3.5

Dog_2 0.391727493917 7 322 500
0.998 46 3.5
0.996 46 3.5
0.994 46 3.5
0.905 46 3.5
0.981 46 3.5
0.963 46 3.5
0.981 46 3.5
0.979 46 3.5
0.967 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.985 46 3.5
1.000 46 3.5
1.000 46 3.5
0.997 46 3.5
1.000 46 3.5
1.000 46 3.5
0.753 46 3.5
0.772 46 3.5
0.653 46 3.5

Dog_3 0.277108433735 12 552 1440
0.891 46 3.5
0.890 46 3.5
0.918 46 3.5
0.763 46 3.5
0.607 46 3.5
0.551 46 3.5
0.881 46 3.5
0.814 46 3.5
0.748 46 3.5
0.780 46 3.5
0.621 46 3.5
0.685 46 3.5
0.749 46 3.5
0.838 46 3.5
0.924 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.967 46 3.5
0.968 46 3.5
0.986 46 3.5
0.768 46 3.5
0.741 46 3.5
0.765 46 3.5
0.754 46 3.5
0.563 46 3.5
0.755 46 3.5
0.594 46 3.5
0.568 46 3.5
0.491 46 3.5
0.369 46 3.5
0.343 46 3.5
0.260 46 3.5

Dog_4 0.478260869565 17 737 804
0.975 46 3.5
1.000 46 3.5
1.000 46 3.5
0.453 46 3.5
0.580 46 3.5
0.451 46 3.5
0.596 46 3.5
0.602 46 3.5
0.652 46 3.5
0.470 37 4.0
0.592 37 4.0
0.718 37 4.0
0.489 19 5.0
0.545 19 5.0
0.832 19 5.0
0.728 46 3.5
0.647 46 3.5
0.735 46 3.5
0.695 46 3.5
0.614 46 3.5
0.686 46 3.5
0.897 37 4.0
0.908 37 4.0
0.893 37 4.0
0.391 46 3.5
0.268 46 3.5
0.382 46 3.5
0.398 46 3.5
0.285 46 3.5
0.416 46 3.5
0.610 46 3.5
0.858 46 3.5
0.847 46 3.5
0.811 46 3.5
0.800 46 3.5
0.608 46 3.5
0.363 46 3.5
0.411 46 3.5
0.486 46 3.5
0.626 46 3.5
0.604 46 3.5
0.385 46 3.5
0.325 46 3.5
0.525 46 3.5
0.322 46 3.5
0.352 46 3.5
0.348 46 3.5
0.480 46 3.5
0.802 46 3.5
0.772 46 3.5
0.814 46 3.5

Dog_5 0.338235294118 5 230 450
0.919 46 3.5
0.812 46 3.5
0.904 46 3.5
0.993 46 3.5
0.995 46 3.5
0.992 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.983 46 3.5
0.981 46 3.5
0.979 46 3.5
0.789 46 3.5
0.789 46 3.5
0.818 46 3.5

Patient_1 0.734042553191 3 138 50
0.633 46 3.5
0.781 46 3.5
0.924 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.790 46 3.5
0.938 46 3.5
0.898 46 3.5

Patient_2 0.766666666667 3 138 42
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.843 46 3.5
0.784 46 3.5
0.881 46 3.5


In [10]:
# Shapes of the positive / negative feature matrices left over from the last
# target processed by the loop above.
Xp.shape,Xn.shape


Out[10]:
((138, 1260), (42, 1260))

In [11]:
# Persist the per-target CV predictions; the file name encodes the run tag and
# both feature-set names so that different configurations do not collide.
fname = '../data-cache/140924-CV-combine2.%s:%s:%s.pkl'%(suffix, FEATURES1,FEATURES2)
with open(fname,'wb') as out_file:
    # HIGHEST_PROTOCOL is what a negative protocol number selects
    pickle.dump(target2iter2ys, out_file, pickle.HIGHEST_PROTOCOL)

In [12]:
# Show the path the results were written to.
fname


Out[12]:
'../data-cache/140924-CV-combine2.max_depth10:gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9:gen-8_medianwindow-timecorr-usf-w60-0.1-0.5-0.9.pkl'

Generate a single AUC score


In [13]:
def p(a,b):
    """Format two scores as per-mille integers: '<1000*a> E<1000*b>'.

    Both values are multiplied by 1000 and rendered with ``%d`` (fractional
    part dropped).  Below, ``a`` is the AUC of the pooled predictions and
    ``b`` is the mean of the per-fold AUCs.
    """
    per_mille = (1000*a, 1000*b)
    return '%d E%d' % per_mille

for f in [
            'max_depth10:gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9:gen-8_medianwindow-timecorr-usf-w60-0.1-0.5-0.9',
        ]:
    all_ytest = all_y_proba =None
    all_aucs = []
    with open('../data-cache/140924-CV-combine2.%s.pkl'%f,'rb') as fp:
        target2iter2ys = pickle.load(fp)
    for target, iter2ys in target2iter2ys.iteritems():
        target_ytest = target_y_proba =None
        target_aucs = []
        print target,
        for ys in iter2ys.itervalues():
            ytest = y_proba =None
            aucs = []
            for y in ys:
                yt, yp = y
                ytest = yt if ytest is None else np.concatenate((ytest,yt))
                y_proba = yp if y_proba is None else np.concatenate((y_proba,yp))
                aucs.append(roc_auc_score(yt, yp))
            print p(roc_auc_score(ytest, y_proba), np.mean(aucs)),
            target_aucs += aucs
            target_ytest = ytest if target_ytest is None else np.concatenate((target_ytest,ytest))
            target_y_proba = y_proba if target_y_proba is None else np.concatenate((target_y_proba,y_proba))
        print target,p(roc_auc_score(target_ytest, target_y_proba),np.mean(target_aucs))
        all_aucs += target_aucs        
        all_ytest = target_ytest if all_ytest is None else np.concatenate((all_ytest,target_ytest))
        all_y_proba = target_y_proba if all_y_proba is None else np.concatenate((all_y_proba,target_y_proba))
#         if target == 'Dog_3':
#             pl.hist(target_aucs,alpha=0.5)
    print f,p(roc_auc_score(all_ytest, all_y_proba),np.mean(all_aucs))
    print


Dog_2 918 E945 933 E961 913 E939 Dog_2 921 E948
Dog_3 761 E792 733 E745 741 E757 Dog_3 745 E765
Dog_1 618 E612 615 E600 584 E570 Dog_1 606 E594
Dog_4 578 E587 598 E609 609 E629 Dog_4 595 E608
Dog_5 925 E936 920 E915 940 E938 Dog_5 928 E930
Patient_2 926 E947 943 E928 937 E960 Patient_2 934 E945
Patient_1 793 E807 910 E906 937 E940 Patient_1 877 E884
max_depth10:gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9:gen-8_medianwindow-timecorr-usf-w60-0.1-0.5-0.9 765 E758


In [13]: