With 140925-metainfo I found that the negative examples (interictal) also come in sequences. I fixed gen_ictal so that it also generates inter-sequence segments for the negative examples.


In [2]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random
from collections import defaultdict

In [3]:
import sys
sys.path.append('..')

Read precomputed features

Uncomment the relevant pipeline in ../seizure_detection.py and run

cd ..
./doall data

In [1]:
# Name of the precomputed feature set. It is interpolated into the cache key
# built by read_data() and into the filename of the saved CV results.
FEATURES = 'gen-8_medianwindow-bands2-usf-w10-hammingP2-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'

In [4]:
from common.data import CachedDataLoader
# Loader bound to the ../data-cache directory; read_data() calls its load() method.
cached_data_loader = CachedDataLoader('../data-cache')

In [5]:
def read_data(target, data_type):
    """Load the cached feature data for one target and one example class.

    target    -- subject name as used in the cache key, e.g. 'Dog_1'
    data_type -- example class as used in the cache key; callers in this
                 notebook pass 'preictal' or 'interictal'

    The cache key is 'data_<data_type>_<target>_<FEATURES>'. The second
    argument passed to load() is None.
    NOTE(review): presumably that argument is the function used to compute the
    data on a cache miss, so None would mean "must already be cached" -- confirm
    against common.data.CachedDataLoader.
    """
    cache_key = 'data_%s_%s_%s' % (data_type, target, FEATURES)
    return cached_data_loader.load(cache_key, None)

Predict


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression as LR

# Single classifier instance that is re-fitted on every CV split below.
# NOTE(review): min_samples_split=1 is believed to be rejected by recent
# scikit-learn releases (which require >= 2) -- confirm against the installed version.
clf = RandomForestClassifier(
    n_estimators=3000,
    max_depth=5,
    min_samples_split=1,
    bootstrap=False,
    n_jobs=-1,
)  # alternative setting that is currently disabled: min_samples_leaf=4

In [7]:
# When True, clf.fit() is called with per-example sample weights (see the CV loop).
with_weights = False
# Base weight assigned to every positive (preictal) training example.
PWEIGHT = 6.
# Multiplier for the (latency - 1) term added to each positive example's weight;
# 0 switches the latency-dependent part off.
LWEIGHT = 0.
# Tag placed in front of FEATURES in the filename of the saved CV results.
suffix = 'max_depth5'

Split the examples into segments, each coming from the same event. In each CV split we take all examples from the same segment for either training or validation.


In [8]:
def getsegments(pdata):
    """Split the examples of `pdata` into runs of consecutive row indices.

    A new run starts at every position whose latency is strictly smaller than
    the latency of the preceding example; equal or increasing latencies stay in
    the same run.

    pdata -- object with a `latencies` attribute: a 1-D sequence holding one
             number per example, in example order.

    Returns a 1-D object array whose elements are integer index arrays
    (np.arange(start, stop)), one per run, in order. The result is always a
    1-D object array, also when all runs happen to have the same length, so
    callers can iterate over it or fancy-index it with run numbers regardless
    of the run lengths. Empty `latencies` gives an empty array instead of
    raising.
    """
    latencies = np.asarray(pdata.latencies)
    count = len(latencies)
    if count == 0:
        return np.empty(0, dtype=object)

    # Positions i >= 1 where the latency drops relative to position i-1.
    starts = np.flatnonzero(latencies[1:] < latencies[:-1]) + 1
    bounds = np.concatenate(([0], starts, [count]))

    # Fill an object array element by element: np.array(list_of_arrays) would
    # build a 2-D array for equal-length runs and fail (or warn) for ragged ones.
    segments = np.empty(len(bounds) - 1, dtype=object)
    for k in range(len(segments)):
        segments[k] = np.arange(bounds[k], bounds[k + 1])
    return segments

Compute AUC for each target separately


In [9]:
import itertools

target2iter2ys = {}
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
    # positive examples
    pdata = read_data(target, 'preictal')
    Np, NF = pdata.X.shape
    
    psegments = getsegments(pdata)
    Nps = len(psegments)

    # negative examples
    ndata = read_data(target, 'interictal')
    Nn, NF = ndata.X.shape
    nsegments = getsegments(ndata)
    Nns = len(nsegments)
    
    npratio = float(Nn)/Np
    print target,1/(1+npratio),Np,Nn
    npsratio = float(Nns)/Nps
    print target,1/(1+npsratio),Nps,Nns
    Ntrainps = 1
    Ntrainns = int(Ntrainps*npsratio)

    iter2ys = defaultdict(list) # {niter: Ns *[(ytest,y_proba)]
    for s in psegments:
        Xtestp = pdata.X[s,:]
        weightstest = pdata.latencies[s] # latency for first segment is 1
        
        Ntrainp = len(s)
        Ntrainn = int(Ntrainp*npratio)
        
        for niter in range(3):

#             n = np.array(random.sample(xrange(Nn),Ntrainn)) # segment based
            ns = np.array(random.sample(xrange(Nns),Ntrainns)) # sequence based
            n = np.array(list(itertools.chain(*nsegments[ns]))) # .ravel does not work - elements of nsegments are not of equal length
            Xtestn = ndata.X[n,:]

            Xtrainp = pdata.X[-s,:]
            Xtrainn = ndata.X[-n,:]

            Xtrain = np.concatenate((Xtrainp,Xtrainn))
            ytrain = np.concatenate((np.ones(Xtrainp.shape[0]),np.zeros(Xtrainn.shape[0])))
            perm = np.random.permutation(len(ytrain))
            ytrain = ytrain[perm]
            Xtrain = Xtrain[perm,:]

            Xtest = np.concatenate((Xtestp,Xtestn))
            ytest = np.concatenate((np.ones(Xtestp.shape[0]),np.zeros(Xtestn.shape[0])))

            if with_weights:
                weightsp = PWEIGHT*np.ones(Xtrainp.shape[0])
                weightsp += LWEIGHT * (pdata.latencies[-s]-1.) # latency for first segment is 1
                weightsn = np.ones(Xtrainn.shape[0]) 
                weights = np.concatenate((weightsp,weightsn))
                weights = weights[perm]
                clf.fit(Xtrain, ytrain, sample_weight=weights)
            else:
                clf.fit(Xtrain, ytrain)

            y_proba = clf.predict_proba(Xtest)[:,1]
            iter2ys[niter].append((ytest, y_proba))
            
            auc = roc_auc_score(ytest, y_proba)
            print '%.3f'%auc,Ntrainp,np.mean(weightstest)
    target2iter2ys[target] = iter2ys
    print


Dog_1 0.277108433735 184 480
Dog_1 0.047619047619 4 80
0.805 46 3.5
0.874 46 3.5
0.835 46 3.5
0.403 46 3.5
0.458 46 3.5
0.515 46 3.5
0.581 46 3.5
0.447 46 3.5
0.755 46 3.5
0.674 46 3.5
0.713 46 3.5
0.450 46 3.5

Dog_2 0.391727493917 322 500
Dog_2 0.0769230769231 7 84
0.976 46 3.5
0.996 46 3.5
0.984 46 3.5
0.955 46 3.5
0.982 46 3.5
0.997 46 3.5
0.945 46 3.5
0.930 46 3.5
0.911 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.972 46 3.5
0.959 46 3.5
0.951 46 3.5
1.000 46 3.5
0.993 46 3.5
1.000 46 3.5
0.605 46 3.5
0.832 46 3.5
0.617 46 3.5

Dog_3 0.277108433735 552 1440
Dog_3 0.047619047619 12 240
0.848 46 3.5
0.880 46 3.5
0.873 46 3.5
0.630 46 3.5
0.770 46 3.5
0.791 46 3.5
0.858 46 3.5
0.893 46 3.5
0.976 46 3.5
0.670 46 3.5
0.653 46 3.5
0.797 46 3.5
0.807 46 3.5
0.705 46 3.5
0.563 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.999 46 3.5
0.872 46 3.5
0.984 46 3.5
0.963 46 3.5
0.601 46 3.5
0.493 46 3.5
0.566 46 3.5
0.604 46 3.5
0.522 46 3.5
0.548 46 3.5
0.753 46 3.5
0.620 46 3.5
0.597 46 3.5
0.456 46 3.5
0.609 46 3.5
0.532 46 3.5

Dog_4 0.478260869565 737 804
Dog_4 0.112582781457 17 134
0.922 46 3.5
0.949 46 3.5
0.964 46 3.5
0.571 46 3.5
0.600 46 3.5
0.274 46 3.5
0.568 46 3.5
0.651 46 3.5
0.754 46 3.5
0.576 37 4.0
0.751 37 4.0
0.674 37 4.0
0.623 19 5.0
0.573 19 5.0
0.654 19 5.0
0.862 46 3.5
0.672 46 3.5
0.980 46 3.5
0.716 46 3.5
0.616 46 3.5
0.492 46 3.5
0.764 37 4.0
0.810 37 4.0
0.654 37 4.0
0.240 46 3.5
0.542 46 3.5
0.152 46 3.5
0.419 46 3.5
0.380 46 3.5
0.508 46 3.5
0.502 46 3.5
0.466 46 3.5
0.193 46 3.5
0.947 46 3.5
0.953 46 3.5
0.731 46 3.5
0.452 46 3.5
0.895 46 3.5
0.274 46 3.5
0.621 46 3.5
0.333 46 3.5
0.682 46 3.5
0.811 46 3.5
0.536 46 3.5
0.457 46 3.5
0.320 46 3.5
0.298 46 3.5
0.380 46 3.5
0.292 46 3.5
0.501 46 3.5
0.552 46 3.5

Dog_5 0.338235294118 230 450
Dog_5 0.0625 5 75
0.839 46 3.5
0.965 46 3.5
0.843 46 3.5
0.973 46 3.5
0.914 46 3.5
0.961 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.995 46 3.5
0.907 46 3.5
0.853 46 3.5
0.820 46 3.5

Patient_1 0.734042553191 138 50
Patient_1 0.25 3 9
0.149 46 3.5
0.565 46 3.5
0.419 46 3.5
1.000 46 3.5
1.000 46 3.5
1.000 46 3.5
0.784 46 3.5
0.731 46 3.5
0.709 46 3.5

Patient_2 0.766666666667 138 42
Patient_2 0.3 3 7
1.000 46 3.5
1.000 46 3.5
0.946 46 3.5
0.996 46 3.5
0.995 46 3.5
0.995 46 3.5
0.808 46 3.5
0.547 46 3.5
1.000 46 3.5


In [10]:
# Write the collected CV predictions to the data cache; the filename is built
# from the run tag (`suffix`) immediately followed by the feature-set name.
fname = '../data-cache/140926-CV.%s%s.pkl' % (suffix, FEATURES)
with open(fname, 'wb') as fp:
    # highest available pickle protocol (what a negative protocol number selects)
    pickle.dump(target2iter2ys, fp, pickle.HIGHEST_PROTOCOL)

In [11]:
# Echo the path of the CV-results pickle.
fname


Out[11]:
'../data-cache/140926-CV.max_depth5gen-8_medianwindow-bands2-usf-w10-hammingP2-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9.pkl'

Generate a single AUC score


In [12]:
def p(a,b):
    """Format two scores as '<a> E<b>', each multiplied by 1000.

    The scaled values are rendered with %d, i.e. as integers with the
    fractional part dropped. In the summary below `a` is the AUC of the pooled
    predictions and `b` the mean of the per-split AUCs.
    """
    scaled = (1000*a, 1000*b)
    return '%d E%d' % scaled

for f in [
            'max_depth5gen-8_medianwindow-bands2-usf-w10-hammingP2-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'max_depth10gen-8_medianwindow-bandsI2-usf-w60-b0.2-b4-b8-b12-b30-b50-b75-b100-b117-0.1-0.5-0.9',
            'max_depth5gen-8_medianwindow-bandsI2-usf-w60-b0.2-b4-b8-b12-b30-b50-b75-b100-b117-0.1-0.5-0.9',
            'max_depth5.min_samples_leaf4.gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'max_depth5.gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
        ]:
    all_ytest = all_y_proba =None
    all_aucs = []
    with open('../data-cache/140926-CV.%s.pkl'%f,'rb') as fp:
        target2iter2ys = pickle.load(fp)
    for target, iter2ys in target2iter2ys.iteritems():
        target_ytest = target_y_proba =None
        target_aucs = []
        print target,
        for ys in iter2ys.itervalues():
            ytest = y_proba =None
            aucs = []
            for y in ys:
                yt, yp = y
                ytest = yt if ytest is None else np.concatenate((ytest,yt))
                y_proba = yp if y_proba is None else np.concatenate((y_proba,yp))
                aucs.append(roc_auc_score(yt, yp))
            print p(roc_auc_score(ytest, y_proba), np.mean(aucs)),
            target_aucs += aucs
            target_ytest = ytest if target_ytest is None else np.concatenate((target_ytest,ytest))
            target_y_proba = y_proba if target_y_proba is None else np.concatenate((target_y_proba,y_proba))
        print target,p(roc_auc_score(target_ytest, target_y_proba),np.mean(target_aucs))
        all_aucs += target_aucs        
        all_ytest = target_ytest if all_ytest is None else np.concatenate((all_ytest,target_ytest))
        all_y_proba = target_y_proba if all_y_proba is None else np.concatenate((all_y_proba,target_y_proba))
#         if target == 'Dog_3':
#             pl.hist(target_aucs,alpha=0.5)
    print f,p(roc_auc_score(all_ytest, all_y_proba),np.mean(all_aucs))
    print


Dog_2 887 E921 919 E955 895 E922 Dog_2 899 E933
Dog_3 733 E758 738 E760 738 E767 Dog_3 737 E762
Dog_1 612 E615 598 E622 618 E638 Dog_1 611 E625
Dog_4 592 E600 599 E619 537 E551 Dog_4 577 E590
Dog_5 939 E943 940 E946 913 E923 Dog_5 931 E937
Patient_2 894 E934 840 E847 914 E980 Patient_2 884 E920
Patient_1 705 E644 778 E765 757 E709 Patient_1 747 E706
max_depth5gen-8_medianwindow-bands2-usf-w10-hammingP2-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 763 E740

Dog_2 885 E910 894 E927 888 E915 Dog_2 888 E917
Dog_3 744 E751 767 E773 722 E723 Dog_3 744 E749
Dog_1 621 E585 706 E711 672 E663 Dog_1 667 E653
Dog_4 516 E502 581 E590 533 E563 Dog_4 545 E552
Dog_5 929 E929 912 E916 946 E961 Dog_5 927 E935
Patient_2 754 E708 916 E883 806 E788 Patient_2 821 E793
Patient_1 709 E686 651 E604 850 E892 Patient_1 734 E727
max_depth10gen-8_medianwindow-bandsI2-usf-w60-b0.2-b4-b8-b12-b30-b50-b75-b100-b117-0.1-0.5-0.9 755 E718

Dog_2 872 E912 854 E889 864 E895 Dog_2 864 E898
Dog_3 790 E798 768 E777 742 E765 Dog_3 766 E780
Dog_1 676 E682 640 E651 589 E606 Dog_1 635 E646
Dog_4 591 E611 636 E684 531 E533 Dog_4 585 E609
Dog_5 961 E967 927 E918 946 E934 Dog_5 945 E940
Patient_2 968 E953 808 E972 784 E684 Patient_2 850 E870
Patient_1 637 E667 856 E857 907 E860 Patient_1 779 E795
max_depth5gen-8_medianwindow-bandsI2-usf-w60-b0.2-b4-b8-b12-b30-b50-b75-b100-b117-0.1-0.5-0.9 765 E751

Dog_2 861 E895 861 E909 879 E916 Dog_2 866 E907
Dog_3 716 E750 753 E756 738 E714 Dog_3 734 E740
Dog_1 617 E628 552 E562 563 E594 Dog_1 575 E595
Dog_4 617 E643 591 E605 540 E567 Dog_4 581 E605
Dog_5 905 E893 902 E923 957 E958 Dog_5 920 E925
Patient_2 868 E958 792 E727 676 E668 Patient_2 797 E784
Patient_1 722 E735 872 E862 706 E694 Patient_1 766 E764
max_depth5.min_samples_leaf4.gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 750 E729

Dog_2 889 E919 908 E954 875 E914 Dog_2 890 E929
Dog_3 794 E818 766 E787 771 E777 Dog_3 777 E794
Dog_1 605 E605 538 E524 603 E605 Dog_1 581 E578
Dog_4 595 E610 565 E569 556 E583 Dog_4 571 E587
Dog_5 937 E926 908 E901 947 E951 Dog_5 930 E926
Patient_2 991 E992 977 E949 858 E973 Patient_2 936 E972
Patient_1 819 E807 886 E866 695 E636 Patient_1 797 E770
max_depth5.gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 767 E749


In [ ]: