In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random
from collections import defaultdict

In [2]:
import sys
sys.path.append('..')

Read precomputed features

uncommoent the relevant pipeline in ../seizure_detection.py and run

cd ..
./doall data

In [3]:
FEATURES = 'gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'

In [4]:
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')

In [5]:
def read_data(target, data_type):
    return cached_data_loader.load('data_%s_%s_%s'%(data_type,target,FEATURES),None)

Predict


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression as LR

clf = RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False,max_depth=5,
                             n_jobs=-1) #

In [9]:
with_weights = False
suffix = 'fft.'

Compute AUC for each target separatly


In [10]:
target2iter2ys = {}
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
    # positive examples
    pdata = read_data(target, 'preictal')
    Np, NF = pdata.X.shape
    t = NF//3
    assert 3*t == NF
    mask = range(t-5,t) + range(2*t-5,2*t) + range(3*t-5,3*t)
    
    # split the positive examples into segments, each from the same event
    # in each CV-split we will take all examples from the same segment to either train or validate
    segments = []
    start = 0
    last_l = 0
    for i,l in enumerate(pdata.latencies):
        if l<last_l:
            segments.append(np.arange(start,i))
            start = i
        last_l = l
    segments.append(np.arange(start,i+1))
    Ns = len(segments)

    # negative examples
    ndata = read_data(target, 'interictal')
    Nn = ndata.X.shape[0]
    
    npratio = float(Nn)/Np
    print target,1/(1+npratio),Ns,Np,Nn

    iter2ys = defaultdict(list) # {niter: Ns *[(ytest,y_proba)]
    for s in segments:
        for niter in range(3):
            # each time, take one segment for testing and randomly pick negative examples
            Xtestp = pdata.X[s,:]
            weightstest = pdata.latencies[s] # latency for first segment is 1

            Ntrainp = len(s)
            Ntrainn = int(Ntrainp*npratio)
            n = np.array(random.sample(xrange(Nn),Ntrainn))
            Xtestn = ndata.X[n,:]

            Xtrainp = pdata.X[-s,:]
            weightsp = pdata.latencies[-s] # latency for first segment is 1
            Xtrainn = ndata.X[-n,:]
            weightsn = np.ones(Xtrainn.shape[0]) 

            Xtrain = np.concatenate((Xtrainp,Xtrainn))
            weights = np.concatenate((weightsp,weightsn))
            ytrain = np.concatenate((np.ones(Ntrainp),np.zeros(Ntrainn)))
            perm = np.random.permutation(len(ytrain))
            ytrain = ytrain[perm]
            Xtrain = Xtrain[perm,:]
            weights = weights[perm]

            Xtest = np.concatenate((Xtestp,Xtestn))
            ytest = np.concatenate((np.ones(Xtestp.shape[0]),np.zeros(Xtestn.shape[0])))

            if with_weights:
                clf.fit(Xtrain[:,mask], ytrain, sample_weight=weights)
            else:
                clf.fit(Xtrain[:,mask], ytrain)

            y_proba = clf.predict_proba(Xtest[:,mask])[:,1]
            iter2ys[niter].append((ytest, y_proba))
            
            auc = roc_auc_score(ytest, y_proba)
            print '%.3f'%auc,Ntrainp,np.mean(weightstest)
    target2iter2ys[target] = iter2ys
    print


Dog_1 0.277108433735 4 184 480
0.724 46 3.5
0.772 46 3.5
0.731 46 3.5
0.350 46 3.5
0.283 46 3.5
0.311 46 3.5
0.510 46 3.5
0.511 46 3.5
0.468 46 3.5
0.720 46 3.5
0.666 46 3.5
0.741 46 3.5

Dog_2 0.391727493917 7 322 500
0.940 46 3.5
0.936 46 3.5
0.868 46 3.5
0.583 46 3.5
0.364 46 3.5
0.478 46 3.5
0.910 46 3.5
0.855 46 3.5
0.833 46 3.5
1.000 46 3.5
0.999 46 3.5
1.000 46 3.5
0.809 46 3.5
0.945 46 3.5
0.931 46 3.5
0.942 46 3.5
0.948 46 3.5
0.852 46 3.5
0.645 46 3.5
0.690 46 3.5
0.699 46 3.5

Dog_3 0.277108433735 12 552 1440
0.884 46 3.5
0.908 46 3.5
0.965 46 3.5
0.903 46 3.5
0.911 46 3.5
0.926 46 3.5
0.882 46 3.5
0.834 46 3.5
0.874 46 3.5
0.720 46 3.5
0.675 46 3.5
0.754 46 3.5
0.368 46 3.5
0.397 46 3.5
0.364 46 3.5
0.993 46 3.5
0.995 46 3.5
0.987 46 3.5
1.000 46 3.5
0.997 46 3.5
1.000 46 3.5
0.944 46 3.5
0.804 46 3.5
0.842 46 3.5
0.504 46 3.5
0.559 46 3.5
0.603 46 3.5
0.666 46 3.5
0.753 46 3.5
0.826 46 3.5
0.857 46 3.5
0.853 46 3.5
0.870 46 3.5
0.669 46 3.5
0.630 46 3.5
0.664 46 3.5

Dog_4 0.478260869565 17 737 804
0.950 46 3.5
0.969 46 3.5
0.979 46 3.5
0.587 46 3.5
0.478 46 3.5
0.519 46 3.5
0.696 46 3.5
0.840 46 3.5
0.490 46 3.5
0.217 37 4.0
0.289 37 4.0
0.512 37 4.0
0.403 19 5.0
0.303 19 5.0
0.311 19 5.0
0.946 46 3.5
0.941 46 3.5
0.915 46 3.5
0.718 46 3.5
0.618 46 3.5
0.612 46 3.5
0.757 37 4.0
0.733 37 4.0
0.691 37 4.0
0.325 46 3.5
0.405 46 3.5
0.312 46 3.5
0.536 46 3.5
0.588 46 3.5
0.697 46 3.5
0.690 46 3.5
0.618 46 3.5
0.680 46 3.5
0.812 46 3.5
0.717 46 3.5
0.766 46 3.5
0.908 46 3.5
0.928 46 3.5
0.886 46 3.5
0.343 46 3.5
0.505 46 3.5
0.415 46 3.5
0.628 46 3.5
0.443 46 3.5
0.445 46 3.5
0.531 46 3.5
0.492 46 3.5
0.366 46 3.5
0.723 46 3.5
0.769 46 3.5
0.861 46 3.5

Dog_5 0.338235294118 5 230 450
0.876 46 3.5
0.810 46 3.5
0.849 46 3.5
0.949 46 3.5
0.907 46 3.5
0.939 46 3.5
1.000 46 3.5
1.000 46 3.5
0.998 46 3.5
0.937 46 3.5
0.943 46 3.5
0.919 46 3.5
0.868 46 3.5
0.843 46 3.5
0.716 46 3.5

Patient_1 0.734042553191 3 138 50
0.686 46 3.5
0.803 46 3.5
0.537 46 3.5
0.997 46 3.5
0.988 46 3.5
0.999 46 3.5
0.789 46 3.5
0.815 46 3.5
0.938 46 3.5

Patient_2 0.766666666667 3 138 42
0.932 46 3.5
0.731 46 3.5
0.742 46 3.5
1.000 46 3.5
1.000 46 3.5
0.970 46 3.5
0.743 46 3.5
0.745 46 3.5
0.731 46 3.5


In [11]:
fname = '../data-cache/140907-CV.%s%s.pkl'%(suffix, FEATURES)
with open(fname,'wb') as fp:
    pickle.dump(target2iter2ys,fp,-1)

In [12]:
fname


Out[12]:
'../data-cache/140907-CV.fft.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9.pkl'

Generate a single AUC score


In [23]:
def p(a,b):
    return '%d E%d'%(1000*a,1000*b)

for f in [
            'max_depth2.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'max_depth5.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'max_depth5.p15.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'max_depth5.p5.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'max_depth5.p59.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'max_depth10.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9',
            'n3.gen8_medianwindow1-fft-with-time-freq-corr-1-48-r400-usf-w600',
#             'n3.1.gen8_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600',
#             'n3.w.gen8_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600',
#             'n3.w.gen16_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600',
        ]:
    all_ytest = all_y_proba =None
    all_aucs = []
    with open('../data-cache/140907-CV.%s.pkl'%f,'rb') as fp:
        target2iter2ys = pickle.load(fp)
    for target, iter2ys in target2iter2ys.iteritems():
        target_ytest = target_y_proba =None
        target_aucs = []
        print target,
        for ys in iter2ys.itervalues():
            ytest = y_proba =None
            aucs = []
            for y in ys:
                yt, yp = y
                ytest = yt if ytest is None else np.concatenate((ytest,yt))
                y_proba = yp if y_proba is None else np.concatenate((y_proba,yp))
                aucs.append(roc_auc_score(yt, yp))
            print p(roc_auc_score(ytest, y_proba), np.mean(aucs)),
            target_aucs += aucs
            target_ytest = ytest if target_ytest is None else np.concatenate((target_ytest,ytest))
            target_y_proba = y_proba if target_y_proba is None else np.concatenate((target_y_proba,y_proba))
        print target,p(roc_auc_score(target_ytest, target_y_proba),np.mean(target_aucs))
        all_aucs += target_aucs        
        all_ytest = target_ytest if all_ytest is None else np.concatenate((all_ytest,target_ytest))
        all_y_proba = target_y_proba if all_y_proba is None else np.concatenate((all_y_proba,target_y_proba))
#         if target == 'Dog_3':
#             pl.hist(target_aucs,alpha=0.5)
    print f,p(roc_auc_score(all_ytest, all_y_proba),np.mean(all_aucs))
    print


Dog_2 848 E895 826 E854 850 E875 Dog_2 842 E875
Dog_3 733 E758 710 E736 736 E776 Dog_3 727 E757
Dog_1 489 E495 468 E462 501 E503 Dog_1 486 E487
Dog_4 574 E609 577 E574 562 E592 Dog_4 571 E591
Dog_5 826 E873 823 E859 837 E889 Dog_5 830 E874
Patient_2 869 E920 923 E939 908 E920 Patient_2 903 E927
Patient_1 823 E764 854 E808 769 E772 Patient_1 813 E781
max_depth2.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 709 E720

Dog_2 894 E925 892 E925 913 E945 Dog_2 899 E931
Dog_3 775 E786 787 E799 777 E780 Dog_3 780 E788
Dog_1 613 E615 585 E578 629 E624 Dog_1 609 E606
Dog_4 640 E653 578 E587 599 E613 Dog_4 605 E618
Dog_5 937 E932 943 E942 958 E962 Dog_5 946 E945
Patient_2 923 E954 916 E973 925 E901 Patient_2 918 E942
Patient_1 915 E881 833 E764 863 E860 Patient_1 871 E835
max_depth5.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 776 E764

Dog_2 877 E909 888 E926 913 E938 Dog_2 892 E924
Dog_3 775 E775 774 E775 788 E804 Dog_3 779 E784
Dog_1 562 E561 575 E567 568 E558 Dog_1 568 E562
Dog_4 578 E588 574 E580 578 E567 Dog_4 577 E578
Dog_5 930 E933 936 E925 937 E944 Dog_5 933 E934
Patient_2 956 E996 765 E763 848 E814 Patient_2 860 E858
Patient_1 855 E821 889 E872 840 E796 Patient_1 858 E829
max_depth5.p15.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 759 E739

Dog_2 890 E930 898 E937 900 E924 Dog_2 896 E930
Dog_3 782 E799 773 E782 793 E811 Dog_3 783 E797
Dog_1 585 E584 549 E546 611 E627 Dog_1 581 E586
Dog_4 580 E566 596 E595 606 E632 Dog_4 595 E598
Dog_5 920 E935 906 E917 933 E944 Dog_5 920 E932
Patient_2 892 E866 951 E944 857 E900 Patient_2 901 E903
Patient_1 889 E904 890 E860 833 E778 Patient_1 873 E847
max_depth5.p5.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 765 E755

Dog_2 916 E942 895 E919 893 E912 Dog_2 901 E924
Dog_3 787 E806 770 E771 769 E781 Dog_3 776 E786
Dog_1 633 E647 578 E582 629 E638 Dog_1 613 E622
Dog_4 565 E590 576 E571 607 E615 Dog_4 582 E592
Dog_5 942 E943 921 E923 944 E954 Dog_5 936 E940
Patient_2 850 E870 867 E922 937 E943 Patient_2 887 E911
Patient_1 837 E781 894 E857 871 E881 Patient_1 866 E840
max_depth5.p59.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 768 E753

Dog_2 894 E956 903 E928 875 E916 Dog_2 890 E934
Dog_3 792 E801 772 E790 784 E795 Dog_3 784 E795
Dog_1 606 E601 624 E610 624 E605 Dog_1 618 E605
Dog_4 600 E619 572 E574 576 E583 Dog_4 583 E592
Dog_5 948 E947 945 E959 939 E938 Dog_5 944 E948
Patient_2 881 E849 888 E842 894 E899 Patient_2 886 E864
Patient_1 759 E650 921 E938 887 E896 Patient_1 861 E828
max_depth10.gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 770 E752

Dog_2 906 E940 875 E913 901 E929 Dog_2 894 E927
Dog_3 774 E788 752 E768 777 E790 Dog_3 768 E782
Dog_1 608 E591 637 E638 632 E631 Dog_1 626 E620
Dog_4 629 E648 586 E587 578 E583 Dog_4 599 E606
Dog_5 961 E962 947 E947 958 E962 Dog_5 955 E957
Patient_2 878 E893 835 E922 876 E919 Patient_2 865 E911
Patient_1 783 E805 715 E750 912 E911 Patient_1 810 E822
gen-8_medianwindow-bands-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9 772 E758

Dog_2 912 E964 875 E924 880 E925 Dog_2 889 E937
Dog_3 715 E724 731 E735 750 E751 Dog_3 732 E737
Dog_1 638 E626 626 E633 643 E635 Dog_1 636 E632
Dog_4 620 E658 586 E590 591 E608 Dog_4 599 E619
Dog_5 940 E951 953 E960 934 E958 Dog_5 942 E956
Patient_2 949 E977 874 E822 923 E952 Patient_2 916 E917
Patient_1 881 E896 902 E901 915 E923 Patient_1 899 E907
n3.gen8_medianwindow1-fft-with-time-freq-corr-1-48-r400-usf-w600 763 E759


In [12]:
NF


Out[12]:
360

In [ ]: