Manually run this notebooks on all targets


In [97]:
target = 'Patient_2'

In [98]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import re
import math
import sys
import random
from collections import defaultdict

In [99]:
import sys
sys.path.append('..')
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')
def read_data(target, data_type, features):
    return cached_data_loader.load('data_%s_%s_%s'%(data_type,target,features),None)

Read data


In [100]:
FEATURES = 'gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'

each target receive a different model

positive examples. The positive examles were upsampled (using gen_ictal=-8)


In [101]:
pdata = read_data(target, 'preictal', FEATURES)
Np, NF = pdata.X.shape
Np, NF


Out[101]:
(138, 360)

negative examples


In [102]:
ndata = read_data(target, 'interictal', FEATURES)
Nn, NF = ndata.X.shape
Nn, NF


Out[102]:
(42, 360)

data is broken into segments that should be taken together when splitting to training and validation


In [103]:
def getsegments(pdata):
    segments = []
    start = 0
    last_l = 0
    for i,l in enumerate(pdata.latencies):
        if l<last_l:
            segments.append(range(start,i))
            start = i
        last_l = l
    segments.append(range(start,i+1))
    return segments

get positive and negative segments


In [104]:
psegments = getsegments(pdata)
Nps = len(psegments)
nsegments = getsegments(ndata)
Nns = len(nsegments)
Nps, Nns


Out[104]:
(3, 7)

it looks like there is much more negative segments than positive, but keep in mind that with the gen-8_... option there are about 8 positive examples generated for every negative. So the number of examples per segment is about the same:


In [105]:
(Np/8.)/Nps,Nn/Nns


Out[105]:
(5.75, 6)

statistics


In [106]:
npratio = float(Nn)/Np # ration of number of negative to positive examples
pratio = 1/(1+npratio) # ratio of positive examples from all
print target,pratio,Np,Nn
npsratio = float(Nns)/Nps # ration of number of negative to positive segments
psratio = 1/(1+npsratio) # ratio of positie segment to all
print target,psratio,Nps,Nns
Ntrainps = 1
Ntrainns = int(Ntrainps*npsratio)
print 'For every one positive segment, we need %d negative segments'%Ntrainns


Patient_2 0.766666666667 138 42
Patient_2 0.3 3 7
For every one positive segment, we need 2 negative segments

combine positive and negative examples


In [107]:
X = np.concatenate((pdata.X, ndata.X))
y = np.zeros(X.shape[0])
y[:pdata.X.shape[0]] = 1
nsegments = [[s+Np for s in ns] for ns in nsegments] # the location of the negative segments in the combined data

In [108]:
y.mean()


Out[108]:
0.76666666666666672

In [109]:
latencies = np.concatenate((pdata.latencies,ndata.latencies))

In [110]:
N, NF = X.shape
N, NF


Out[110]:
(180, 360)

Classification

shared memory

The data itself is identical so there is no need to duplicate it for each process and we will use shared memory (shmem)


In [111]:
!rm ../data-cache/X.mmap
mmX = np.memmap('../data-cache/X.mmap', shape=X.shape, dtype=np.float32, mode='w+')
mmX[:,:] = X
del mmX # flush to disk

parallel

We will use ipython parallel processing infrastructure. Visit Clusters tab in the Home page of ipython and start 8 engines (or as many cores you have on your machine) from the default profile

OR you can run the command line:

ipcluster start --n=8

wait a little bit (otherwise you will get an error on next cell)


In [112]:
#!sleep 30

In [113]:
from IPython.parallel import Client
client = Client()
lv = client.load_balanced_view()
#lv.set_flags(block = False, retries = 0)
clients=client[:]
Ncores = len(clients)
Ncores


Out[113]:
8

copy some information to all engines


In [114]:
clients.scatter('engineid', range(Ncores))


Out[114]:
<AsyncResult: finished>

In [115]:
clients['X_shape'] = X.shape
clients['latencies'] = latencies
clients['y'] = y
clients['psegments'] = psegments
clients['nsegments'] = nsegments
clients['target'] = target

load the shared memory on all engines


In [116]:
%%px
import numpy as np
N, NF = X_shape
X = np.memmap('../data-cache/X.mmap', shape=X_shape, dtype=np.float32, mode='r')

Split data to training and validation. Randomly pick one positive segment for validation and a matching number of negative segments


In [117]:
%%px
import random, itertools
def random_train_validation_split(psegments=psegments, nsegments=nsegments, N=N, pratio=1, select=-1):
    """
    N - total number of samples.
    pratio - 0: no validation, 1: use all training data, 0<pratio<1: bootstrap from all training data
    select - the index of the positive segment we want to use for validation, if -1 then make a random selection
    """
    if pratio == 0:
        return range(N), []
    Nps = len(psegments)
    assert Nps > 1
    Nns = len(nsegments)
    assert Nns > 1
    npsratio = float(Nns)/Nps
    Ntrainps = 1
    Ntrainns = min(max(1,int(Ntrainps*npsratio+0.5)), Nns-1) # make sure we have at least one segment in train and validate
    
    if select < 0:
        s = random.choice(psegments)
    else:
        s = psegments[select]
    ns = random.sample(nsegments,Ntrainns) # select negative segments
    n = list(itertools.chain(*ns)) # .ravel does not work - elements of nsegments are not of equal length
    sample_validate = s + n
    random.shuffle(sample_validate)
    
    
    all_p = list(itertools.chain(*psegments))
    all_n = list(itertools.chain(*nsegments))

    testp = list(set(all_p) - set(s))
    if pratio != 1:
#         testp *= pratio
        ntestp = len(testp)
        boot_ntestp = int(ntestp*pratio)
        w = np.ones(ntestp)/float(ntestp)
        testp = [testp[i] for i, n in enumerate(np.random.multinomial(boot_ntestp, w))
                 for k in xrange(n)]
        
    testn = list(set(all_n) - set(n))
    sample_test = testp + testn
    random.shuffle(sample_test)

    return sample_test, sample_validate

In [118]:
%%px
from sklearn.ensemble import RandomForestClassifier

def predict(select):
    sample_train, sample_validate = random_train_validation_split(pratio=1, select=select)    

    X_trn = X[sample_train,:]
    y_trn = y[sample_train]
    latencies_trn = latencies[sample_train]
    X_val = X[sample_validate,:]
    y_val = y[sample_validate]
    latencies_val = latencies[sample_validate]
    
    est = RandomForestClassifier(n_estimators=3000, min_samples_split=1,
                                 bootstrap=False,#max_features='log2',
                                 max_depth=10,n_jobs=2)
    est.fit(X_trn, y_trn)
    y_val_est = est.predict_proba(X_val)[:,1]
    return (sample_validate,latencies_val,y_val,y_val_est)

In [119]:
def work(select):
    prediction = predict(select)
    
    from lockfile import LockFile
    import cPickle as pickle

    lock = LockFile('.lock')
    with lock:
        with open('../data-cache/validate-%s.spkl'%target,'ab') as fp:
            # in a later stage we will use the OOB samples to make predictions on all samples
            # so keep a record of the index of each OOB sample
            pickle.dump(prediction, fp, -1)
    return select

In [120]:
!rm ../data-cache/validate-{target}.spkl

In [121]:
results = lv.map(work, range(Nps)*8)

In [122]:
import IPython
itr = results.__iter__()
while True:
    try:
        r = itr.next()
    except StopIteration:
        print 'stopped'
        break
    except IPython.parallel.error.RemoteError as e:
        print e
        continue
    except Exception as e:
        print e.__class__
        continue
    print r


0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
stopped

In [123]:
fp = open('../data-cache/validate-%s.spkl'%target,'rb')
count = 0
y_est = np.zeros(N)
y_true = np.zeros(N)
y_count = np.zeros(N)
vals = []
while True:
    try:
        sample_validate,latencies_val,y_val,y_val_est = pickle.load(fp)
    except:
        break
    count += 1
    y_est[sample_validate] += y_val_est
    y_true[sample_validate] += y_val
    y_count[sample_validate] += 1
y_est /= y_count
y_true /= y_count

In [124]:
assert np.all(y_count > 0)

In [125]:
y_true.mean(),y_est.mean()


Out[125]:
(0.76666666666666672, 0.24479941211052325)

In [126]:
y_true.shape, y_est.shape


Out[126]:
((180,), (180,))

In [127]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true, y_est)


Out[127]:
0.51708074534161486

In [127]: