Manually run this notebooks on all targets
In [97]:
target = 'Patient_2'
In [98]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import re
import math
import sys
import random
from collections import defaultdict
In [99]:
import sys
sys.path.append('..')
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')
def read_data(target, data_type, features):
return cached_data_loader.load('data_%s_%s_%s'%(data_type,target,features),None)
In [100]:
FEATURES = 'gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'
each target receive a different model
positive examples. The positive examles were upsampled (using gen_ictal=-8
)
In [101]:
pdata = read_data(target, 'preictal', FEATURES)
Np, NF = pdata.X.shape
Np, NF
Out[101]:
negative examples
In [102]:
ndata = read_data(target, 'interictal', FEATURES)
Nn, NF = ndata.X.shape
Nn, NF
Out[102]:
data is broken into segments that should be taken together when splitting to training and validation
In [103]:
def getsegments(pdata):
segments = []
start = 0
last_l = 0
for i,l in enumerate(pdata.latencies):
if l<last_l:
segments.append(range(start,i))
start = i
last_l = l
segments.append(range(start,i+1))
return segments
get positive and negative segments
In [104]:
psegments = getsegments(pdata)
Nps = len(psegments)
nsegments = getsegments(ndata)
Nns = len(nsegments)
Nps, Nns
Out[104]:
it looks like there is much more negative segments than positive, but keep in mind that with the gen-8_...
option there are about 8 positive examples generated for every negative. So the number of examples per segment is about the same:
In [105]:
(Np/8.)/Nps,Nn/Nns
Out[105]:
statistics
In [106]:
npratio = float(Nn)/Np # ration of number of negative to positive examples
pratio = 1/(1+npratio) # ratio of positive examples from all
print target,pratio,Np,Nn
npsratio = float(Nns)/Nps # ration of number of negative to positive segments
psratio = 1/(1+npsratio) # ratio of positie segment to all
print target,psratio,Nps,Nns
Ntrainps = 1
Ntrainns = int(Ntrainps*npsratio)
print 'For every one positive segment, we need %d negative segments'%Ntrainns
combine positive and negative examples
In [107]:
X = np.concatenate((pdata.X, ndata.X))
y = np.zeros(X.shape[0])
y[:pdata.X.shape[0]] = 1
nsegments = [[s+Np for s in ns] for ns in nsegments] # the location of the negative segments in the combined data
In [108]:
y.mean()
Out[108]:
In [109]:
latencies = np.concatenate((pdata.latencies,ndata.latencies))
In [110]:
N, NF = X.shape
N, NF
Out[110]:
The data itself is identical so there is no need to duplicate it for each process and we will use shared memory (shmem)
In [111]:
!rm ../data-cache/X.mmap
mmX = np.memmap('../data-cache/X.mmap', shape=X.shape, dtype=np.float32, mode='w+')
mmX[:,:] = X
del mmX # flush to disk
We will use ipython parallel processing infrastructure. Visit Clusters tab in the Home page of ipython and start 8 engines (or as many cores you have on your machine) from the default profile
OR you can run the command line:
ipcluster start --n=8
wait a little bit (otherwise you will get an error on next cell)
In [112]:
#!sleep 30
In [113]:
from IPython.parallel import Client
client = Client()
lv = client.load_balanced_view()
#lv.set_flags(block = False, retries = 0)
clients=client[:]
Ncores = len(clients)
Ncores
Out[113]:
copy some information to all engines
In [114]:
clients.scatter('engineid', range(Ncores))
Out[114]:
In [115]:
clients['X_shape'] = X.shape
clients['latencies'] = latencies
clients['y'] = y
clients['psegments'] = psegments
clients['nsegments'] = nsegments
clients['target'] = target
load the shared memory on all engines
In [116]:
%%px
import numpy as np
N, NF = X_shape
X = np.memmap('../data-cache/X.mmap', shape=X_shape, dtype=np.float32, mode='r')
Split data to training and validation. Randomly pick one positive segment for validation and a matching number of negative segments
In [117]:
%%px
import random, itertools
def random_train_validation_split(psegments=psegments, nsegments=nsegments, N=N, pratio=1, select=-1):
"""
N - total number of samples.
pratio - 0: no validation, 1: use all training data, 0<pratio<1: bootstrap from all training data
select - the index of the positive segment we want to use for validation, if -1 then make a random selection
"""
if pratio == 0:
return range(N), []
Nps = len(psegments)
assert Nps > 1
Nns = len(nsegments)
assert Nns > 1
npsratio = float(Nns)/Nps
Ntrainps = 1
Ntrainns = min(max(1,int(Ntrainps*npsratio+0.5)), Nns-1) # make sure we have at least one segment in train and validate
if select < 0:
s = random.choice(psegments)
else:
s = psegments[select]
ns = random.sample(nsegments,Ntrainns) # select negative segments
n = list(itertools.chain(*ns)) # .ravel does not work - elements of nsegments are not of equal length
sample_validate = s + n
random.shuffle(sample_validate)
all_p = list(itertools.chain(*psegments))
all_n = list(itertools.chain(*nsegments))
testp = list(set(all_p) - set(s))
if pratio != 1:
# testp *= pratio
ntestp = len(testp)
boot_ntestp = int(ntestp*pratio)
w = np.ones(ntestp)/float(ntestp)
testp = [testp[i] for i, n in enumerate(np.random.multinomial(boot_ntestp, w))
for k in xrange(n)]
testn = list(set(all_n) - set(n))
sample_test = testp + testn
random.shuffle(sample_test)
return sample_test, sample_validate
In [118]:
%%px
from sklearn.ensemble import RandomForestClassifier
def predict(select):
sample_train, sample_validate = random_train_validation_split(pratio=1, select=select)
X_trn = X[sample_train,:]
y_trn = y[sample_train]
latencies_trn = latencies[sample_train]
X_val = X[sample_validate,:]
y_val = y[sample_validate]
latencies_val = latencies[sample_validate]
est = RandomForestClassifier(n_estimators=3000, min_samples_split=1,
bootstrap=False,#max_features='log2',
max_depth=10,n_jobs=2)
est.fit(X_trn, y_trn)
y_val_est = est.predict_proba(X_val)[:,1]
return (sample_validate,latencies_val,y_val,y_val_est)
In [119]:
def work(select):
prediction = predict(select)
from lockfile import LockFile
import cPickle as pickle
lock = LockFile('.lock')
with lock:
with open('../data-cache/validate-%s.spkl'%target,'ab') as fp:
# in a later stage we will use the OOB samples to make predictions on all samples
# so keep a record of the index of each OOB sample
pickle.dump(prediction, fp, -1)
return select
In [120]:
!rm ../data-cache/validate-{target}.spkl
In [121]:
results = lv.map(work, range(Nps)*8)
In [122]:
import IPython
itr = results.__iter__()
while True:
try:
r = itr.next()
except StopIteration:
print 'stopped'
break
except IPython.parallel.error.RemoteError as e:
print e
continue
except Exception as e:
print e.__class__
continue
print r
In [123]:
fp = open('../data-cache/validate-%s.spkl'%target,'rb')
count = 0
y_est = np.zeros(N)
y_true = np.zeros(N)
y_count = np.zeros(N)
vals = []
while True:
try:
sample_validate,latencies_val,y_val,y_val_est = pickle.load(fp)
except:
break
count += 1
y_est[sample_validate] += y_val_est
y_true[sample_validate] += y_val
y_count[sample_validate] += 1
y_est /= y_count
y_true /= y_count
In [124]:
assert np.all(y_count > 0)
In [125]:
y_true.mean(),y_est.mean()
Out[125]:
In [126]:
y_true.shape, y_est.shape
Out[126]:
In [127]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true, y_est)
Out[127]:
In [127]: