Manually run this notebook on all targets
In [ ]:
target = 'Dog_5'
In [ ]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import re
import math
import sys
import random
In [ ]:
import sys
sys.path.append('..')
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')
def read_data(target, data_type, features):
    return cached_data_loader.load('data_%s_%s_%s'%(data_type,target,features),None)
In [ ]:
FEATURES = 'gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'
each target receives a different model
positive examples. The positive examples were upsampled (using gen_ictal=-8).
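The upsampling itself happens in the feature-generation code and is only controlled here through gen_ictal=-8. As a purely illustrative sketch (not the project's actual code), this is the general overlapping-window idea behind such upsampling; the window and step sizes below are made up:
In [ ]:
# purely illustrative: cut extra, overlapping windows out of a segment to
# generate additional positive examples (the real mechanism lives in the
# project's feature-generation code, behind the gen_ictal switch)
import numpy as np

def overlapping_windows(segment, window, step):
    # segment is channels x time; yield channels x window slices
    for start in range(0, segment.shape[1] - window + 1, step):
        yield segment[:, start:start + window]

demo = np.random.randn(16, 4000)                                   # fake 16-channel segment
windows = list(overlapping_windows(demo, window=1000, step=500))   # 50% overlap
len(windows)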
In [ ]:
pdata = read_data(target, 'preictal', FEATURES)
Np, NF = pdata.X.shape
Np, NF
negative examples
In [ ]:
ndata = read_data(target, 'interictal', FEATURES)
Nn, NF = ndata.X.shape
Nn, NF
The data is broken into segments that should be kept together when splitting into training and validation sets.
In [ ]:
def getsegments(pdata):
    """Group row indices into segments: a new segment starts whenever the latency drops."""
    segments = []
    start = 0
    last_l = 0
    for i,l in enumerate(pdata.latencies):
        if l<last_l:
            segments.append(range(start,i))
            start = i
        last_l = l
    segments.append(range(start,i+1))
    return segments
In [ ]:
psegments = getsegments(pdata)
Nps = len(psegments)
nsegments = getsegments(ndata)
Nns = len(nsegments)
statistics
In [ ]:
npratio = float(Nn)/Np
print target,1/(1+npratio),Np,Nn
npsratio = float(Nns)/Nps
print target,1/(1+npsratio),Nps,Nns
Ntrainps = 1
Ntrainns = int(Ntrainps*npsratio)
Ntrainns
In [ ]:
X = np.concatenate((pdata.X, ndata.X))
y = np.zeros(X.shape[0])
y[:pdata.X.shape[0]] = 1
nsegments = [[s+Np for s in ns] for ns in nsegments]
In [ ]:
latencies = np.concatenate((pdata.latencies,ndata.latencies))
In [ ]:
N, NF = X.shape
N, NF
I am using RF because it needs very little hyper-parameter tuning. I deliberately use a small depth because I am not interested in the best prediction (which is already high) but in the feature importances after taking pair interactions into account.
In [ ]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, oob_score=True, max_depth=2)
rf.fit(X, y)
In [ ]:
rf.oob_score_
In [ ]:
pnweights = rf.feature_importances_
pn_importance_order = pnweights.argsort()[::-1]
pl.plot(rf.feature_importances_[pn_importance_order]);
We will use a Gradient Boosting Classifier, which usually gives better results than L1 or RF. In addition, like RF, it does not require normalization or PCA. However, unlike RF or L1 it has many hyper-parameters that can affect its performance. We also need to decide how many features to use, which is yet another hyper-parameter. Instead of guessing manually, we can use hyperopt to do the guessing for us.
We will perform several hyperopt searches in parallel, each running on a different bootstrap sample of the data.
The data itself is identical, so there is no need to duplicate it for each process; we will use shared memory (shmem).
In [ ]:
!rm ../data-cache/X.mmap
mmX = np.memmap('../data-cache/X.mmap', shape=X.shape, dtype=np.float32, mode='w+')
mmX[:,:] = X[:,pn_importance_order]
del mmX # flush to disk
We will use the IPython parallel processing infrastructure. Visit the Clusters tab on the IPython home page and start 8 engines (or as many cores as you have on your machine) from the default profile.
Or you can run from the command line:
ipcluster start --n=8
Wait a little bit (otherwise you will get an error in the next cell).
In [ ]:
#!sleep 30
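Instead of a fixed sleep, here is a small sketch (not part of the original run) that polls until the engines have registered; the engine count of 8 and the ~60-second limit are assumptions:
In [ ]:
# poll the controller until the expected number of engines is up
import time
from IPython.parallel import Client
for _ in range(30):                      # give up after ~60 seconds
    try:
        if len(Client().ids) >= 8:       # 8 = number of engines started above
            break
    except Exception:
        pass                             # controller/engines not ready yet
    time.sleep(2)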
In [ ]:
from IPython.parallel import Client
client = Client()
lv = client.load_balanced_view()
#lv.set_flags(block = False, retries = 0)
clients=client[:]
Ncores = len(clients)
Ncores
copy some information to all engines
In [ ]:
clients['X_shape'] = X.shape
clients['latencies'] = latencies
clients['y'] = y
clients['psegments'] = psegments
clients['nsegments'] = nsegments
load the shared memory on all engines
In [ ]:
%%px
import numpy as np
N, NF = X_shape
X = np.memmap('../data-cache/X.mmap', shape=X_shape, dtype=np.float32, mode='r')
Split the data into training and validation sets. Randomly pick one positive segment for validation and a matching number of negative segments.
In [ ]:
%%px
import random, itertools
def random_train_validation_split(psegments=psegments, nsegments=nsegments, N=N, pratio=1):
    """
    N - total number of samples.
    pratio - 0: no validation, 1: use all training data, 0<pratio<1: bootstrap from all training data
    """
    if pratio == 0:
        return range(N), []
    Nps = len(psegments)
    assert Nps > 1
    Nns = len(nsegments)
    assert Nns > 1
    npsratio = float(Nns)/Nps
    Ntrainps = 1
    Ntrainns = min(max(1,int(Ntrainps*npsratio+0.5)), Nns-1) # make sure we have at least one segment in train and validate
    s = random.choice(psegments)
    ns = random.sample(nsegments,Ntrainns) # sequence based
    n = list(itertools.chain(*ns)) # .ravel does not work - elements of nsegments are not of equal length
    sample_validate = s + n
    random.shuffle(sample_validate)

    all_p = list(itertools.chain(*psegments))
    all_n = list(itertools.chain(*nsegments))
    testp = list(set(all_p) - set(s))
    if pratio != 1:
        # testp *= pratio
        ntestp = len(testp)
        boot_ntestp = int(ntestp*pratio)
        w = np.ones(ntestp)/float(ntestp)
        testp = [testp[i] for i, n in enumerate(np.random.multinomial(boot_ntestp, w))
                 for k in xrange(n)]
    testn = list(set(all_n) - set(n))
    sample_test = testp + testn
    random.shuffle(sample_test)
    return sample_test, sample_validate
We will find the args that optimize the AUC for a given training and validation split.
In [ ]:
%%px
from sklearn.ensemble import RandomForestClassifier
def target(args, X_trn, y_trn, latencies_trn, X_val, y_val):
    est = RandomForestClassifier()
    params = dict((k,int(v) if isinstance(v, float) and abs(v - int(v)) < 1e-3 else v)
                  for k, v in args.iteritems() if k not in ['nf', 'pratio', 'alpha'])
    est.set_params(**params)
    nf = int(args['nf'])
    sample_weight = np.exp(args['alpha']*(6.-latencies_trn))
    est.fit(X_trn[:,:nf], y_trn, sample_weight)
    y_train = est.predict_proba(X_trn[:,:nf])[:,1]
    y_validate = est.predict_proba(X_val[:,:nf])[:,1]
    from sklearn.metrics import roc_auc_score
    train_score = roc_auc_score(y_trn, y_train)
    validate_score = roc_auc_score(y_val, y_validate)
    return train_score, validate_score
In [ ]:
%%px
def hyperopt_work(args):
    from lockfile import LockFile
    space = args.get('space')
    pratio = int(space['pratio'])
    sample_train, sample_validate = random_train_validation_split(pratio=pratio)
    X_trn = X[sample_train,:]
    y_trn = y[sample_train]
    latencies_trn = latencies[sample_train]
    assert y_trn.mean() > 0.01 and y_trn.mean() < 0.99
    X_val = X[sample_validate,:]
    y_val = y[sample_validate]

    def t_est(args):
        try:
            train_score, validate_score = target(args, X_trn, y_trn, latencies_trn, X_val, y_val)
            lock = LockFile('.lock')
            with lock:
                with open('../data-cache/hyperopt.txt','a') as fp:
                    print >>fp, validate_score, train_score, args
            from hyperopt import STATUS_OK
            return {'loss':1.-validate_score, 'status':STATUS_OK, 'train_score':train_score, 'validate_score':validate_score}
        except Exception as e:
            lock = LockFile('.lock')
            with lock:
                with open('../data-cache/hyperopt.txt','a') as fp:
                    print >>fp, 'failed', e, args
            from hyperopt import STATUS_FAIL
            return {'status':STATUS_FAIL, 'loss':1.} # 'loss' is mandatory

    max_evals = args.get('max_evals', 10)
    from hyperopt import fmin, tpe, Trials
    # trials = Trials()
    best = fmin(t_est, space, algo=tpe.suggest, max_evals=max_evals) #, trials=trials)
    # import cPickle as pickle
    # lock = LockFile('.lock')
    # with lock:
    #     with open('../data-cache/hyperopt.spkl','ab') as fp:
    #         pickle.dump(trials, fp, -1)
    return best
Define the search space in which we will do our hyper-parameter search.
In [ ]:
%%px
from hyperopt import hp
from math import log
space = {
    'pratio': 1,
    'nf': hp.quniform('nf', 10, NF, 1),
    'alpha': hp.loguniform('alpha', np.log(1e-5), np.log(1.)),
    'n_estimators': hp.qloguniform('n_estimators', np.log(50), np.log(4000), 1),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_depth': hp.choice('max_depth',
                           [None, hp.qlognormal('max_depth_int', np.log(5), 1, 1)]),
    'min_samples_split': hp.qloguniform('min_samples_split', np.log(1.), np.log(5.), 1),
    'min_samples_leaf': hp.qloguniform('min_samples_leaf', np.log(1.), np.log(5.), 1),
    'bootstrap': hp.choice('bootstrap', [False, True]),
}
In [ ]:
!rm ../data-cache/hyperopt.*
Run the hyperopt searches in parallel on all cores. Each search will do 100 evaluations of the hyper-parameters.
In [ ]:
%%px
hyperopt_work({'space':space, 'max_evals':100})
Wait for the jobs to end. This will take some time. Your computer can also get really hot, so use the time to arrange some cooling for it.
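If you prefer to keep the notebook responsive while waiting, here is a sketch of a non-blocking alternative to the previous cell (it assumes space and hyperopt_work are already defined on the engines, as above):
In [ ]:
# launch the same search without blocking, then poll for completion
import time
ar = clients.execute("best = hyperopt_work({'space':space, 'max_evals':100})", block=False)
while not ar.ready():
    time.sleep(60)        # check once a minute
print 'all hyperopt searches finished'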
In [ ]:
!sort -n -r ../data-cache/hyperopt.txt > ../data-cache/hyperopt.sort.txt
!head -n 5 ../data-cache/hyperopt.sort.txt
In [ ]:
fp = open('../data-cache/hyperopt.sort.txt')
hyperopt_results = []
for l in fp:
    if l.startswith('failed'):
        continue
    l = l.split()
    validate_score = l[0]
    train_score = l[1]
    args = eval(''.join(l[2:]))
    hyperopt_results.append((validate_score, train_score, args))
fp.close()
len(hyperopt_results)
In [ ]:
tdata = read_data(target, 'test', FEATURES)
Nt, NFF = tdata.X.shape
Nt, NFF
In [ ]:
!rm ../data-cache/Xt.mmap
Xt_shape = (tdata.X.shape[0], NF)
mmXt = np.memmap('../data-cache/Xt.mmap', shape=Xt_shape, dtype=np.float32, mode='w+')
mmXt[:,:] = tdata.X[:,pn_importance_order]
del mmXt # flush to disk
In [ ]:
clients['Xt_shape'] = Xt_shape
clients['target'] = target
In [ ]:
%%px
Xt = np.memmap('../data-cache/Xt.mmap', shape=Xt_shape, dtype=np.float32, mode='r')
In [ ]:
def predict_work(args):
    from lockfile import LockFile
    from sklearn.ensemble import GradientBoostingClassifier
    import cPickle as pickle
    N = X_shape[0]
    NF = int(args.get('nf', X_shape[1]))
    pratio = int(args.get('pratio',1))

    # use out-of-bag samples to estimate the generalization error
    sample_train, sample_validate = random_train_validation_split(pratio=0)
    X_trn = X[sample_train,:NF]
    y_trn = y[sample_train]
    latencies_trn = latencies[sample_train]
    if sample_validate:
        X_val = X[sample_validate,:NF]
        y_val = y[sample_validate]
    X_test = Xt[:,:NF]

    from sklearn.ensemble import RandomForestClassifier
    est = RandomForestClassifier()
    params = dict((k,int(v) if isinstance(v, float) and abs(v - int(v)) < 1e-3 else v)
                  for k, v in args.iteritems() if k not in ['nf', 'pratio', 'alpha'])
    est.set_params(**params)
    sample_weight = np.exp(args['alpha']*(6.-latencies_trn))
    try:
        est.fit(X_trn, y_trn, sample_weight)
        if sample_validate:
            y_val_est = est.predict_proba(X_val)[:,1]
        else:
            y_val_est = None
        y_test_est = est.predict_proba(X_test)[:,1]
        lock = LockFile('.lock')
        with lock:
            with open('../data-cache/validate.spkl','ab') as fp:
                # in a later stage we will use the OOB samples to make predictions on all samples
                # so keep a record of the index of each OOB sample
                pickle.dump((sample_validate,y_val_est), fp, -1)
            with open('../data-cache/%s_test.spkl'%target,'ab') as fp:
                pickle.dump(y_test_est, fp, -1)
    except Exception as e:
        return e

    from sklearn.metrics import roc_auc_score
    # the validation split differs between workers (their p_ratio will be different)
    # so this error measure is not completely accurate
    if sample_validate:
        return roc_auc_score(y_val, y_val_est)
    else:
        return 0
In [ ]:
!rm ../data-cache/validate.spkl
!rm ../data-cache/{target}_test.spkl
In [ ]:
args_list = []
for res in hyperopt_results:
    args = res[2]
    print args
    args_list.append(args)
    if len(args_list) >= 16:
        break
len(args_list)
In [ ]:
results = lv.map(predict_work, args_list)
In [ ]:
import IPython
itr = iter(results)
while True:
    try:
        r = itr.next()
    except StopIteration:
        print 'stopped'
        break
    except IPython.parallel.error.RemoteError as e:
        print e
        continue
    except Exception as e:
        print e.__class__
        continue
    # print r
In [ ]:
fp = open('../data-cache/validate.spkl','rb')
count = 0
y_est = np.zeros(N)
y_count = np.zeros(N)
vals = []
while True:
    try:
        sample_validate,y_val_est = pickle.load(fp)
    except:
        break
    if not sample_validate:
        break
    count += 1
    y_est[sample_validate] += y_val_est
    y_count[sample_validate] += 1
    idx = y_val_est.argsort()[::-1]
    n = len(y_val_est)
    val_recall_support = np.zeros(n)
    p_sum = 0.
    for i,j in enumerate(idx):
        p_sum += float(y[sample_validate[j]])
        val_recall_support[i] = p_sum
    val_x = np.linspace(0.,100.,n)
    vals.append((val_x, val_recall_support))
y_est /= y_count
In [ ]:
from sklearn.metrics import roc_auc_score
if np.any(np.isnan(y_est)):
    print np.sum(y_count == 0),len(y_count)
else:
    y_no_overlap = [r for r,l in zip(y, latencies) if abs(l-int(l)) < 0.01]
    y_est_no_overlap = [r for r,l in zip(y_est, latencies) if abs(l-int(l)) < 0.01]
    print roc_auc_score(y_no_overlap, y_est_no_overlap)
    with open('../data-cache/%s_predict.spkl'%target,'wb') as fp:
        pickle.dump((y_no_overlap, y_est_no_overlap), fp, -1)
After running the entire notebook, again and again, on all targets, continue to 140929-target-combine.
In [ ]:
!ls -l ../data-cache/*_test.spkl
In [ ]:
with open('../submissions/141017-predict.2.csv','w') as fpout:
    print >>fpout,'clip,preictal'
    for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
        with open('../data-cache/%s_test.spkl'%target,'rb') as fp:
            y_proba = pickle.load(fp)
        # write results
        for i,p in enumerate(y_proba):
            print >>fpout,'%s_test_segment_%04d.mat,%.15f' % (target, i+1, p)
In [ ]:
#!rm ../data-cache/*_test.spkl