Manually run this notebook on all targets


In [38]:
target = 'Patient_2'

In [39]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import re
import math
import sys
import random
from collections import defaultdict

In [40]:
import sys
sys.path.append('..')
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')
def read_data(target, data_type, features):
    return cached_data_loader.load('data_%s_%s_%s'%(data_type,target,features),None)

Read data


In [41]:
FEATURES = 'gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70'
FEATURES1 = 'gen-8_maxdiff-60'

nbands = 0
nwindows = 0
for p in FEATURES.split('-'):
    if p[0] == 'b':
        nbands += 1
    elif p[0] == 'w':
        nwindows = int(p[1:])

nbands -= 1
nbands, nwindows


Out[41]:
(5, 60)

Each target receives a different model.

The positive examples were upsampled (using gen_ictal=-8)


In [42]:
from common.data import CachedDataLoader
cached_data_loader = CachedDataLoader('../data-cache')

def read_data(target, data_type, features=FEATURES):
    fname = 'data_%s_%s_%s'%(data_type,target,features)
    print fname
    return cached_data_loader.load(fname,None)

def process(X, X1, percentile=[0.05,0.95], nunits=2, mask_level=7000):
    N, Nf = X.shape
    print '# samples',N,'# power points', Nf
    nchannels = Nf / (nbands*nwindows)
    print '# channels', nchannels

    fix = defaultdict(int)
    newX = []
    for i in range(N):
        nw = nwindows//nunits
        windows = X[i,:].reshape((nunits,nw,-1))
        mask = X1[i,:].reshape((nunits,nw,-1)) # max value for each channel
        for j in range(nunits):
            for k in range(nchannels):
                m = mask[j,:,k] > mask_level # find large windows
                if np.any(m):
#                     print 'FIX', sum(m)
                    fix[sum(m)] += 1
                    if not np.all(m): # make sure at least one good window remains so we can reuse its values
                        # replace the bands of the bad windows with the mean of the bands over the good windows
                        windows[j,m,k*nbands:(k+1)*nbands] = np.mean(windows[j,~m,k*nbands:(k+1)*nbands], axis=0)
        sorted_windows = np.sort(windows, axis=1)
        features = np.concatenate([sorted_windows[:,int(p*nw),:] for p in percentile], axis=-1)
        newX.append(features.ravel())
    newX = np.array(newX)
    print sorted(fix.items())
    return newX

def getsegments(pdata):
    segments = []
    start = 0
    last_l = 0
    for i,l in enumerate(pdata.latencies):
        if l<last_l:
            segments.append(range(start,i))
            start = i
        last_l = l
    segments.append(range(start,i+1))
    return segments

def getdata():
    pdata = read_data(target, 'preictal') # positive examples
    Np, _ = pdata.X.shape
    print 'Positive examples',Np,
    psegments = getsegments(pdata)
    Nps = len(psegments)
    print 'sequences:',Nps
    

    ndata = read_data(target, 'interictal') # negative examples
    Nn, _ = ndata.X.shape
    print 'Negative',Nn,
    nsegments = getsegments(ndata)
    Nns = len(nsegments)
    print 'sequences:',Nns

    X = np.concatenate((pdata.X, ndata.X))
    print 'p-ratio',float(Np)/(Np+Nn), 'sequences p-ratio:',float(Nps)/(Nps+Nns)
    nsegments = [[s+Np for s in ns] for ns in nsegments]
    latencies = np.concatenate((pdata.latencies,ndata.latencies))

    pdata1 = read_data(target, 'preictal', FEATURES1) # positive examples
    ndata1 = read_data(target, 'interictal', FEATURES1) # negative examples
    X1 = np.concatenate((pdata1.X, ndata1.X))

    print 'Training:'
    X = process(X, X1)
    
    
    y = np.zeros(X.shape[0])
    y[:Np] = 1
    
    print 'Test:'
    tdata = read_data(target, 'test') # test examples
    tdata1 = read_data(target, 'test', FEATURES1) # test examples
    Xt = process(tdata.X, tdata1.X)
    
    return X,y,Xt,psegments,nsegments,latencies

The data is broken into segments (recording sequences) that should be kept together when splitting into training and validation sets.
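For illustration only (this is not part of the pipeline, and the toy latency values below are made up): a drop in latency marks the start of a new recording sequence, so getsegments groups row indices like this:

from collections import namedtuple

# toy stand-in for pdata: latencies restart at the beginning of each sequence
Toy = namedtuple('Toy', ['latencies'])
toy = Toy(latencies=[0., 1., 2., 0., 1., 0., 1., 2., 3.])
print getsegments(toy)   # -> [[0, 1, 2], [3, 4], [5, 6, 7, 8]]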


In [43]:
X,y,Xt,psegments,nsegments,latencies = getdata()


data_preictal_Patient_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
Positive examples 138 sequences: 3
data_interictal_Patient_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
Negative 42 sequences: 7
p-ratio 0.766666666667 sequences p-ratio: 0.3
data_preictal_Patient_2_gen-8_maxdiff-60
data_interictal_Patient_2_gen-8_maxdiff-60
Training:
# samples 180 # power points 7200
# channels 24
[(1, 189)]
Test:
data_test_Patient_2_gen-8_allbands2-usf-w60-b0.2-b4-b8-b12-b30-b70
data_test_Patient_2_gen-8_maxdiff-60
# samples 150 # power points 7200
# channels 24
[(1, 3), (2, 1), (3, 5), (4, 2), (5, 4), (6, 4), (7, 6), (8, 9), (9, 4), (10, 5), (12, 3), (13, 3), (14, 12), (16, 4)]

In [44]:
N, NF = X.shape
N, NF


Out[44]:
(180, 480)

Feature Importance

Positive/Negative feature importance

I am using RF because it needs very little hyper-parameter tuning. I am deliberately using a small depth, because I am not interested in the best prediction (which is already high) but in the feature importances after pair interactions are taken into account.


In [45]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, oob_score=True, max_depth=2)
rf.fit(X, y)


Out[45]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=2, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0)

In [46]:
rf.oob_score_


Out[46]:
0.8833333333333333

In [47]:
pnweights = rf.feature_importances_
pn_importance_order = pnweights.argsort()[::-1]
pl.plot(rf.feature_importances_[pn_importance_order])


Out[47]:
[<matplotlib.lines.Line2D at 0x114791f10>]
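The ordering pn_importance_order is used below to rank the feature columns from most to least important, so that keeping only the first nf columns (the nf hyper-parameter tuned later) keeps the nf most important features. A minimal sketch, with an arbitrary cutoff of my own choosing:

nf = 50                                  # hypothetical cutoff; hyperopt will tune the real one below
X_top = X[:, pn_importance_order[:nf]]   # the nf most important features, most important first
X_top.shape                              # -> (180, 50)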

Classification

hyperopt

We will use a Gradient Boosting Classifier, which usually gives better results than L1 or RF. In addition, like RF, it does not require normalization or PCA. However, unlike RF or L1, it has many hyper-parameters that can affect its performance. We also need to decide how many features to use, which is yet another hyper-parameter. Instead of guessing manually, we can use hyperopt to do the guessing for us.
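To see what hyperopt does in isolation, here is a minimal sketch with a toy objective and search space of my own (not the ones used below): fmin repeatedly calls the objective with parameter values suggested by the TPE algorithm and returns the best setting it found.

from hyperopt import fmin, tpe, hp

def toy_objective(params):
    x = params['x']
    return (x - 3.) ** 2                 # hyperopt minimizes the returned loss

toy_space = {'x': hp.uniform('x', -10., 10.)}
best = fmin(toy_objective, toy_space, algo=tpe.suggest, max_evals=50)
# best is a dict such as {'x': 2.97...} -- close to the true minimum at x = 3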

We will perform several hyperopt searches in parallel, each running on a different bootstrap sample of the data.

shared memory

The data itself is identical for every process, so there is no need to duplicate it; instead we will use shared memory (shmem).
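The pattern, roughly: the parent process writes the array once to a float32 memmap file, and each engine opens the same file read-only, so the operating system can share the pages between processes. A self-contained sketch (the file name /tmp/demo.mmap is arbitrary):

import numpy as np

A = np.random.rand(100, 10)

# writer (parent process): create the memmap file and copy the data into it
mm = np.memmap('/tmp/demo.mmap', shape=A.shape, dtype=np.float32, mode='w+')
mm[:, :] = A
del mm                     # flush to disk

# reader (each engine): open the same file read-only
A_shared = np.memmap('/tmp/demo.mmap', shape=A.shape, dtype=np.float32, mode='r')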


In [48]:
!rm /tmp/X.mmap
mmX = np.memmap('/tmp/X.mmap', shape=X.shape, dtype=np.float32, mode='w+')
mmX[:,:] = X[:,pn_importance_order]
del mmX # flush to disk

parallel

We will use the IPython parallel processing infrastructure. Visit the Clusters tab on the IPython Home page and start 8 engines (or as many as you have cores on your machine) from the default profile.

Alternatively, you can run from the command line:

ipcluster start --n=8

Wait a little bit (otherwise you will get an error in the next cell).
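Instead of sleeping for a fixed amount of time, you can poll the client until enough engines have registered. A small sketch of my own (the helper name and timeout are arbitrary):

from IPython.parallel import Client
import time

def wait_for_engines(n, timeout=60):
    # poll until at least n engines have registered, or give up after timeout seconds
    client = Client()
    t0 = time.time()
    while len(client.ids) < n and time.time() - t0 < timeout:
        time.sleep(1)
    return client

# client = wait_for_engines(8)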


In [49]:
#!sleep 30

In [50]:
from IPython.parallel import Client
client = Client()
lv = client.load_balanced_view()
#lv.set_flags(block = False, retries = 0)
clients=client[:]
Ncores = len(clients)
Ncores


Out[50]:
8

Copy some information to all engines


In [51]:
clients['X_shape'] = X.shape
clients['y'] = y
clients['psegments'] = psegments
clients['nsegments'] = nsegments

Load the shared memory on all engines


In [52]:
%%px
import numpy as np
N, NF = X_shape
X = np.memmap('/tmp/X.mmap', shape=X_shape, dtype=np.float32, mode='r')

In [53]:
%%px
import random, itertools
def random_train_validation_split(psegments=psegments, nsegments=nsegments, N=N, pratio=1):
    """Randomly pick one positive segment for validation and a matching number of negative segments"""
    Nps = len(psegments)
    assert Nps > 1
    Nns = len(nsegments)
    assert Nns > 1
    npsratio = float(Nns)/Nps
    Ntrainps = 1
    Ntrainns = min(max(1,int(Ntrainps*npsratio+0.5)), Nns-1) # make sure we have something to train
    
    s = random.choice(psegments)
    ns = random.sample(nsegments,Ntrainns) # sequence based
    n = list(itertools.chain(*ns)) # .ravel does not work - elements of nsegments are not of equal length
    sample_validate = s + n
    random.shuffle(sample_validate)
    
    
    all_p = list(itertools.chain(*psegments))
    all_n = list(itertools.chain(*nsegments))

    testp = list(set(all_p) - set(s))
    if pratio != 1:
        testp *= pratio
#         ntestp = len(testp)
#         boot_ntestp = int(ntestp*pratio)
#         w = np.ones(ntestp)/float(ntestp)
#         testp = [testp[i] for i, n in enumerate(np.random.multinomial(boot_ntestp, w))
#                  for k in xrange(n)]
        
    testn = list(set(all_n) - set(n))
    sample_test = testp + testn
    random.shuffle(sample_test)

    return sample_test, sample_validate

We will optimize AUC
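AUC (the area under the ROC curve) depends only on how the predicted scores rank positive examples relative to negative ones: 1.0 means a perfect ranking, 0.5 a random one. A quick sanity check with made-up labels and scores:

from sklearn.metrics import roc_auc_score

y_true   = [0, 0, 1, 1]
y_scores = [0.1, 0.4, 0.35, 0.8]
roc_auc_score(y_true, y_scores)   # -> 0.75 (3 of the 4 positive/negative pairs are ranked correctly)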


In [54]:
%%px
from gradient_boost_loss import NAUC
import os
def hyperopt_work(args):
    from lockfile import LockFile
    space = args.get('space')
    pratio = int(space['pratio'])
    sample_train, sample_validate = random_train_validation_split(pratio=pratio)    

    X_trn = X[sample_train,:]
    y_trn = y[sample_train]
    assert y_trn.mean() > 0.01 and y_trn.mean() < 0.99

    X_val = X[sample_validate,:]
    y_val = y[sample_validate]
    
    def t_est(args):
        try:
            from sklearn.ensemble import GradientBoostingClassifier 
            est = GradientBoostingClassifier()
            params = dict((k,int(v) if isinstance(v, float) and abs(v - int(v)) < 1e-3 else v)
                          for k, v in args.iteritems() if k not in ['nf', 'pratio'])
            est.set_params(**params)
#             est.loss__ = NAUC(2)

            nf = int(args['nf'])
            est.fit(X_trn[:,:nf], y_trn)
            
            y_train = est.predict_proba(X_trn[:,:nf])[:,1]
            y_validate = est.predict_proba(X_val[:,:nf])[:,1]
            
            from sklearn.metrics import roc_auc_score
            train_score = roc_auc_score(y_trn, y_train)
            validate_score = roc_auc_score(y_val, y_validate)
            
            lock = LockFile('.lock')
            with lock:
                with open('../data-cache/hyperopt.txt','a') as fp:
                    print >>fp, validate_score, train_score, os.getpid(), args
            from hyperopt import STATUS_OK
            return {'loss':1.-validate_score, 'status':STATUS_OK, 'pid':os.getpid(),
                    'train_score':train_score, 'validate_score':validate_score}
        except Exception as e:
            lock = LockFile('.lock')
            with lock:
                with open('../data-cache/hyperopt.txt','a') as fp:
                    print >>fp, 'failed', e, args
            from hyperopt import STATUS_FAIL
            return {'status':STATUS_FAIL, 'loss':1.} # 'loss' is mandatory
            
    
    max_evals = args.get('max_evals', 100)
    from hyperopt import fmin, tpe, Trials
#     trials = Trials()
    best = fmin( t_est, space, algo=tpe.suggest, max_evals=max_evals) #, trials=trials)
#     import cPickle as pickle
#     lock = LockFile('.lock')
#     with lock:
#         with open('../data-cache/hyperopt.spkl','ab') as fp:
#                 pickle.dump(trials, fp, -1)
    return best

Define the statistical space in which we will do our hyper-parameter search


In [55]:
%%px
from hyperopt import hp
from math import log
space = {
    'n_estimators': 1000,
    'pratio': 1,
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(1.)),
    'nf': hp.quniform( 'nf', 10, NF, 1),
    'max_features': hp.quniform( 'max_features', 10, 30, 1),
    'max_depth': hp.quniform( 'max_depth', 2, 18, 1),
    'min_samples_leaf': hp.quniform( 'min_samples_leaf', 2, 30, 1),
#     'subsample': hp.uniform( 'subsample', 0.2, 0.9),
}

In [56]:
!rm ../data-cache/hyperopt.*

Run hyperopt searches in parallel on all cores. Each hyperopt search will do 100 evaluations of the hyper-parameters.


In [57]:
%%px
hyperopt_work({'space':space, 'max_evals':100})


Out[0:46]: 
{'learning_rate': 0.016273346783129052,
 'max_depth': 6.0,
 'max_features': 14.0,
 'min_samples_leaf': 3.0,
 'nf': 172.0}
Out[1:46]: 
{'learning_rate': 0.7477400914297095,
 'max_depth': 6.0,
 'max_features': 18.0,
 'min_samples_leaf': 8.0,
 'nf': 153.0}
Out[2:46]: 
{'learning_rate': 0.056923904333768996,
 'max_depth': 6.0,
 'max_features': 10.0,
 'min_samples_leaf': 14.0,
 'nf': 11.0}
Out[3:46]: 
{'learning_rate': 0.035907527443119265,
 'max_depth': 14.0,
 'max_features': 18.0,
 'min_samples_leaf': 18.0,
 'nf': 18.0}
Out[4:46]: 
{'learning_rate': 0.26840910401979023,
 'max_depth': 10.0,
 'max_features': 23.0,
 'min_samples_leaf': 21.0,
 'nf': 121.0}
Out[5:46]: 
{'learning_rate': 0.833610267905131,
 'max_depth': 3.0,
 'max_features': 22.0,
 'min_samples_leaf': 10.0,
 'nf': 278.0}
Out[6:46]: 
{'learning_rate': 0.0074829147858297045,
 'max_depth': 9.0,
 'max_features': 10.0,
 'min_samples_leaf': 23.0,
 'nf': 15.0}
Out[7:46]: 
{'learning_rate': 0.743130006310607,
 'max_depth': 6.0,
 'max_features': 16.0,
 'min_samples_leaf': 22.0,
 'nf': 96.0}

Wait for the jobs to end. This will take some time; your computer can also get quite hot, so use the time to arrange some cooling for it.


In [58]:
!sort -n -r ../data-cache/hyperopt.txt > ../data-cache/hyperopt.sort.txt
!head -n 5 ../data-cache/hyperopt.sort.txt


1.0 1.0 1387 {'n_estimators': 1000, 'pratio': 1, 'max_features': 16.0, 'learning_rate': 0.743130006310607, 'max_depth': 6.0, 'nf': 96.0, 'min_samples_leaf': 22.0}
1.0 1.0 1377 {'n_estimators': 1000, 'pratio': 1, 'max_features': 23.0, 'learning_rate': 0.26840910401979023, 'max_depth': 10.0, 'nf': 121.0, 'min_samples_leaf': 21.0}
1.0 1.0 1372 {'n_estimators': 1000, 'pratio': 1, 'max_features': 14.0, 'learning_rate': 0.016273346783129052, 'max_depth': 6.0, 'nf': 172.0, 'min_samples_leaf': 3.0}
0.998188405797 1.0 1386 {'n_estimators': 1000, 'pratio': 1, 'max_features': 10.0, 'learning_rate': 0.0074829147858297045, 'max_depth': 9.0, 'nf': 15.0, 'min_samples_leaf': 23.0}
0.998188405797 1.0 1377 {'n_estimators': 1000, 'pratio': 1, 'max_features': 22.0, 'learning_rate': 0.08421933314518218, 'max_depth': 15.0, 'nf': 176.0, 'min_samples_leaf': 26.0}

In [59]:
fp = open('../data-cache/hyperopt.sort.txt')
hyperopt_results = []
for l in fp:
    if l.startswith('failed'):
        continue
    l = l.split()
    validate_score = float(l[0])
    train_score = float(l[1])
    pid = int(l[2])
    args = eval(''.join(l[3:]))
    hyperopt_results.append((validate_score, train_score, pid, args))
fp.close()
len(hyperopt_results)


Out[59]:
774

Prediction/Bagging/Validation


In [60]:
Nt, NFF = Xt.shape
assert NF == NFF
Nt, NFF


Out[60]:
(150, 480)

In [61]:
!rm /tmp/Xt.mmap
Xt_shape = (Nt, NF)
mmXt = np.memmap('/tmp/Xt.mmap', shape=Xt_shape, dtype=np.float32, mode='w+')
mmXt[:,:] = Xt[:,pn_importance_order]
del mmXt # flush to disk

In [62]:
clients['Xt_shape'] = Xt_shape
clients['target'] = target

In [63]:
%%px
Xt = np.memmap('/tmp/Xt.mmap', shape=Xt_shape, dtype=np.float32, mode='r')

In [64]:
def predict_work(args):
    from lockfile import LockFile
    from sklearn.ensemble import GradientBoostingClassifier 
    import cPickle as pickle

    N = X_shape[0]
    NF = int(args.get('nf', X_shape[1]))
    pratio = int(args.get('pratio',1))
    # use out-of-bag samples to estimate the generalization error
    sample_train, sample_validate = random_train_validation_split(pratio=pratio)    

    X_trn = X[sample_train,:NF]
    y_trn = y[sample_train]
    
    X_val = X[sample_validate,:NF]
    y_val = y[sample_validate]
    
    X_test = Xt[:,:NF]
    

    from sklearn.ensemble import GradientBoostingClassifier 
    est = GradientBoostingClassifier()

    params = dict((k,int(v) if isinstance(v, float) and abs(v - int(v)) < 1e-3 else v)
                          for k, v in args.iteritems() if k not in ['nf', 'pratio'])
    est.set_params(**params)
#     est.loss__ = NAUC(2)
    try:
        est.fit(X_trn, y_trn)

        y_val_est = est.predict_proba(X_val)[:,1]
        
        y_test_est = est.predict_proba(X_test)[:,1]

        lock = LockFile('.lock')
        with lock:
            with open('../data-cache/validate.spkl','ab') as fp:
                # in a later stage we will use the OOB samples to make predictions on all samples
                # so keep a record of the index of each OOB sample
                pickle.dump((sample_validate,y_val_est), fp, -1)
            with open('../data-cache/%s_test.spkl'%target,'ab') as fp:
                # keep this model's test predictions so they can be averaged (bagged) later
                pickle.dump(y_test_est, fp, -1)
    except Exception as e:
        return e
    from sklearn.metrics import roc_auc_score
    # note: the validation split's p_ratio differs from the test set's, so this error measure is not completely accurate
    return roc_auc_score(y_val, y_val_est)

In [65]:
!rm ../data-cache/validate.spkl
!rm ../data-cache/{target}_test.spkl

In [66]:
from collections import defaultdict
pid2args_list = defaultdict(list)
for res in hyperopt_results:
    validation_score = res[0]
    train_score = res[1]
    pid = res[2]
    args = res[3]
    pid2args_list[pid].append((validation_score, args))

In [67]:
args_list = []
for v in pid2args_list.values():
    print v[1]
    for vv in v[:4]:
        args_list.append(vv[1])


(0.923913043478, {'learning_rate': 0.3127277707970103, 'nf': 16.0, 'min_samples_leaf': 14.0, 'n_estimators': 1000, 'pratio': 1, 'max_features': 12.0, 'max_depth': 18.0})
(0.998188405797, {'learning_rate': 0.08421933314518218, 'nf': 176.0, 'min_samples_leaf': 26.0, 'n_estimators': 1000, 'pratio': 1, 'max_features': 22.0, 'max_depth': 15.0})
(0.86231884058, {'learning_rate': 0.0051556466791173195, 'nf': 300.0, 'min_samples_leaf': 2.0, 'n_estimators': 1000, 'pratio': 1, 'max_features': 26.0, 'max_depth': 7.0})
(0.996376811594, {'learning_rate': 0.009379599636176498, 'nf': 16.0, 'min_samples_leaf': 25.0, 'n_estimators': 1000, 'pratio': 1, 'max_features': 15.0, 'max_depth': 9.0})
(0.994565217391, {'learning_rate': 0.16266020779091664, 'nf': 158.0, 'min_samples_leaf': 20.0, 'n_estimators': 1000, 'pratio': 1, 'max_features': 18.0, 'max_depth': 10.0})
(0.998188405797, {'learning_rate': 0.012325309500310029, 'nf': 177.0, 'min_samples_leaf': 5.0, 'n_estimators': 1000, 'pratio': 1, 'max_features': 17.0, 'max_depth': 3.0})
(0.965579710145, {'learning_rate': 0.9885380412204035, 'nf': 151.0, 'min_samples_leaf': 14.0, 'n_estimators': 1000, 'pratio': 1, 'max_features': 18.0, 'max_depth': 7.0})
(0.916666666667, {'learning_rate': 0.4339857336888007, 'nf': 27.0, 'min_samples_leaf': 15.0, 'n_estimators': 1000, 'pratio': 1, 'max_features': 10.0, 'max_depth': 14.0})

In [68]:
args_list


Out[68]:
[{'learning_rate': 0.035907527443119265,
  'max_depth': 14.0,
  'max_features': 18.0,
  'min_samples_leaf': 18.0,
  'n_estimators': 1000,
  'nf': 18.0,
  'pratio': 1},
 {'learning_rate': 0.3127277707970103,
  'max_depth': 18.0,
  'max_features': 12.0,
  'min_samples_leaf': 14.0,
  'n_estimators': 1000,
  'nf': 16.0,
  'pratio': 1},
 {'learning_rate': 0.05785381661813288,
  'max_depth': 6.0,
  'max_features': 17.0,
  'min_samples_leaf': 14.0,
  'n_estimators': 1000,
  'nf': 20.0,
  'pratio': 1},
 {'learning_rate': 0.2015009395794558,
  'max_depth': 18.0,
  'max_features': 14.0,
  'min_samples_leaf': 12.0,
  'n_estimators': 1000,
  'nf': 15.0,
  'pratio': 1},
 {'learning_rate': 0.26840910401979023,
  'max_depth': 10.0,
  'max_features': 23.0,
  'min_samples_leaf': 21.0,
  'n_estimators': 1000,
  'nf': 121.0,
  'pratio': 1},
 {'learning_rate': 0.08421933314518218,
  'max_depth': 15.0,
  'max_features': 22.0,
  'min_samples_leaf': 26.0,
  'n_estimators': 1000,
  'nf': 176.0,
  'pratio': 1},
 {'learning_rate': 0.014994719213255552,
  'max_depth': 7.0,
  'max_features': 29.0,
  'min_samples_leaf': 23.0,
  'n_estimators': 1000,
  'nf': 164.0,
  'pratio': 1},
 {'learning_rate': 0.06586998196882567,
  'max_depth': 7.0,
  'max_features': 25.0,
  'min_samples_leaf': 20.0,
  'n_estimators': 1000,
  'nf': 220.0,
  'pratio': 1},
 {'learning_rate': 0.833610267905131,
  'max_depth': 3.0,
  'max_features': 22.0,
  'min_samples_leaf': 10.0,
  'n_estimators': 1000,
  'nf': 278.0,
  'pratio': 1},
 {'learning_rate': 0.0051556466791173195,
  'max_depth': 7.0,
  'max_features': 26.0,
  'min_samples_leaf': 2.0,
  'n_estimators': 1000,
  'nf': 300.0,
  'pratio': 1},
 {'learning_rate': 0.003708153712561447,
  'max_depth': 10.0,
  'max_features': 26.0,
  'min_samples_leaf': 2.0,
  'n_estimators': 1000,
  'nf': 272.0,
  'pratio': 1},
 {'learning_rate': 0.011131561641445346,
  'max_depth': 7.0,
  'max_features': 26.0,
  'min_samples_leaf': 2.0,
  'n_estimators': 1000,
  'nf': 280.0,
  'pratio': 1},
 {'learning_rate': 0.0074829147858297045,
  'max_depth': 9.0,
  'max_features': 10.0,
  'min_samples_leaf': 23.0,
  'n_estimators': 1000,
  'nf': 15.0,
  'pratio': 1},
 {'learning_rate': 0.009379599636176498,
  'max_depth': 9.0,
  'max_features': 15.0,
  'min_samples_leaf': 25.0,
  'n_estimators': 1000,
  'nf': 16.0,
  'pratio': 1},
 {'learning_rate': 0.012415662877131658,
  'max_depth': 7.0,
  'max_features': 11.0,
  'min_samples_leaf': 20.0,
  'n_estimators': 1000,
  'nf': 13.0,
  'pratio': 1},
 {'learning_rate': 0.004753262257803044,
  'max_depth': 6.0,
  'max_features': 15.0,
  'min_samples_leaf': 27.0,
  'n_estimators': 1000,
  'nf': 17.0,
  'pratio': 1},
 {'learning_rate': 0.743130006310607,
  'max_depth': 6.0,
  'max_features': 16.0,
  'min_samples_leaf': 22.0,
  'n_estimators': 1000,
  'nf': 96.0,
  'pratio': 1},
 {'learning_rate': 0.16266020779091664,
  'max_depth': 10.0,
  'max_features': 18.0,
  'min_samples_leaf': 20.0,
  'n_estimators': 1000,
  'nf': 158.0,
  'pratio': 1},
 {'learning_rate': 0.5194164418581994,
  'max_depth': 6.0,
  'max_features': 26.0,
  'min_samples_leaf': 19.0,
  'n_estimators': 1000,
  'nf': 115.0,
  'pratio': 1},
 {'learning_rate': 0.26961474915074457,
  'max_depth': 7.0,
  'max_features': 15.0,
  'min_samples_leaf': 20.0,
  'n_estimators': 1000,
  'nf': 91.0,
  'pratio': 1},
 {'learning_rate': 0.016273346783129052,
  'max_depth': 6.0,
  'max_features': 14.0,
  'min_samples_leaf': 3.0,
  'n_estimators': 1000,
  'nf': 172.0,
  'pratio': 1},
 {'learning_rate': 0.012325309500310029,
  'max_depth': 3.0,
  'max_features': 17.0,
  'min_samples_leaf': 5.0,
  'n_estimators': 1000,
  'nf': 177.0,
  'pratio': 1},
 {'learning_rate': 0.2099907912434229,
  'max_depth': 14.0,
  'max_features': 12.0,
  'min_samples_leaf': 4.0,
  'n_estimators': 1000,
  'nf': 99.0,
  'pratio': 1},
 {'learning_rate': 0.01139410422156201,
  'max_depth': 3.0,
  'max_features': 22.0,
  'min_samples_leaf': 3.0,
  'n_estimators': 1000,
  'nf': 229.0,
  'pratio': 1},
 {'learning_rate': 0.7477400914297095,
  'max_depth': 6.0,
  'max_features': 18.0,
  'min_samples_leaf': 8.0,
  'n_estimators': 1000,
  'nf': 153.0,
  'pratio': 1},
 {'learning_rate': 0.9885380412204035,
  'max_depth': 7.0,
  'max_features': 18.0,
  'min_samples_leaf': 14.0,
  'n_estimators': 1000,
  'nf': 151.0,
  'pratio': 1},
 {'learning_rate': 0.833610267905131,
  'max_depth': 3.0,
  'max_features': 22.0,
  'min_samples_leaf': 10.0,
  'n_estimators': 1000,
  'nf': 278.0,
  'pratio': 1},
 {'learning_rate': 0.3578316550030738,
  'max_depth': 6.0,
  'max_features': 16.0,
  'min_samples_leaf': 19.0,
  'n_estimators': 1000,
  'nf': 28.0,
  'pratio': 1},
 {'learning_rate': 0.056923904333768996,
  'max_depth': 6.0,
  'max_features': 10.0,
  'min_samples_leaf': 14.0,
  'n_estimators': 1000,
  'nf': 11.0,
  'pratio': 1},
 {'learning_rate': 0.4339857336888007,
  'max_depth': 14.0,
  'max_features': 10.0,
  'min_samples_leaf': 15.0,
  'n_estimators': 1000,
  'nf': 27.0,
  'pratio': 1},
 {'learning_rate': 0.05429269285657218,
  'max_depth': 7.0,
  'max_features': 10.0,
  'min_samples_leaf': 20.0,
  'n_estimators': 1000,
  'nf': 11.0,
  'pratio': 1},
 {'learning_rate': 0.04928345549155947,
  'max_depth': 3.0,
  'max_features': 15.0,
  'min_samples_leaf': 19.0,
  'n_estimators': 1000,
  'nf': 40.0,
  'pratio': 1}]

In [69]:
results = lv.map(predict_work, args_list*Ncores)

In [70]:
import IPython
itr = results.__iter__()
while True:
    try:
        r = itr.next()
    except StopIteration:
        print 'stopped'
        break
    except IPython.parallel.error.RemoteError as e:
        print e
        continue
    except Exception as e:
        print e.__class__
        continue
    print r


0.965579710145
0.880434782609
0.990942028986
0.891304347826
0.686594202899
0.115942028986
0.75
0.967391304348
0.266304347826
0.182971014493
0.974637681159
0.836956521739
0.856884057971
0.945652173913
1.0
0.740942028986
0.545289855072
0.972826086957
0.826086956522
0.563405797101
0.971014492754
0.980072463768
0.905797101449
0.650362318841
0.786231884058
0.295289855072
0.184782608696
0.505434782609
0.938405797101
0.26268115942
0.855072463768
0.833333333333
0.675724637681
0.842391304348
0.938405797101
0.659420289855
0.771739130435
0.0326086956522
0.480072463768
0.971014492754
0.579710144928
0.989130434783
0.235507246377
0.143115942029
0.871376811594
0.998188405797
0.688405797101
0.646739130435
0.530797101449
0.648550724638
0.217391304348
0.523550724638
0.632246376812
0.184782608696
0.721014492754
0.54347826087
0.724637681159
0.695652173913
0.463768115942
0.608695652174
0.485507246377
0.860507246377
0.884963768116
0.759057971014
0.882246376812
0.972826086957
1.0
0.842391304348
0.833333333333
0.858695652174
0.994565217391
0.54347826087
0.885869565217
0.253623188406
0.833333333333
0.833333333333
0.275362318841
1.0
1.0
0.983695652174
0.967391304348
0.742753623188
0.452898550725
0.619565217391
0.550724637681
0.822463768116
0.963768115942
0.990942028986
0.54347826087
0.594202898551
0.150362318841
0.226449275362
0.574275362319
0.980072463768
0.657608695652
0.717391304348
0.889492753623
0.692028985507
0.143115942029
0.869565217391
0.827898550725
0.726449275362
0.467391304348
0.815217391304
0.679347826087
0.541666666667
0.400362318841
0.547101449275
0.79347826087
0.951086956522
0.949275362319
0.248188405797
0.731884057971
0.981884057971
0.822463768116
0.992753623188
0.650362318841
0.530797101449
0.856884057971
0.527173913043
0.684782608696
0.889492753623
0.958333333333
0.985507246377
0.846014492754
0.902173913043
0.853260869565
0.751811594203
0.925724637681
0.784420289855
0.972826086957
0.773550724638
0.733695652174
0.0724637681159
0.68115942029
0.188405797101
0.516304347826
0.856884057971
0.927536231884
0.217391304348
0.672101449275
0.961956521739
0.998188405797
0.916666666667
0.626811594203
0.43115942029
0.936594202899
0.846014492754
0.539855072464
0.876811594203
0.161231884058
0.653985507246
0.860507246377
0.652173913043
0.746376811594
1.0
0.884057971014
0.846014492754
1.0
0.610507246377
0.461956521739
0.95652173913
0.969202898551
0.465579710145
0.289855072464
1.0
0.746376811594
0.942028985507
0.541666666667
0.925724637681
0.91847826087
0.342391304348
0.996376811594
0.949275362319
0.742753623188
0.869565217391
0.978260869565
0.878623188406
0.965579710145
0.813405797101
0.452898550725
0.748188405797
0.764492753623
0.976449275362
0.88768115942
0.530797101449
0.13768115942
0.692028985507
0.798913043478
0.771739130435
0.967391304348
0.58152173913
0.900362318841
0.753623188406
0.13768115942
0.684782608696
0.780797101449
0.971014492754
0.920289855072
0.480072463768
0.628623188406
0.83152173913
0.186594202899
0.992753623188
0.969202898551
0.786231884058
0.753623188406
0.659420289855
0.573369565217
0.992753623188
0.934782608696
0.989130434783
1.0
0.5
0.385869565217
0.135869565217
0.768115942029
0.289855072464
0.690217391304
0.95652173913
0.677536231884
0.869565217391
0.670289855072
0.994565217391
0.940217391304
0.784420289855
0.335144927536
0.98731884058
0.297101449275
0.596014492754
0.711956521739
0.0108695652174
0.467391304348
0.867753623188
0.740942028986
0.708333333333
0.829710144928
0.920289855072
0.75
0.989130434783
0.472826086957
0.534420289855
0.161231884058
0.536231884058
0.518115942029
0.798913043478
0.672101449275
0.985507246377
0.630434782609
1.0
0.0615942028986
0.590579710145
0.474637681159
0.811594202899
0.922101449275
0.454710144928
stopped

In [71]:
fp = open('../data-cache/validate.spkl','rb')
count = 0
y_est = np.zeros(N)
y_count = np.zeros(N)
vals = []
while True:
    try:
        sample_validate,y_val_est = pickle.load(fp)
    except EOFError: # no more pickled results
        break
    count += 1
    y_est[sample_validate] += y_val_est
    y_count[sample_validate] += 1

    idx = y_val_est.argsort()[::-1]
    n = len(y_val_est)
    val_recall_support = np.zeros(n)
    p_sum = 0.
    for i,j in enumerate(idx):
        p_sum += float(y[sample_validate[j]])
        val_recall_support[i] = p_sum
    val_x = np.linspace(0.,100.,n)
    vals.append((val_x, val_recall_support))

y_est /= y_count

In [72]:
np.sum(y_count == 0)


Out[72]:
0

In [73]:
from sklearn.metrics import roc_auc_score
# keep only samples at (near-)integer latencies, i.e. the original non-overlapping windows
y_no_overlap = [r for r,l in zip(y, latencies) if abs(l-int(l)) < 0.01]
y_est_no_overlap = [r for r,l in zip(y_est, latencies) if abs(l-int(l)) < 0.01]
roc_auc_score(y_no_overlap, y_est_no_overlap)


Out[73]:
0.74867724867724872

In [74]:
with open('../data-cache/%s_predict.spkl'%target,'wb') as fp:
    pickle.dump((y_no_overlap, y_est_no_overlap), fp, -1)

After running the entire notebook on each of the targets (changing target at the top each time), continue to 141107-GBC-combine


In [ ]: