In [3]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random

In [4]:
import sys
sys.path.append('..')

In [5]:
def prb2logit(x):
    return np.log(x/(1.-x))
def logit2prb(x):
    return 1./(1+np.exp(-x))

Read CV for best features/model

From 140907-CV


In [6]:
fnamecv = '../data-cache/140907-CV.n3.gen8_medianwindow-fft-with-time-freq-corr-1-48-r400-usf-w600.pkl'
with open(fnamecv, 'rb') as fp:
    target2iter2ys = pickle.load(fp)

In [7]:
from sklearn.metrics import roc_auc_score
def p(a,b):
    return '%d E%d'%(1000*a,1000*b)

target2ys = {}

all_ytest = all_y_proba =None
all_aucs = []
# iterate over all targets (patients/dogs)
for target, iter2ys in target2iter2ys.iteritems():
    target_ytest = target_y_proba =None # accumulate all results for that segment
    target_aucs = []
    print target,
    # iterate over all segments of that patient/dog, each time a different single segment
    # is used for testing
    for ys in iter2ys.itervalues():
        ytest = y_proba =None # accumulate all results for that segment
        aucs = []
        # iterate over 3 different shuffles to generate random training samples
        for yt, yp in ys: # real/estimated
            ytest = yt if ytest is None else np.concatenate((ytest,yt))
            y_proba = yp if y_proba is None else np.concatenate((y_proba,yp))
            aucs.append(roc_auc_score(yt, yp))
        print p(roc_auc_score(ytest, y_proba), np.mean(aucs)), # segment results
        target_aucs += aucs
        target_ytest = ytest if target_ytest is None else np.concatenate((target_ytest,ytest))
        target_y_proba = y_proba if target_y_proba is None else np.concatenate((target_y_proba,y_proba))
    print target,p(roc_auc_score(target_ytest, target_y_proba),np.mean(target_aucs)) # target results
    all_aucs += target_aucs        
    all_ytest = target_ytest if all_ytest is None else np.concatenate((all_ytest,target_ytest))
    all_y_proba = target_y_proba if all_y_proba is None else np.concatenate((all_y_proba,target_y_proba))
#     if target == 'Dog_3':
#         pl.hist(target_aucs,alpha=0.5)
    target2ys[target] = (target_ytest, prb2logit(target_y_proba))
print p(roc_auc_score(all_ytest, all_y_proba),np.mean(all_aucs)) # all data results
print


Dog_2 884 E916 908 E925 901 E927 Dog_2 898 E923
Dog_3 700 E707 705 E713 706 E702 Dog_3 703 E707
Dog_1 553 E551 541 E534 607 E610 Dog_1 568 E565
Dog_4 652 E697 657 E685 663 E680 Dog_4 656 E688
Dog_5 947 E954 930 E927 936 E934 Dog_5 938 E938
Patient_2 953 E970 971 E976 966 E998 Patient_2 964 E981
Patient_1 975 E998 950 E985 987 E981 Patient_1 970 E988
775 E774

Optimize target weights

Use hyperopt to optimize the complex ROC_AUC function


In [14]:
WEIGHTS = ['Dog_1','Dog_2','Dog_3','Dog_4','Dog_5','Patient_1','Patient_2',]
def objective(args):
    all_ytest = all_y_proba = None
    for target, (target_ytest, target_y_proba) in target2ys.iteritems():
        target_y_proba *= args[WEIGHTS.index(target)]
        all_ytest = target_ytest if all_ytest is None else np.concatenate((all_ytest,target_ytest))
        all_y_proba = target_y_proba if all_y_proba is None else np.concatenate((all_y_proba, target_y_proba))
    all_y_proba = logit2prb(all_y_proba)
    return roc_auc_score(all_ytest, all_y_proba)

define a search space for the different weights for different targets


In [18]:
from hyperopt import hp
space = (
        hp.normal('Dog_1',1,1),
        hp.normal('Dog_2',1,1),
        hp.normal('Dog_3',1,1),
        hp.normal('Dog_4',1,1),
        hp.normal('Dog_5',1,1),
        hp.normal('Patient_1',1,1),
        hp.normal('Patient_2',1,1),
    )

minimize the objective over the space


In [19]:
from hyperopt import fmin, tpe
best = fmin(objective, space, algo=tpe.suggest, max_evals=100)

In [20]:
best


Out[20]:
{'Dog_1': 0.2088648108506187,
 'Dog_2': 2.3152497125253957,
 'Dog_3': 1.459127443822347,
 'Dog_4': -0.2532805342575184,
 'Dog_5': 1.6983434661128765,
 'Patient_1': 1.6191735748997687,
 'Patient_2': 0.6000010008044052}

Read best submission


In [21]:
fname = '../submissions/140906-predict-direct.2.csv'

In [37]:
!wc -l {fname}
!head {fname}


    3936 ../submissions/140906-predict-direct.2.csv
clip,preictal
Dog_1_test_segment_0001.mat,0.528314974292854
Dog_1_test_segment_0002.mat,0.095106974579729
Dog_1_test_segment_0003.mat,0.086835065708654
Dog_1_test_segment_0004.mat,0.205226974175210
Dog_1_test_segment_0005.mat,0.159707544705429
Dog_1_test_segment_0006.mat,0.234699807847022
Dog_1_test_segment_0007.mat,0.143023108985720
Dog_1_test_segment_0008.mat,0.167520311762236
Dog_1_test_segment_0009.mat,0.099032089496216

In [22]:
submission = pd.read_csv(fname)

In [29]:
def calibrate(r):
    target = '_'.join(r['clip'].split('_')[:2])
    w = best[target]
    return logit2prb(prb2logit(r.preictal) * w)

In [31]:
submission.preictal = submission.apply(calibrate, axis=1)

In [34]:
submission.to_csv('../submissions/140912-rank-calibrate.csv', index=False)

In [38]:
!wc -l ../submissions/140912-rank-calibrate.csv
!head ../submissions/140912-rank-calibrate.csv


    3936 ../submissions/140912-rank-calibrate.csv
clip,preictal
Dog_1_test_segment_0001.mat,0.5059200592493357
Dog_1_test_segment_0002.mat,0.38448994144518656
Dog_1_test_segment_0003.mat,0.3795545722553353
Dog_1_test_segment_0004.mat,0.42976979939093596
Dog_1_test_segment_0005.mat,0.41415854923256656
Dog_1_test_segment_0006.mat,0.4385940393093643
Dog_1_test_segment_0007.mat,0.4075862934134161
Dog_1_test_segment_0008.mat,0.41705521956547725
Dog_1_test_segment_0009.mat,0.3867061999944999

In [ ]: