In [37]:
import feather
import os
import re
import pickle
import time
import datetime
import random

import numpy as np
import pandas as pd

from numba import jit

from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack, vstack

from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm

%matplotlib inline

import xgboost as xgb
import subprocess

# Custom modules
import const
import func

In [2]:
# Based on: https://www.kaggle.com/c/caterpillar-tube-pricing/forums/t/15748/strategies-to-encode-categorical-variables-with-many-categories/88207

Load data


In [239]:
# Show the configured train file stems.
# Parenthesised print gives the same output on Python 2 and also runs on Python 3.
print(const.TRAIN_FILES)


['train_numeric', 'train_categorical_to_num', 'train_date']

In [3]:
# Load look-up table
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)


Out[3]:
line station feature_nr feat_nr_dat name_dat name_cat name_num col_dat col_num col_cat station_V2 line_V2
0 0 0 0 1.0 L0_S0_D1 NaN L0_S0_F0 0.0 0.0 NaN 0.0 1.0
1 0 0 2 3.0 L0_S0_D3 NaN L0_S0_F2 1.0 1.0 NaN 0.0 1.0
2 0 0 4 5.0 L0_S0_D5 NaN L0_S0_F4 2.0 2.0 NaN 0.0 1.0

In [87]:
# Load cluster info; the first CSV column becomes the index (index_col=0)
cluster_info = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'), index_col=0)
# Parenthesised print: identical output on Python 2, valid on Python 3
print(cluster_info.shape)
cluster_info.head(3)


(2367495, 7)
Out[87]:
unique_path cluster_n8 cluster_n15 cluster_n25 cluster_n50 cluster_n150 cluster_n500
Id
4 13409 1 2 3 47 36 369
6 7029 2 10 0 35 63 14
7 12763 1 2 19 3 132 477

In [12]:
# Load timestamps
# Index 2 of the file lists selects the date files. The objects returned by the
# project helper func.load_data_file are read below through
# ['data']['features'], ['data']['ids'] and ['data']['y'].
date_train = func.load_data_file(const.TRAIN_FILES[2])
date_test = func.load_data_file(const.TEST_FILES[2])


Returning <open file '/Volumes/My Book/kaggle_bosch/train_date.pkl', mode 'rb' at 0x1184441e0>.pkl
Returning <open file '/Volumes/My Book/kaggle_bosch/test_date.pkl', mode 'rb' at 0x1184441e0>.pkl

In [15]:
# Stack train and test date features row-wise (train rows first) into one CSR matrix
date_data = vstack([date_train['data']['features'],date_test['data']['features']], format='csr')
# Ids concatenated in the same train-then-test order as the stacked rows
ids = pd.concat([date_train['data']['ids'], date_test['data']['ids']])
# The response is taken from the train file only
y = date_train['data']['y']
# Release the loaded containers now that the needed pieces are extracted
del date_train, date_test

In [16]:
# Load response
# y is already available from the date file loaded above; the CSV route is kept for reference.
#y = func.read_last_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0] + '.csv'))
# Parenthesised print: identical output on Python 2, valid on Python 3
print(y.shape)
y.head(3)


(1183747, 1)
Out[16]:
Response
Id
4 0
6 0
7 0

In [17]:
# Load IDs of train + test
# ids is already available from the date files loaded above; the CSV route is kept for reference.
#ids = pd.concat([func.read_first_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0])),
#                 func.read_first_column(os.path.join(const.BASE_PATH, const.TEST_FILES[0]))],
#                axis=0)
# Parenthesised print: identical output on Python 2, valid on Python 3
print(ids.shape)
ids.head(3)


(2367495, 1)
Out[17]:
Id
0 4
1 6
2 7

In [427]:
# Add response to cluster info
# The assignment aligns y on the index of cluster_info; index values missing from y
# get NaN (presumably the test samples - the null counts further down show
# 1183748 rows without R).
cluster_info['R'] = y.Response

# Add sample time to cluster info
def max_element_row(X):
    '''Return the maximum value of each row of sparse matrix X as a 1-D array.

    nan values are assumed to be encoded as zero, i.e. they are simply not
    stored. Implicit zeros take part in the maximum, so a row without any
    stored entry yields 0.

    X: scipy sparse matrix (csr in this notebook), shape (n_rows, n_cols)
    returns: numpy array of length n_rows

    The former ``output[output==0] = 0`` statement assigned zero to entries
    that were already zero and has been dropped.
    '''
    # max(axis=1) gives an (n_rows, 1) sparse column; densify and flatten it
    return X.max(axis=1).toarray().ravel()

# Per-sample maximum of the date matrix, multiplied by 5 and truncated to int;
# the resulting integer is used as a group key further down.
# NOTE(review): the meaning of the factor 5 is not visible here - confirm.
# The array is assigned positionally: assumes cluster_info rows are in the same
# order as the rows of date_data - TODO confirm.
cluster_info['tmax'] = (max_element_row(date_data)*5).astype(int)

Calculate features based on cluster 500


In [592]:
# Response statistics per cluster_n500 value (NaN responses are ignored by the aggregations)
cluster_mean = cluster_info.groupby(['cluster_n500'])['R'].agg(['mean','count','sum'])
# Attach the statistics of its cluster to every sample (left join keeps all samples)
cluster_n500 = cluster_info.merge(cluster_mean, left_on='cluster_n500', right_index=True, how='left')

In [605]:
def loo_mean(mean, count, sample_val, train_mean=0.0058):
    """Leave-one-out mean: the group mean recomputed without the sample itself.

    Computes ``(mean * count - sample_val) / (count - 1)`` element-wise.

    mean:       group mean of the target, one entry per sample
    count:      number of target values behind ``mean``, one entry per sample
    sample_val: the sample's own target value
    train_mean: value used where the group holds at most one sample, so that
                no other sample is left to average. Defaults to 0.0058, the
                constant that used to be hard-coded here.

    Returns an object of the same kind as the inputs (a pandas Series in this
    notebook). Where sample_val is NaN and count > 1 the result is NaN.

    Note: ``mean * count`` only reconstructs the group sum up to floating-point
    round-off, so a result that should be exactly 0 can come out as a tiny
    non-zero number.
    """
    sum_without_sample = mean * count - sample_val
    n_without_sample = count - 1

    # For count == 1 this divides by zero; those entries are overwritten below
    output = sum_without_sample / n_without_sample

    # No other sample in the group: fall back to the all-sample mean
    output[count <= 1] = train_mean

    return output

def adjust_low_count_mean(count, mean, cutoff=1000, train_mean=0.0058):
    """Shrink the mean of small groups towards the overall train mean.

    The shrunk value is the weighted average
    ``(count * mean + (cutoff - count) * train_mean) / cutoff``.
    Entries with ``count > cutoff - 10 * count * mean`` are not shrunk and
    keep ``mean`` unchanged.

    count:      number of samples in the group, one entry per sample
    mean:       group mean to adjust, one entry per sample
    cutoff:     group size at which the prior gets zero weight (default 1000,
                the value that used to be hard-coded)
    train_mean: overall train mean used as prior (default 0.0058, the value
                that used to be hard-coded)

    Returns a new object of the same kind as the inputs; ``mean`` is not modified.

    The previous implementation also drew one random number per row into an
    unused local. That draw is removed, so the function no longer advances
    the global ``random`` state.
    """
    # Weighted average of the group mean and the prior
    output = (count * mean + (cutoff - count) * train_mean) / cutoff

    # Large groups, or groups with a large count * mean, keep their own mean
    keep_own_mean = count > (cutoff - 10 * count * mean)
    output[keep_own_mean] = mean[keep_own_mean]

    return output

def cut_off_loo_mean(r1_count, mean):
    """Replace the mean by the overall train mean where r1_count is below 15.

    r1_count: per-sample count compared against the threshold (the callers
              pass the group's response sum)
    mean:     values to keep where r1_count is at least 15

    Returns a modified copy; ``mean`` itself is left untouched. Entries whose
    r1_count is NaN compare False against the threshold and keep their value.
    """
    train_mean = 0.0058
    below_threshold = r1_count < 15

    result = mean.copy()
    result[below_threshold] = train_mean
    return result

def taper_mean_bin_prob(mean, bin_prob):
    """Blend the mean with the overall train mean, using bin_prob as weight.

    bin_prob = 0 returns ``mean`` unchanged, bin_prob = 1 returns the train
    mean (0.0058); values in between interpolate linearly.
    """
    train_mean = 0.0058

    prior_part = bin_prob * train_mean
    observed_part = (1 - bin_prob) * mean
    return prior_part + observed_part

def random_loo_mean(mean, count, sample_val):
    """Leave-one-out mean with multiplicative noise for rows that have a response.

    mean:       group mean of the response, one entry per sample
    count:      number of responses behind ``mean``
    sample_val: the sample's own response; NaN marks a sample without response

    Returns a Series indexed like ``mean``:
      * rows with a response: loo_mean(...) times a random factor in [0.9, 1.1)
      * rows without a response: the plain group mean, no noise
      * rows whose group mean is NaN: the overall train mean (0.0058)
    """
    train_mean = 0.0058

    # One noise factor per row, drawn from the global `random` generator
    noise = pd.Series([1 + random.random()/5 - 0.1 for _ in mean], index=mean.index)

    # Train samples get the out-of-sample (leave-one-out) mean
    result = noise * loo_mean(mean, count, sample_val)

    # Test samples get the in-sample mean
    no_response = sample_val.isnull()
    result[no_response] = mean[no_response]

    # Group value never seen with a response: use the all-train-sample mean
    result[mean.isnull()] = train_mean

    return result


def bin_prob(n, k, p):
    """Binomial probability of exactly k successes in n trials with success rate p.

    Evaluates ``comb(n, k) * p**k * (1 - p)**(n - k)`` directly.
    """
    # The visible cells never bind the bare name `scipy`, and scipy.misc.comb
    # is no longer shipped by current SciPy; scipy.special.comb is the
    # maintained function.
    from scipy.special import comb

    return comb(n, k) * (p**k) * ((1 - p)**(n - k))

from scipy import special, exp, log
lgam = special.gammaln

def binomial2(n, k, p):
    """Binomial probability of k successes in n trials, evaluated in log space.

    The binomial coefficient is expressed through log-gamma terms and only
    the final sum is exponentiated. n and k may be scalars or arrays/Series
    (element-wise); a NaN in n or k gives NaN.

    Uses numpy's exp/log and scipy.special.gammaln directly instead of the
    names re-exported from the scipy root namespace, so the function does not
    depend on module-level names defined elsewhere in the notebook.
    """
    from scipy.special import gammaln

    log_coefficient = gammaln(n + 1) - gammaln(n - k + 1) - gammaln(k + 1)
    return np.exp(log_coefficient + k * np.log(p) + (n - k) * np.log(1. - p))

In [ ]:
# Binomial probability of the cluster's response sum given its count, at rate 0.0058.
# binomial2 only uses element-wise functions, so it is called once on the whole
# columns instead of row by row through DataFrame.apply.
cluster_n500['bin_prob'] = binomial2(cluster_n500['count'], cluster_n500['sum'], 0.0058)

In [606]:
# Noisy leave-one-out cluster mean for rows with a response, plain cluster mean otherwise
cluster_n500['loo_mean'] = random_loo_mean(cluster_n500['mean'],
                                                  cluster_n500['count'],
                                                  cluster_n500['R'])

# LOO mean shrunk towards the overall train mean for clusters with a low count
cluster_n500['loo_mean_tapered'] = adjust_low_count_mean(cluster_n500['count'],
                                                  cluster_n500['loo_mean'])

# LOO mean replaced by the overall train mean where the cluster's response sum is below 15
cluster_n500['loo_mean_cutoff'] = cut_off_loo_mean(cluster_n500['sum'],
                                                  cluster_n500['loo_mean'])

# LOO mean blended with the overall train mean, weighted by the 'bin_prob' column
# (that column must have been computed before this cell runs)
cluster_n500['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_n500['loo_mean'],
                                                  cluster_n500['bin_prob'])

In [595]:
cluster_n500.isnull().sum()


Out[595]:
unique_path               0
cluster_n8                0
cluster_n15               0
cluster_n25               0
cluster_n50               0
cluster_n150              0
cluster_n500              0
R                   1183748
tmax                      0
mean                      0
count                     0
sum                       0
loo_mean                  0
loo_mean_tapered          0
loo_mean_cutoff           0
dtype: int64

In [611]:
# Write the four cluster_n500 features to CSV, one row per sample.
# The index column is written under the header 'ID'.
cluster_n500[['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_n500_loo.csv'), 
                                         index_label='ID')

In [607]:
cluster_n500.sort_values('loo_mean', ascending=False)


Out[607]:
unique_path cluster_n8 cluster_n15 cluster_n25 cluster_n50 cluster_n150 cluster_n500 R tmax mean count sum loo_mean loo_mean_tapered loo_mean_cutoff bin_prob loo_mean_prob_bin
Id
434784 405 0 9 20 4 12 234 0.0 5114 0.384615 26 10.0 0.436645 0.017002 0.005800 2.084954e-16 0.436645
578446 771 0 0 10 25 11 459 0.0 3693 0.375000 16 6.0 0.431751 0.012615 0.005800 2.876270e-10 0.431751
1620745 407 0 9 20 4 12 234 0.0 5034 0.384615 26 10.0 0.428182 0.016782 0.005800 2.084954e-16 0.428182
653587 650 0 0 10 25 11 459 0.0 3693 0.375000 16 6.0 0.424711 0.012503 0.005800 2.876270e-10 0.424711
436625 700 0 0 10 25 51 459 0.0 3693 0.375000 16 6.0 0.422635 0.012469 0.005800 2.876270e-10 0.422635
1525812 866 0 0 10 25 51 459 0.0 5143 0.375000 16 6.0 0.416566 0.012372 0.005800 2.876270e-10 0.416566
1680715 933 0 9 20 4 12 234 0.0 5202 0.384615 26 10.0 0.414360 0.016423 0.005800 2.084954e-16 0.414360
290451 398 0 9 20 4 12 234 0.0 5031 0.384615 26 10.0 0.410902 0.016333 0.005800 2.084954e-16 0.410902
1912444 404 0 9 20 4 12 234 0.0 5121 0.384615 26 10.0 0.409994 0.016309 0.005800 2.084954e-16 0.409994
669024 404 0 9 20 4 12 234 0.0 5226 0.384615 26 10.0 0.399589 0.016039 0.005800 2.084954e-16 0.399589
375437 663 0 0 10 25 11 459 0.0 5877 0.375000 16 6.0 0.396234 0.012047 0.005800 2.876270e-10 0.396234
841735 2923 4 7 13 14 5 279 0.0 3806 0.356688 157 56.0 0.394859 0.066882 0.394859 5.494214e-83 0.394859
1225536 2921 4 7 13 14 64 279 0.0 3812 0.356688 157 56.0 0.392828 0.066563 0.392828 5.494214e-83 0.392828
1630236 404 0 9 20 4 12 234 1.0 5143 0.384615 26 10.0 0.392537 0.015855 0.005800 2.084954e-16 0.392537
1581577 2885 4 7 13 14 5 279 0.0 3970 0.356688 157 56.0 0.392479 0.066509 0.392479 5.494214e-83 0.392479
840241 2921 4 7 13 14 64 279 0.0 3695 0.356688 157 56.0 0.392415 0.066499 0.392415 5.494214e-83 0.392415
1686399 2922 4 7 13 14 64 279 0.0 3862 0.356688 157 56.0 0.390499 0.066198 0.390499 5.494214e-83 0.390499
1291622 2844 4 7 13 14 5 279 0.0 2533 0.356688 157 56.0 0.389624 0.066060 0.389624 5.494214e-83 0.389624
1107921 2771 4 7 13 14 5 279 0.0 3789 0.356688 157 56.0 0.387737 0.065764 0.387737 5.494214e-83 0.387737
2205450 3083 4 7 13 14 5 279 0.0 3811 0.356688 157 56.0 0.387248 0.065687 0.387248 5.494214e-83 0.387248
1451077 3081 4 7 13 14 64 279 1.0 3695 0.356688 157 56.0 0.386882 0.065630 0.386882 5.494214e-83 0.386882
1093018 3083 4 7 13 14 5 279 1.0 3812 0.356688 157 56.0 0.386551 0.065578 0.386551 5.494214e-83 0.386551
255091 2922 4 7 13 14 64 279 0.0 3854 0.356688 157 56.0 0.386187 0.065521 0.386187 5.494214e-83 0.386187
1607980 2921 4 7 13 14 64 279 0.0 3809 0.356688 157 56.0 0.385965 0.065486 0.385965 5.494214e-83 0.385965
1769234 3082 4 7 13 14 64 279 0.0 3862 0.356688 157 56.0 0.384745 0.065294 0.384745 5.494214e-83 0.384745
339285 123 0 9 20 4 12 234 NaN 2348 0.384615 26 10.0 0.384615 0.015649 0.005800 2.084954e-16 0.384615
1594602 434 0 9 20 4 12 234 NaN 3700 0.384615 26 10.0 0.384615 0.015649 0.005800 2.084954e-16 0.384615
640289 434 0 9 20 4 12 234 NaN 5023 0.384615 26 10.0 0.384615 0.015649 0.005800 2.084954e-16 0.384615
1042518 437 0 9 20 4 12 234 NaN 5019 0.384615 26 10.0 0.384615 0.015649 0.005800 2.084954e-16 0.384615
519428 1101 0 9 20 4 12 234 NaN 5052 0.384615 26 10.0 0.384615 0.015649 0.005800 2.084954e-16 0.384615
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1485907 1867 7 6 14 37 72 355 0.0 5281 0.000000 26 0.0 0.000000 0.005649 0.005800 8.596422e-01 0.004986
729985 134 0 9 20 4 139 168 0.0 2347 0.000000 188 0.0 0.000000 0.004710 0.005800 3.350169e-01 0.001943
730002 9002 3 5 15 30 23 452 0.0 2758 0.000000 738 0.0 0.000000 0.001520 0.005800 1.366577e-02 0.000079
418630 8823 6 13 22 36 111 171 NaN 8290 0.000000 90 0.0 0.000000 0.005278 0.005800 5.924317e-01 0.003436
673415 2388 3 14 2 13 20 340 NaN 3708 0.000000 410 0.0 0.000000 0.003422 0.005800 9.209607e-02 0.000534
673378 3731 2 12 24 16 96 377 NaN 7454 0.000000 242 0.0 0.000000 0.004396 0.005800 2.447088e-01 0.001419
1953379 10075 2 8 16 44 73 48 0.0 7235 0.000000 20 0.0 0.000000 0.005684 0.005800 8.901746e-01 0.005163
150705 14724 2 8 16 41 116 399 0.0 1003 0.000000 23 0.0 0.000000 0.005667 0.005800 8.747752e-01 0.005074
490161 14997 2 8 16 15 141 341 0.0 1003 0.000000 19 0.0 0.000000 0.005690 0.005800 8.953677e-01 0.005193
1486124 9002 3 5 15 30 23 452 0.0 6896 0.000000 738 0.0 0.000000 0.001520 0.005800 1.366577e-02 0.000079
218371 6712 5 11 9 17 84 283 NaN 6999 0.000000 18 0.0 0.000000 0.005696 0.005800 9.005911e-01 0.005223
1953399 4207 2 12 24 16 107 474 0.0 617 0.000000 547 0.0 0.000000 0.002627 0.005800 4.150936e-02 0.000241
1953404 1885 7 0 10 24 53 342 0.0 5157 0.000000 34 0.0 0.000000 0.005603 0.005800 8.205552e-01 0.004759
1485923 3998 5 11 9 17 125 35 0.0 7000 0.000000 14 0.0 0.000000 0.005719 0.005800 9.217913e-01 0.005346
1078751 12939 5 11 17 5 29 215 NaN 7000 0.000000 24 0.0 0.000000 0.005661 0.005800 8.697015e-01 0.005044
151088 14921 6 13 22 8 121 281 0.0 8295 0.000000 64 0.0 0.000000 0.005429 0.005800 6.891608e-01 0.003997
1485877 8908 2 8 16 44 40 285 0.0 7235 0.000000 22 0.0 0.000000 0.005672 0.005800 8.798785e-01 0.005103
490011 2208 3 14 2 13 20 389 0.0 2461 0.000000 280 0.0 0.000000 0.004176 0.005800 1.961790e-01 0.001138
218422 13788 6 13 22 8 121 396 NaN 7480 0.000000 37 0.0 0.000000 0.005585 0.005800 8.063602e-01 0.004677
1733918 2209 3 14 2 13 20 340 NaN 2461 0.000000 410 0.0 0.000000 0.003422 0.005800 9.209607e-02 0.000534
1078916 13768 1 0 10 25 71 398 NaN 1998 0.000000 17 0.0 0.000000 0.005701 0.005800 9.058450e-01 0.005254
150935 8249 7 3 18 18 83 71 0.0 2858 0.000000 30 0.0 0.000000 0.005626 0.005800 8.398713e-01 0.004871
1485627 9996 6 13 22 36 3 39 0.0 5708 0.000000 42 0.0 0.000000 0.005556 0.005800 7.832454e-01 0.004543
1485506 1270 0 9 6 11 124 19 0.0 5884 0.000000 34 0.0 0.000000 0.005603 0.005800 8.205552e-01 0.004759
730417 2988 3 14 2 38 20 268 0.0 2355 0.000000 6 0.0 0.000000 0.005765 0.005800 9.657007e-01 0.005601
489930 536 0 6 14 45 76 490 0.0 5122 0.000000 288 0.0 0.000000 0.004130 0.005800 1.872589e-01 0.001086
1079124 10707 3 5 15 1 23 452 NaN 4193 0.000000 738 0.0 0.000000 0.001520 0.005800 1.366577e-02 0.000079
730421 11159 3 5 8 39 82 243 0.0 6997 0.000000 12 0.0 0.000000 0.005730 0.005800 9.325779e-01 0.005409
1079175 12661 6 13 22 36 47 123 NaN 6055 0.000000 64 0.0 0.000000 0.005429 0.005800 6.891608e-01 0.003997
787089 9119 1 0 10 25 71 248 NaN 1998 0.000000 17 0.0 0.000000 0.005701 0.005800 9.058450e-01 0.005254

2367495 rows × 17 columns


In [609]:
cluster_n500.sample(20)


Out[609]:
unique_path cluster_n8 cluster_n15 cluster_n25 cluster_n50 cluster_n150 cluster_n500 R tmax mean count sum loo_mean loo_mean_tapered loo_mean_cutoff bin_prob loo_mean_prob_bin
Id
756634 4038 2 12 5 16 55 81 0.0 5619 0.005950 9411 56.0 0.005896 0.005896 0.005896 0.052421 0.005891
1252852 13051 1 2 19 3 4 429 NaN 5558 0.004620 25974 120.0 0.004620 0.004620 0.004620 0.001252 0.004621
966057 1485 7 3 18 31 9 266 NaN 4979 0.003510 10827 38.0 0.003510 0.003510 0.003510 0.000209 0.003510
1642451 2583 6 4 4 9 58 410 0.0 3872 0.008061 22825 184.0 0.008662 0.008662 0.008662 0.000004 0.008662
1111737 10957 3 5 15 1 44 199 0.0 4717 0.002106 1899 4.0 0.002100 0.002100 0.005800 0.009977 0.002137
2216066 13747 1 2 3 47 8 63 0.0 4401 0.005264 12727 67.0 0.005133 0.005133 0.005133 0.035203 0.005157
1641067 13387 1 2 3 47 36 369 NaN 3044 0.004285 12602 54.0 0.004285 0.004285 0.004285 0.003442 0.004290
920530 10745 1 2 12 49 19 139 NaN 859 0.004174 13177 55.0 0.004174 0.004174 0.004174 0.001890 0.004177
850754 5494 2 12 24 21 98 30 0.0 4931 0.005973 9375 56.0 0.006472 0.006472 0.006472 0.052116 0.006437
1792954 2580 6 4 4 9 58 97 NaN 5444 0.007751 32642 253.0 0.007751 0.007751 0.007751 0.000001 0.007751
999584 9020 1 2 19 48 22 28 NaN 7304 0.005012 11771 59.0 0.005012 0.005012 0.005012 0.026770 0.005033
642997 13385 1 2 3 47 36 369 NaN 1094 0.004285 12602 54.0 0.004285 0.004285 0.004285 0.003442 0.004290
1530015 2412 6 4 4 43 99 231 NaN 5986 0.006969 25687 179.0 0.006969 0.006969 0.006969 0.001719 0.006966
2237506 4509 2 12 24 16 107 291 0.0 6334 0.005089 4913 25.0 0.004734 0.004734 0.004734 0.063664 0.004802
495154 9017 1 2 19 48 61 144 0.0 1615 0.005170 12379 64.0 0.005514 0.005514 0.005514 0.032088 0.005523
1477151 5776 2 12 24 21 16 30 0.0 1258 0.005973 9375 56.0 0.006292 0.006292 0.006292 0.052116 0.006266
2284423 9270 1 2 19 48 61 122 0.0 6450 0.004721 12921 61.0 0.004424 0.004424 0.004424 0.012694 0.004441
1089204 3496 5 11 9 34 10 143 NaN 5309 0.004211 9023 38.0 0.004211 0.004211 0.004211 0.007293 0.004223
509362 10115 1 2 12 47 49 406 NaN 8295 0.003307 11188 37.0 0.003307 0.003307 0.003307 0.000052 0.003307
1678350 9868 1 2 12 47 36 496 NaN 1865 0.003933 12967 51.0 0.003933 0.003933 0.003933 0.000672 0.003934

In [582]:
cluster_n500.groupby('R')['loo_mean'].mean()


Out[582]:
R
0.0    0.005316
1.0    0.015070
Name: loo_mean, dtype: float64

Calculate features based on unique path


In [612]:
# Response statistics per unique_path value (NaN responses are ignored by the aggregations)
cluster_mean = cluster_info.groupby(['unique_path'])['R'].agg(['mean','count','sum'])
# Attach the statistics of its path to every sample (left join keeps all samples)
cluster_upath = cluster_info.merge(cluster_mean, left_on='unique_path', right_index=True, how='left')

In [614]:
# Binomial probability of the path's response sum given its count, at rate 0.0058.
# binomial2 only uses element-wise functions, so it is called once on the whole
# columns instead of row by row through DataFrame.apply. Rows whose 'sum' is NaN stay NaN.
cluster_upath['bin_prob'] = binomial2(cluster_upath['count'], cluster_upath['sum'], 0.0058)

In [615]:
# True for rows that have a response; not referenced again in the cells shown here
is_train = ~cluster_upath['R'].isnull()
# Noisy leave-one-out path mean for rows with a response, plain path mean otherwise
cluster_upath['loo_mean'] = random_loo_mean(cluster_upath['mean'],
                                                  cluster_upath['count'],
                                                  cluster_upath['R'])

# LOO mean shrunk towards the overall train mean for paths with a low count
cluster_upath['loo_mean_tapered'] = adjust_low_count_mean(cluster_upath['count'],
                                                  cluster_upath['loo_mean'])

# LOO mean replaced by the overall train mean where the path's response sum is below 15
cluster_upath['loo_mean_cutoff'] = cut_off_loo_mean(cluster_upath['sum'],
                                                  cluster_upath['loo_mean'])

# LOO mean blended with the overall train mean, weighted by the 'bin_prob' column;
# a NaN bin_prob gives a NaN result here
cluster_upath['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_upath['loo_mean'],
                                                  cluster_upath['bin_prob'])

In [616]:
cluster_upath.isnull().sum()


Out[616]:
unique_path                0
cluster_n8                 0
cluster_n15                0
cluster_n25                0
cluster_n50                0
cluster_n150               0
cluster_n500               0
R                    1183748
tmax                       0
mean                    4984
count                      0
sum                     4984
loo_mean                   0
loo_mean_tapered           0
loo_mean_cutoff            0
bin_prob                4984
loo_mean_prob_bin       4984
dtype: int64

In [620]:
cluster_upath.sort_values('loo_mean', ascending=False).head(20)


Out[620]:
unique_path cluster_n8 cluster_n15 cluster_n25 cluster_n50 cluster_n150 cluster_n500 R tmax mean count sum loo_mean loo_mean_tapered loo_mean_cutoff bin_prob loo_mean_prob_bin
Id
1402461 12614 1 2 19 3 91 417 0.0 4395 0.500000 2 1.0 1.098499 0.007985 0.0058 1.153272e-02 1.085897
1003981 14802 1 2 1 3 4 0 0.0 3435 0.500000 2 1.0 1.094450 0.007977 0.0058 1.153272e-02 1.081895
1717223 13113 1 2 19 0 91 296 0.0 6531 0.666667 3 2.0 1.093473 0.009063 0.0058 1.003347e-04 1.093364
1912156 7078 2 10 0 6 93 373 0.0 4698 0.500000 2 1.0 1.089647 0.007968 0.0058 1.153272e-02 1.077148
435640 1101 0 9 20 4 12 234 1.0 5052 1.000000 3 3.0 1.085780 0.009040 0.0058 1.951120e-07 1.085780
291416 12158 3 5 15 22 136 269 0.0 2840 0.500000 2 1.0 1.083662 0.007956 0.0058 1.153272e-02 1.071231
2145106 294 0 9 20 4 12 15 1.0 5886 1.000000 2 2.0 1.083298 0.007955 0.0058 3.364000e-05 1.083261
2120566 11099 7 3 18 18 122 25 0.0 6054 0.500000 2 1.0 1.083061 0.007955 0.0058 1.153272e-02 1.070637
159085 1746 7 3 18 31 9 221 0.0 3715 0.500000 2 1.0 1.080949 0.007950 0.0058 1.153272e-02 1.068550
435639 1101 0 9 20 4 12 234 1.0 5052 1.000000 3 3.0 1.080126 0.009023 0.0058 1.951120e-07 1.080126
550071 4221 3 1 21 32 102 161 1.0 7042 1.000000 2 2.0 1.080073 0.007949 0.0058 3.364000e-05 1.080037
384319 12602 1 2 19 3 91 417 0.0 1842 0.666667 3 2.0 1.074458 0.009006 0.0058 1.003347e-04 1.074351
1689187 1073 0 0 10 25 51 106 1.0 6046 1.000000 2 2.0 1.071058 0.007931 0.0058 3.364000e-05 1.071022
991960 7938 1 2 19 48 91 129 1.0 7376 1.000000 2 2.0 1.070149 0.007929 0.0058 3.364000e-05 1.070113
1857410 4937 2 10 5 10 55 445 0.0 6359 0.500000 2 1.0 1.067127 0.007923 0.0058 1.153272e-02 1.054887
330848 4390 2 12 5 16 149 202 0.0 6616 0.500000 2 1.0 1.065797 0.007920 0.0058 1.153272e-02 1.053572
1689886 11915 1 2 3 12 75 119 0.0 1407 0.500000 2 1.0 1.059528 0.007907 0.0058 1.153272e-02 1.047376
1784281 12005 1 2 3 12 75 401 1.0 4395 1.000000 2 2.0 1.059220 0.007907 0.0058 3.364000e-05 1.059184
2027663 11059 1 2 1 49 138 415 0.0 3556 0.500000 2 1.0 1.056117 0.007901 0.0058 1.153272e-02 1.044004
1313111 2500 6 4 4 9 58 491 0.0 3687 0.500000 2 1.0 1.050627 0.007890 0.0058 1.153272e-02 1.038577

In [629]:
cluster_upath.head(20)


Out[629]:
unique_path cluster_n8 cluster_n15 cluster_n25 cluster_n50 cluster_n150 cluster_n500 R tmax mean count sum loo_mean loo_mean_tapered loo_mean_cutoff bin_prob loo_mean_prob_bin
Id
4 13409 1 2 3 47 36 369 0.0 436 0.002041 490 1.0 0.002042 0.003958 0.005800 0.165307 0.002663
6 7029 2 10 0 35 63 14 0.0 6578 0.003097 3552 11.0 0.003035 0.003035 0.005800 0.007936 0.003057
7 12763 1 2 19 3 132 477 0.0 8122 0.004139 10147 42.0 0.004474 0.004474 0.004474 0.004149 0.004479
9 13658 1 2 3 19 8 232 0.0 5770 0.003854 10380 40.0 0.003703 0.003703 0.003703 0.001316 0.003705
11 9865 1 2 12 27 74 165 0.0 3030 0.005020 10159 51.0 0.005224 0.005224 0.005224 0.031896 0.005242
13 10112 1 2 12 27 134 3 0.0 6698 0.005476 10043 55.0 0.005307 0.005307 0.005307 0.049082 0.005331
14 3893 2 12 5 16 55 423 0.0 8320 0.004569 3940 18.0 0.004515 0.004515 0.004515 0.053640 0.004584
16 2415 6 4 4 43 14 231 0.0 4021 0.007089 12272 87.0 0.007029 0.007029 0.007029 0.008224 0.007019
18 13658 1 2 3 19 8 232 0.0 2590 0.003854 10380 40.0 0.003928 0.003928 0.003928 0.001316 0.003930
23 6206 2 12 5 6 135 151 0.0 789 0.004264 3752 16.0 0.004202 0.004202 0.004202 0.042706 0.004270
26 13658 1 2 3 19 8 232 0.0 5529 0.003854 10380 40.0 0.003785 0.003785 0.003785 0.001316 0.003788
27 9538 1 2 19 48 112 331 0.0 2007 0.003067 10432 32.0 0.003357 0.003357 0.003357 0.000020 0.003357
28 10401 1 2 12 47 49 455 0.0 310 0.004321 9951 43.0 0.004142 0.004142 0.004142 0.007675 0.004155
31 14813 1 2 1 3 4 52 0.0 498 0.000000 473 0.0 0.000000 0.003057 0.005800 0.063839 0.000370
34 13048 1 2 19 0 131 349 0.0 1826 0.004556 10536 48.0 0.004717 0.004717 0.004717 0.012511 0.004730
38 13942 1 2 3 19 8 169 0.0 8181 0.004497 10229 46.0 0.004590 0.004590 0.004590 0.011501 0.004604
41 5494 2 12 24 21 98 30 0.0 2382 0.004969 3824 19.0 0.004911 0.004911 0.004911 0.071792 0.004975
44 11913 1 2 3 12 75 119 0.0 7710 0.005416 9970 54.0 0.005822 0.005822 0.005822 0.047718 0.005821
47 5036 2 10 0 35 62 190 0.0 1317 0.004074 3682 15.0 0.004464 0.004464 0.004464 0.035485 0.004512
49 10982 1 2 12 49 147 435 0.0 3388 0.003125 10241 32.0 0.002944 0.002944 0.002944 0.000034 0.002944

In [621]:
# Write the four unique_path features to CSV, one row per sample.
# The index column is written under the header 'ID'.
cluster_upath[['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_upath_loo.csv'), 
                                         index_label='ID')

Calculate features based on max date (tmax)


In [492]:
# Response statistics per tmax value (NaN responses are ignored by the aggregations)
cluster_mean = cluster_info.groupby(['tmax'])['R'].agg(['mean','count','sum'])
# Attach the statistics of its tmax group to every sample (left join keeps all samples)
cluster_tmax = cluster_info.merge(cluster_mean, left_on='tmax', right_index=True, how='left')

In [493]:
#cluster_tmax['adj_mean'] = adjust_mean(cluster_tmax['count'], cluster_tmax['mean'])

In [622]:
# Binomial probability of the tmax group's response sum given its count, at rate 0.0058.
# binomial2 only uses element-wise functions, so it is called once on the whole
# columns instead of row by row through DataFrame.apply.
cluster_tmax['bin_prob'] = binomial2(cluster_tmax['count'], cluster_tmax['sum'], 0.0058)

In [623]:
# Noisy leave-one-out group mean for rows with a response, plain group mean otherwise
cluster_tmax['loo_mean'] = random_loo_mean(cluster_tmax['mean'],
                                                  cluster_tmax['count'],
                                                  cluster_tmax['R'])

# LOO mean shrunk towards the overall train mean for groups with a low count
cluster_tmax['loo_mean_tapered'] = adjust_low_count_mean(cluster_tmax['count'],
                                                  cluster_tmax['loo_mean'])

# LOO mean replaced by the overall train mean where the group's response sum is below 15
cluster_tmax['loo_mean_cutoff'] = cut_off_loo_mean(cluster_tmax['sum'],
                                                  cluster_tmax['loo_mean'])

# LOO mean blended with the overall train mean, weighted by the 'bin_prob' column
cluster_tmax['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_tmax['loo_mean'],
                                                  cluster_tmax['bin_prob'])

In [624]:
cluster_tmax.sort_values('loo_mean', ascending=False)


Out[624]:
unique_path cluster_n8 cluster_n15 cluster_n25 cluster_n50 cluster_n150 cluster_n500 R tmax mean count sum loo_mean bin_prob loo_mean_tapered loo_mean_cutoff loo_mean_prob_bin
Id
195987 14271 1 2 1 19 59 216 1.0 4188 1.000000 2 2.0 1.001012e+00 0.000034 0.007790 0.0058 1.000979
38011 9545 1 2 19 48 112 135 1.0 4188 1.000000 2 2.0 9.402211e-01 0.000034 0.007669 0.0058 0.940190
453838 14264 1 2 1 19 59 363 0.0 1570 0.250000 4 1.0 3.408419e-01 0.022799 0.007140 0.0058 0.333203
1096292 12520 1 2 19 0 59 170 0.0 7498 0.272727 11 3.0 3.263494e-01 0.000031 0.009326 0.0058 0.326340
1253487 8416 1 2 3 27 104 80 0.0 8244 0.272727 11 3.0 3.224451e-01 0.000031 0.009283 0.0058 0.322435
909291 10399 1 2 12 27 74 294 0.0 7498 0.272727 11 3.0 3.204419e-01 0.000031 0.009261 0.0058 0.320432
1096291 12520 1 2 19 0 59 170 0.0 7498 0.272727 11 3.0 3.186799e-01 0.000031 0.009242 0.0058 0.318670
529721 14788 1 2 1 19 4 286 0.0 7498 0.272727 11 3.0 3.180068e-01 0.000031 0.009234 0.0058 0.317997
1188093 8138 1 2 3 27 123 354 0.0 8244 0.272727 11 3.0 3.160392e-01 0.000031 0.009213 0.0058 0.316030
538140 9866 1 2 12 27 74 165 0.0 1570 0.250000 4 1.0 3.154071e-01 0.022799 0.007038 0.0058 0.308349
1657388 14869 1 2 1 3 4 235 0.0 7498 0.272727 11 3.0 3.147206e-01 0.000031 0.009198 0.0058 0.314711
2265160 3891 2 12 24 10 15 466 0.0 1913 0.266667 15 4.0 3.134736e-01 0.000001 0.010415 0.0058 0.313473
1901565 6204 2 12 24 6 31 421 0.0 7498 0.272727 11 3.0 3.120910e-01 0.000031 0.009169 0.0058 0.312082
743217 14512 1 2 1 19 147 76 0.0 1570 0.250000 4 1.0 3.102650e-01 0.022799 0.007018 0.0058 0.303324
901395 6465 2 10 24 21 96 146 0.0 7498 0.272727 11 3.0 3.093432e-01 0.000031 0.009139 0.0058 0.309334
1148466 14350 1 2 1 3 126 482 0.0 1913 0.266667 15 4.0 3.019557e-01 0.000001 0.010242 0.0058 0.301955
412065 11910 1 2 3 0 39 284 0.0 1913 0.266667 15 4.0 3.000050e-01 0.000001 0.010213 0.0058 0.300005
1351072 10116 1 2 12 47 49 406 0.0 8244 0.272727 11 3.0 2.974407e-01 0.000031 0.009008 0.0058 0.297432
1188094 8138 1 2 3 27 123 354 0.0 8244 0.272727 11 3.0 2.953896e-01 0.000031 0.008985 0.0058 0.295381
1095278 11270 1 2 1 49 101 289 0.0 8244 0.272727 11 3.0 2.934070e-01 0.000031 0.008964 0.0058 0.293398
2083199 6753 2 10 0 35 80 368 0.0 1913 0.266667 15 4.0 2.895584e-01 0.000001 0.010056 0.0058 0.289558
2138160 4331 2 12 24 16 15 56 0.0 1913 0.266667 15 4.0 2.879552e-01 0.000001 0.010032 0.0058 0.287955
1736361 10983 1 2 12 49 147 435 0.0 8244 0.272727 11 3.0 2.865670e-01 0.000031 0.008888 0.0058 0.286558
420688 14252 1 2 1 19 78 180 0.0 7498 0.272727 11 3.0 2.863901e-01 0.000031 0.008886 0.0058 0.286382
2213190 11270 1 2 1 49 101 289 0.0 1913 0.266667 15 4.0 2.738120e-01 0.000001 0.009820 0.0058 0.273812
2225166 8696 1 2 3 12 49 255 0.0 7458 0.200000 5 1.0 2.730458e-01 0.028333 0.007136 0.0058 0.265474
594050 5820 2 12 24 21 16 30 NaN 7498 0.272727 11 3.0 2.727273e-01 0.000031 0.008736 0.0058 0.272719
1856924 2372 6 4 4 43 14 388 NaN 7498 0.272727 11 3.0 2.727273e-01 0.000031 0.008736 0.0058 0.272719
559251 4484 2 12 24 16 107 291 NaN 7498 0.272727 11 3.0 2.727273e-01 0.000031 0.008736 0.0058 0.272719
1468853 12536 1 2 19 0 59 170 NaN 8244 0.272727 11 3.0 2.727273e-01 0.000031 0.008736 0.0058 0.272719
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
876745 3079 4 7 13 14 5 98 1.0 2084 0.002571 389 1.0 -3.090203e-19 0.236154 0.003544 0.0058 0.001370
987828 3896 2 12 24 10 15 110 1.0 6453 0.002571 389 1.0 -3.093728e-19 0.236154 0.003544 0.0058 0.001370
507244 11654 1 2 3 12 75 31 1.0 7505 0.002674 374 1.0 -3.099436e-19 0.247749 0.003631 0.0058 0.001437
503530 13749 1 2 3 19 120 312 1.0 7040 0.002571 389 1.0 -3.141376e-19 0.236154 0.003544 0.0058 0.001370
113992 3744 2 12 5 16 55 299 1.0 270 0.003106 322 1.0 -3.206869e-19 0.288643 0.003932 0.0058 0.001674
2147439 12763 1 2 19 3 132 477 1.0 5613 0.002882 347 1.0 -3.209943e-19 0.268954 0.003787 0.0058 0.001560
62028 14263 1 2 1 19 59 363 1.0 2446 0.003106 322 1.0 -3.374715e-19 0.288643 0.003932 0.0058 0.001674
370468 5929 2 12 5 6 135 343 1.0 2379 0.003106 322 1.0 -3.412502e-19 0.288643 0.003932 0.0058 0.001674
196412 1883 7 0 10 24 53 104 1.0 3702 0.003106 322 1.0 -3.471352e-19 0.288643 0.003932 0.0058 0.001674
718422 7078 2 10 0 6 93 373 1.0 4701 0.002882 347 1.0 -3.495385e-19 0.268954 0.003787 0.0058 0.001560
1811752 2261 6 4 4 43 14 388 1.0 5382 0.003106 322 1.0 -3.499000e-19 0.288643 0.003932 0.0058 0.001674
1140622 6611 2 10 24 10 109 230 1.0 907 0.003106 322 1.0 -3.608497e-19 0.288643 0.003932 0.0058 0.001674
60121 11909 1 2 3 0 39 284 1.0 5481 0.004016 249 1.0 -4.131749e-19 0.341287 0.004356 0.0058 0.001979
250576 5177 2 10 0 35 129 65 1.0 788 0.003953 253 1.0 -4.204765e-19 0.338794 0.004333 0.0058 0.001965
163736 13946 1 2 3 47 97 183 1.0 1868 0.004184 239 1.0 -4.273701e-19 0.347201 0.004414 0.0058 0.002014
2053535 14514 1 2 1 3 126 182 1.0 25 0.004184 239 1.0 -4.393003e-19 0.347201 0.004414 0.0058 0.002014
1210425 14566 1 2 1 3 126 182 1.0 4725 0.004219 237 1.0 -4.435166e-19 0.348324 0.004425 0.0058 0.002020
2340935 14797 1 2 1 3 4 0 1.0 6630 0.004184 239 1.0 -4.502037e-19 0.347201 0.004414 0.0058 0.002014
1604801 1399 7 3 18 31 67 134 1.0 2516 0.004219 237 1.0 -4.639356e-19 0.348324 0.004425 0.0058 0.002020
175878 8418 1 2 3 12 75 141 1.0 744 0.004016 249 1.0 -4.711797e-19 0.341287 0.004356 0.0058 0.001979
773894 13651 1 2 3 19 8 232 1.0 2749 0.004016 249 1.0 -4.838576e-19 0.341287 0.004356 0.0058 0.001979
1209214 14539 1 2 1 3 91 383 1.0 668 0.004016 249 1.0 -4.842042e-19 0.341287 0.004356 0.0058 0.001979
1619808 14797 1 2 1 3 4 0 1.0 6191 0.004184 239 1.0 -5.012462e-19 0.347201 0.004414 0.0058 0.002014
755357 9283 1 2 19 48 91 326 1.0 5576 0.004184 239 1.0 -5.126730e-19 0.347201 0.004414 0.0058 0.002014
1336901 796 0 0 10 25 11 75 1.0 5950 0.004854 206 1.0 -5.373106e-19 0.362590 0.004605 0.0058 0.002103
921558 12522 1 2 19 3 132 256 1.0 1405 0.005102 196 1.0 -6.069290e-19 0.365651 0.004663 0.0058 0.002121
1116294 6890 2 10 0 6 93 373 1.0 923 0.006211 161 1.0 -6.604763e-19 0.368176 0.004866 0.0058 0.002135
1610117 11651 1 2 3 0 39 456 1.0 8126 0.010204 98 1.0 -1.079356e-18 0.323302 0.005232 0.0058 0.001875
2174335 12199 1 2 3 12 97 82 1.0 2663 0.009709 103 1.0 -1.111768e-18 0.330057 0.005203 0.0058 0.001914
717801 13048 1 2 19 0 131 349 1.0 20 0.020408 49 1.0 -2.491280e-18 0.214963 0.005516 0.0058 0.001247

2367495 rows × 17 columns


In [625]:
# Write the four tmax features to CSV, one row per sample.
# The index column is written under the header 'ID'.
cluster_tmax[['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_tmax_loo.csv'), 
                                         index_label='ID')

In [626]:
# Average of each tmax feature per response value.
# Rows with a NaN response do not form a group, so only rows with a response contribute.
cols = ['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']

for col in cols:
    print(col)
    print(cluster_tmax.groupby('R')[col].mean())
    print('')


loo_mean
R
0.0    0.005755
1.0    0.015315
Name: loo_mean, dtype: float64

loo_mean_tapered
R
0.0    0.005775
1.0    0.008709
Name: loo_mean_tapered, dtype: float64

loo_mean_cutoff
R
0.0    0.006053
1.0    0.009864
Name: loo_mean_cutoff, dtype: float64

loo_mean_prob_bin
R
0.0    0.006216
1.0    0.015583
Name: loo_mean_prob_bin, dtype: float64


In [627]:
# Average of each cluster_n500 feature per response value.
# Rows with a NaN response do not form a group, so only rows with a response contribute.
cols = ['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']

for col in cols:
    print(col)
    print(cluster_n500.groupby('R')[col].mean())
    print('')


loo_mean
R
0.0    0.005688
1.0    0.026806
Name: loo_mean, dtype: float64

loo_mean_tapered
R
0.0    0.005204
1.0    0.011705
Name: loo_mean_tapered, dtype: float64

loo_mean_cutoff
R
0.0    0.005627
1.0    0.023568
Name: loo_mean_cutoff, dtype: float64

loo_mean_prob_bin
R
0.0    0.005718
1.0    0.026825
Name: loo_mean_prob_bin, dtype: float64


In [628]:
# Average of each unique_path feature per response value.
# Rows with a NaN response do not form a group, so only rows with a response contribute.
cols = ['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']

for col in cols:
    print(col)
    print(cluster_upath.groupby('R')[col].mean())
    print('')


loo_mean
R
0.0    0.005543
1.0    0.041153
Name: loo_mean, dtype: float64

loo_mean_tapered
R
0.0    0.005071
1.0    0.006438
Name: loo_mean_tapered, dtype: float64

loo_mean_cutoff
R
0.0    0.005349
1.0    0.013677
Name: loo_mean_cutoff, dtype: float64

loo_mean_prob_bin
R
0.0    0.00578
1.0    0.04125
Name: loo_mean_prob_bin, dtype: float64


In [ ]: