Trying to figure out why tks's and my .0232 kernel are so good... and show how R measurements can differ a lot!


In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb
import math

import time

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

import kagglegym

%pylab inline
import pylab as pl
pylab.rcParams['figure.figsize'] = (10, 6)

%matplotlib inline
import matplotlib.pyplot as plt

import os


Populating the interactive namespace from numpy and matplotlib

In [2]:
def rscore(y, y_hat, y_mean = None):
    ymean = y_mean if y_mean else np.mean(y)
    
    num = np.sum((y_hat - y) ** 2.0)
    denom = np.sum((y - ymean) ** 2.0) 

    ra = 1 - (num / denom)
    r = np.sign(ra) * np.sqrt(np.abs(1 - (num / denom)))

    return r

In [3]:
train = pd.read_hdf('../../twosigma/input/train.h5')

In [4]:
subs = {}

for f in os.listdir():
    if f[-4:] != '.pkl':
        continue
    
    name = f[:-4]
    
    print(name)
    
    subs[name] = pd.read_pickle(f)


sub-232017
kernel-sdfoley-results
sub-227738
sub-212725
kernel-tks-results

In [5]:
subs2 = {}

for s in subs.keys():
    ymean = subs[s].y.mean()

    scores = pd.DataFrame(columns=['day', 'r_day', 'r_all'])
    
    for g in subs[s].groupby('timestamp'):
        r_day = rscore(g[1].y, g[1].y_hat)
        r_all = rscore(g[1].y, g[1].y_hat, y_mean = ymean)
        
        scores.loc[len(scores)] = (g[0], r_day, r_all)
        
    subs2[s] = scores.copy()

In [6]:
subs_merge = pd.DataFrame(columns=['day'])

In [7]:
for s in subs2.keys():
    print(s, subs2[s].r_day.mean(), subs2[s].r_all.mean())


sub-232017 -0.13046021980854716 -0.0020613085262296688
kernel-sdfoley-results -0.12922836648276378 0.0013319721446613084
sub-227738 -0.12900475630503958 0.00393841143448099
sub-212725 -0.129474654647052 0.004040313185385112
kernel-tks-results -0.1275840320143353 0.004212891911545318

In [8]:
rename = {'sub-232017': '232017', 'kernel-sdfoley-results': 'sdfoley', 'sub-227738': '227738', 'kernel-tks-results': 'tks', 'sub-212725': '212725'}

In [9]:
subs_merge2 = subs2['kernel-tks-results'].copy()

In [10]:
subs_merge2


Out[10]:
day r_day r_all
0 906.0 -0.183311 0.026283
1 907.0 -0.059661 -0.015746
2 908.0 -0.120507 -0.086362
3 909.0 -0.136579 0.013083
4 910.0 -0.047105 0.030730
5 911.0 -0.127329 -0.048265
6 912.0 -0.335744 -0.021051
7 913.0 -0.028031 0.054133
8 914.0 -0.124021 -0.012643
9 915.0 -0.035447 -0.022003
10 916.0 -0.072309 0.085680
11 917.0 -0.166175 0.018368
12 918.0 0.036984 0.044621
13 919.0 -0.073433 -0.071922
14 920.0 -0.171200 0.034727
15 921.0 -0.151689 0.026493
16 922.0 -0.102259 -0.043820
17 923.0 -0.280307 -0.024122
18 924.0 -0.038343 -0.026769
19 925.0 -0.082469 -0.066442
20 926.0 -0.149004 0.030366
21 927.0 0.016337 0.088838
22 928.0 -0.209951 0.054318
23 929.0 0.023494 0.063979
24 930.0 -0.145000 -0.050474
25 931.0 0.032550 0.038997
26 932.0 -0.356693 0.061398
27 933.0 -0.157885 -0.074434
28 934.0 -0.145108 -0.069145
29 935.0 -0.158932 0.058535
... ... ... ...
877 1783.0 -0.337499 -0.028208
878 1784.0 -0.378984 0.013550
879 1785.0 -0.127223 0.065666
880 1786.0 -0.177241 -0.046829
881 1787.0 -0.194152 0.023619
882 1788.0 -0.206989 0.061783
883 1789.0 -0.146364 0.066139
884 1790.0 -0.301258 -0.041831
885 1791.0 -0.201527 -0.069497
886 1792.0 -0.065996 -0.063092
887 1793.0 0.060053 0.060526
888 1794.0 -0.254613 -0.024919
889 1795.0 -0.080778 -0.033617
890 1796.0 0.055703 0.061091
891 1797.0 -0.122034 0.054330
892 1798.0 -0.110273 -0.087801
893 1799.0 -0.004993 0.034732
894 1800.0 -0.070474 -0.053972
895 1801.0 -0.176687 -0.049853
896 1802.0 -0.090477 -0.053515
897 1803.0 -0.060231 -0.052480
898 1804.0 -0.093063 0.060022
899 1805.0 -0.159031 0.066305
900 1806.0 -0.114054 0.021440
901 1807.0 0.030247 0.038482
902 1808.0 -0.059556 0.031197
903 1809.0 -0.154562 -0.049881
904 1810.0 -0.322419 -0.025527
905 1811.0 -0.240517 0.009408
906 1812.0 -0.019743 0.027571

907 rows × 3 columns


In [11]:
for s in subs2.keys():
    if s == 'kernel-tks-results':
        continue
        
    subs_merge2 = pd.merge(subs_merge2, subs2[s], on='day', suffixes=['', '_'+rename[s]])

In [12]:
c = list(subs_merge2.columns)
c[1] = 'r_day_tks'
c[2] = 'r_all_tks'
subs_merge2.columns = c

In [13]:
subs_merge2


Out[13]:
day r_day_tks r_all_tks r_day_232017 r_all_232017 r_day_sdfoley r_all_sdfoley r_day_227738 r_all_227738 r_day_212725 r_all_212725
0 906.0 -0.183311 0.026283 -0.203395 -0.082573 -0.200028 -0.074194 -0.202150 -0.079561 -0.189934 -0.041222
1 907.0 -0.059661 -0.015746 -0.102192 -0.084315 -0.072236 -0.043602 -0.048544 0.030838 -0.030153 0.048924
2 908.0 -0.120507 -0.086362 -0.112052 -0.074202 -0.070354 0.045246 -0.136509 -0.107436 -0.130978 -0.100366
3 909.0 -0.136579 0.013083 -0.115924 0.072733 -0.122671 0.060910 -0.119729 0.066408 -0.115499 0.073394
4 910.0 -0.047105 0.030730 -0.064683 -0.031850 -0.072465 -0.045590 -0.041521 0.037916 -0.040868 0.038618
5 911.0 -0.127329 -0.048265 -0.143146 -0.080927 -0.132084 -0.059549 -0.123194 -0.036163 -0.120597 -0.026137
6 912.0 -0.335744 -0.021051 -0.349328 -0.093862 -0.349178 -0.093361 -0.335796 -0.021784 -0.337298 -0.037197
7 913.0 -0.028031 0.054133 -0.081523 -0.053927 -0.045704 0.040399 -0.049566 0.035574 -0.055592 0.025183
8 914.0 -0.124021 -0.012643 -0.133517 -0.050681 -0.134558 -0.053326 -0.139938 -0.065562 -0.140375 -0.066474
9 915.0 -0.035447 -0.022003 -0.037944 -0.025830 -0.017317 0.021719 0.050304 0.057445 -0.017170 0.021836
10 916.0 -0.072309 0.085680 -0.070515 0.087144 -0.097300 0.056173 -0.052404 0.098957 -0.061290 0.093781
11 917.0 -0.166175 0.018368 -0.162752 0.037850 -0.154666 0.062686 -0.160560 0.046067 -0.161080 0.044267
12 918.0 0.036984 0.044621 -0.023234 0.009199 -0.037491 -0.027939 -0.057312 -0.051560 -0.062803 -0.057599
13 919.0 -0.073433 -0.071922 -0.121979 -0.121066 -0.028204 -0.024014 -0.061539 -0.059729 -0.059769 -0.057905
14 920.0 -0.171200 0.034727 -0.153456 0.082437 -0.162084 0.064452 -0.169703 0.041247 -0.168440 0.045998
15 921.0 -0.151689 0.026493 -0.156509 -0.027371 -0.166215 -0.061716 -0.137240 0.069135 -0.139132 0.065338
16 922.0 -0.102259 -0.043820 -0.120970 -0.077857 -0.105482 -0.050833 -0.109676 -0.058981 -0.110850 -0.061120
17 923.0 -0.280307 -0.024122 -0.273877 0.052189 -0.267495 0.076998 -0.278871 0.012760 -0.277979 0.024977
18 924.0 -0.038343 -0.026769 -0.071321 -0.065805 -0.015821 0.022414 -0.032726 -0.017824 -0.031143 -0.014719
19 925.0 -0.082469 -0.066442 -0.096250 -0.082895 -0.059501 -0.034081 -0.077077 -0.059635 -0.075139 -0.057113
20 926.0 -0.149004 0.030366 -0.155850 -0.033431 -0.141414 0.055466 -0.170466 -0.076021 -0.168470 -0.071540
21 927.0 0.016337 0.088838 0.063328 0.107737 -0.093437 -0.032194 -0.060889 0.062834 -0.065798 0.057714
22 928.0 -0.209951 0.054318 -0.216247 0.019698 -0.245388 -0.111618 -0.208107 0.060718 -0.208585 0.059132
23 929.0 0.023494 0.063979 0.042321 0.072993 -0.044643 0.039464 -0.078175 -0.050462 -0.080458 -0.053920
24 930.0 -0.145000 -0.050474 -0.149416 -0.061839 -0.124055 0.054644 -0.163337 -0.089995 -0.158972 -0.081961
25 931.0 0.032550 0.038997 -0.040708 -0.034563 -0.033782 -0.026055 0.019718 0.029162 0.018122 0.028108
26 932.0 -0.356693 0.061398 -0.367884 -0.058281 -0.392811 -0.141968 -0.352607 0.079571 -0.354367 0.072329
27 933.0 -0.157885 -0.074434 -0.173539 -0.103104 -0.159476 -0.077691 -0.170007 -0.097159 -0.168453 -0.094466
28 934.0 -0.145108 -0.069145 -0.143012 -0.064704 -0.116388 0.051084 -0.158164 -0.093152 -0.167917 -0.108659
29 935.0 -0.158932 0.058535 -0.151957 0.074391 -0.159288 0.057586 -0.169227 0.011942 -0.169948 -0.009756
... ... ... ... ... ... ... ... ... ... ... ...
877 1783.0 -0.337499 -0.028208 -0.350303 -0.093317 -0.357785 -0.116050 -0.332294 0.048341 -0.334179 0.034754
878 1784.0 -0.378984 0.013550 -0.387587 -0.074712 -0.396362 -0.107687 -0.382695 -0.047831 -0.378754 0.018326
879 1785.0 -0.127223 0.065666 -0.135009 0.048078 -0.171235 -0.092513 -0.154957 -0.057932 -0.153614 -0.054315
880 1786.0 -0.177241 -0.046829 -0.185808 -0.072216 -0.186853 -0.074790 -0.160633 0.057091 -0.164117 0.046476
881 1787.0 -0.194152 0.023619 -0.180081 0.075030 -0.179138 0.077173 -0.193078 0.030959 -0.194677 0.019008
882 1788.0 -0.206989 0.061783 -0.173432 0.126538 -0.192855 0.095998 -0.204858 0.068230 -0.206500 0.063328
883 1789.0 -0.146364 0.066139 -0.152078 0.052076 -0.177941 -0.074884 -0.184269 -0.088556 -0.182603 -0.085124
884 1790.0 -0.301258 -0.041831 -0.318897 -0.108608 -0.319174 -0.109354 -0.316638 -0.102360 -0.316548 -0.102103
885 1791.0 -0.201527 -0.069497 -0.225846 -0.121926 -0.207792 -0.085475 -0.210203 -0.090991 -0.207774 -0.085434
886 1792.0 -0.065996 -0.063092 -0.063546 -0.060526 -0.039041 -0.033919 -0.051736 -0.047984 -0.049025 -0.045049
887 1793.0 0.060053 0.060526 0.046042 0.046658 -0.050821 -0.050253 0.042840 0.043502 0.043657 0.044307
888 1794.0 -0.254613 -0.024919 -0.272046 -0.096171 -0.272772 -0.098084 -0.266102 -0.079013 -0.266743 -0.081017
889 1795.0 -0.080778 -0.033617 -0.034967 0.064373 -0.051271 0.052396 -0.033289 0.065251 -0.042013 0.060036
890 1796.0 0.055703 0.061091 0.076413 0.080415 -0.071577 -0.066999 0.052944 0.058587 0.051744 0.057506
891 1797.0 -0.122034 0.054330 -0.091450 0.096779 -0.117325 0.063710 -0.111525 0.073233 -0.118145 0.062203
892 1798.0 -0.110273 -0.087801 -0.146972 -0.130798 -0.082595 -0.048931 -0.092243 -0.063826 -0.093251 -0.065268
893 1799.0 -0.004993 0.034732 -0.039867 -0.018873 -0.025826 0.023772 -0.071435 -0.062172 -0.071208 -0.061912
894 1800.0 -0.070474 -0.053972 -0.116119 -0.106830 -0.054222 -0.029843 -0.078201 -0.063713 -0.077864 -0.063301
895 1801.0 -0.176687 -0.049853 -0.192462 -0.090257 -0.178793 -0.056686 -0.174356 -0.041108 -0.169940 -0.014557
896 1802.0 -0.090477 -0.053515 -0.122670 -0.098435 -0.081634 -0.036741 -0.096817 -0.063600 -0.094295 -0.059713
897 1803.0 -0.060231 -0.052480 -0.080148 -0.074482 -0.039972 -0.026943 -0.054303 -0.045561 -0.053461 -0.044555
898 1804.0 -0.093063 0.060022 -0.092356 0.061092 -0.099688 0.048388 -0.060300 0.092554 -0.057257 0.094444
899 1805.0 -0.159031 0.066305 -0.141556 0.097453 -0.162534 0.057468 -0.150713 0.083057 -0.148790 0.086358
900 1806.0 -0.114054 0.021440 -0.133303 -0.065103 -0.127804 -0.053120 -0.148704 -0.092324 -0.148460 -0.091937
901 1807.0 0.030247 0.038482 -0.017454 0.016186 -0.013213 0.019798 -0.067629 -0.063282 -0.066349 -0.061913
902 1808.0 -0.059556 0.031197 -0.095457 -0.067579 -0.079960 -0.043135 -0.045565 0.049370 -0.051737 0.042890
903 1809.0 -0.154562 -0.049881 -0.192382 -0.123836 -0.156803 -0.056314 -0.128301 0.069173 -0.128818 0.068225
904 1810.0 -0.322419 -0.025527 -0.333986 -0.086797 -0.329918 -0.071315 -0.317833 0.044830 -0.317705 0.045648
905 1811.0 -0.240517 0.009408 -0.264850 -0.107402 -0.250902 -0.068814 -0.241116 -0.013569 -0.241225 -0.015304
906 1812.0 -0.019743 0.027571 -0.017762 0.028885 -0.055798 -0.044276 0.004168 0.034159 0.014532 0.036884

907 rows × 11 columns


In [97]:
#subs_merge2c = subs_merge2[np.logical_or(subs_merge2.day <= 905 + 500, subs_merge2.day >= 905 + 600)]
subs_merge2c = subs_merge2[np.logical_and(subs_merge2.day >= 905 + 0, subs_merge2.day <= 905 + 600)]

In [98]:
for c in sorted(subs_merge2.columns[1:]):
    if 'r_day' in c:
        print(c, subs_merge2[c].mean(), subs_merge2c[c].mean())


r_day_212725 -0.129474654647052 -0.1196929946425665
r_day_227738 -0.12900475630503958 -0.11912625702326075
r_day_232017 -0.13046021980854716 -0.12154273805195004
r_day_sdfoley -0.12922836648276378 -0.12139867700628319
r_day_tks -0.1275840320143353 -0.11777282568035356

In [99]:
for c in sorted(subs_merge2.columns[1:]):
    if 'r_all' in c:
        print(c, subs_merge2[c].mean(), subs_merge2c[c].mean())


r_all_212725 0.004040313185385112 0.002791846401086383
r_all_227738 0.00393841143448099 0.0026260301125982867
r_all_232017 -0.0020613085262296688 -0.00362277182804741
r_all_sdfoley 0.0013319721446613084 0.002891123687007352
r_all_tks 0.004212891911545318 0.0023873754028240174

In [20]:
daily_denom = []

for g in train.groupby('timestamp'):
    denom = np.sum((g[1].y - g[1].y.mean()) ** 2.0)
    daily_denom.append(denom)

In [29]:
daily_denom = np.array(daily_denom)
daily_denom_valid = np.array(daily_denom[906:])

In [36]:
share = daily_denom_valid #* np.mean(daily_denom_valid)

In [50]:
for c in sorted(subs_merge2.columns[1:]):
    if 'r_all' in c:
        print(c, subs_merge2[c].mean(), (subs_merge2[c] * (share / np.sum(share))).sum())


r_all_212725 0.004040313185385112 0.006928498836021403
r_all_227738 0.00393841143448099 0.006622734570581542
r_all_232017 -0.0020613085262296688 3.5125009634177076e-05
r_all_sdfoley 0.0013319721446613084 0.003218099996858741
r_all_tks 0.004212891911545318 0.007853709527536907

In [101]:
for s in subs.keys():
    scut = subs[s][subs[s].timestamp < 1505]
    print(s, rscore(subs[s].y, subs[s].y_hat), rscore(scut.y, scut.y_hat))


sub-232017 0.0133364035817 -0.0172549409314
kernel-sdfoley-results 0.0268395574016 0.0206635203453
sub-227738 0.0252797888218 0.00554850601996
sub-212725 0.0245111476451 0.00617512093277
kernel-tks-results 0.0280086079517 0.0127274853258

In [51]:
# take two!

In [88]:
k = 'kernel-tks-results'

s = subs[k]

fullavg = s.y.mean()

#df_nd = pd.DataFrame(columns=['num', 'denom_full', 'denom_local'])

results = []

for g in s.groupby('timestamp'):
    num = np.sum((g[1].y_hat - g[1].y) ** 2.0)
    denom_full = np.sum((g[1].y - fullavg) ** 2.0)
    denom_local = np.sum((g[1].y - g[1].y.mean()) ** 2.0)
    
    results.append((num, denom_full, denom_local))

In [89]:
resultsa = np.array(results)
resultsc = np.array(results)[906:]

In [96]:
num = np.sum(resultsc[:,0]) ** 1
denom = np.sum(resultsc[:,1]) ** 1

ra = 1 - (num / denom)
r = np.sign(ra) * np.sqrt(np.abs(1 - (num / denom)))

r


Out[96]:
0.027570675633338045

In [91]:
rscore(subs[k].y, subs[k].y_hat)


Out[91]:
0.028008607951661255

In [ ]: