In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb
import math
import time
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
import kagglegym
%pylab inline
import pylab as pl
pylab.rcParams['figure.figsize'] = (10, 6)
%matplotlib inline
import matplotlib.pyplot as plt
import os
In [2]:
def rscore(y, y_hat, y_mean = None):
    """Signed square root of the R^2 of ``y_hat`` against ``y``.

    Parameters
    ----------
    y : array-like
        Observed values.
    y_hat : array-like
        Predicted values; must support element-wise subtraction with ``y``.
    y_mean : float, optional
        Baseline mean used in the denominator. When ``None`` (the default)
        the mean of ``y`` is used. Any explicit value is honoured,
        including ``0.0``.

    Returns
    -------
    float
        ``sign(R2) * sqrt(abs(R2))`` where
        ``R2 = 1 - sum((y_hat - y)**2) / sum((y - baseline)**2)``.
    """
    # Test against None instead of truthiness: a caller-supplied baseline of
    # exactly 0 is falsy and would otherwise be replaced by np.mean(y).
    ymean = np.mean(y) if y_mean is None else y_mean
    num = np.sum((y_hat - y) ** 2.0)
    denom = np.sum((y - ymean) ** 2.0)
    ra = 1 - (num / denom)
    # Keep the sign of R2 while taking the root of its magnitude.
    return np.sign(ra) * np.sqrt(np.abs(ra))
In [3]:
# Load train.h5 (path is relative to the notebook's working directory).
# Later cells group this frame by its `timestamp` column and read its `y` column.
train = pd.read_hdf('../../twosigma/input/train.h5')
In [4]:
# Read every '*.pkl' file in the current working directory into `subs`,
# keyed by the file name without its '.pkl' extension.
# NOTE(review): unpickling can execute arbitrary code; only run this against
# pickle files you produced yourself.
subs = {}
for fname in os.listdir():
    if fname.endswith('.pkl'):
        name = fname[:-4]
        print(name)
        subs[name] = pd.read_pickle(fname)
In [5]:
# For every submission, score each timestamp twice:
#   r_day - rscore against that day's own mean of y
#   r_all - rscore against the mean of y over the whole submission
# The result is one frame per submission with columns day / r_day / r_all.
subs2 = {}
for s in subs.keys():
    ymean = subs[s].y.mean()
    # Gather one record per day and build the frame once; appending with
    # `scores.loc[len(scores)] = ...` re-allocates the frame on every row.
    records = []
    for day, day_rows in subs[s].groupby('timestamp'):
        r_day = rscore(day_rows.y, day_rows.y_hat)
        r_all = rscore(day_rows.y, day_rows.y_hat, y_mean = ymean)
        records.append((day, r_day, r_all))
    scores = pd.DataFrame(records, columns=['day', 'r_day', 'r_all'])
    subs2[s] = scores
In [6]:
# Empty frame with a single 'day' column.
# NOTE(review): `subs_merge` is not referenced again in the visible cells
# (the merge further down builds `subs_merge2`) - possibly a leftover.
subs_merge = pd.DataFrame(columns=['day'])
In [7]:
# Per submission: the average over days of each of the two daily scores.
for sub_name, sub_scores in subs2.items():
    mean_r_day = sub_scores.r_day.mean()
    mean_r_all = sub_scores.r_all.mean()
    print(sub_name, mean_r_day, mean_r_all)
In [8]:
# Short labels for each submission key; used to build column suffixes
# when the per-submission score frames are merged.
rename = {
    'sub-232017': '232017',
    'kernel-sdfoley-results': 'sdfoley',
    'sub-227738': '227738',
    'kernel-tks-results': 'tks',
    'sub-212725': '212725',
}
In [9]:
# Start the merged table from a copy of the 'kernel-tks-results' scores so
# that the later merges do not touch the frame stored in `subs2`.
subs_merge2 = subs2['kernel-tks-results'].copy()
In [10]:
# Display the frame the merge cell below starts from.
subs_merge2
Out[10]:
In [11]:
# Join every other submission's daily scores onto the tks base frame by 'day'.
# Clashing column names coming from the right-hand frame get a
# '_<short label>' suffix; the left-hand names are kept as they are.
# NOTE(review): this rebinds `subs_merge2`, so re-running the cell without
# re-running the cell that seeds it merges the same frames a second time.
base_key = 'kernel-tks-results'
for sub_name, sub_scores in subs2.items():
    if sub_name != base_key:
        right_suffix = '_' + rename[sub_name]
        subs_merge2 = pd.merge(subs_merge2, sub_scores, on='day', suffixes=['', right_suffix])
In [12]:
# Relabel the second and third columns (by position) with a '_tks' suffix,
# leaving every other column name unchanged.
cols = list(subs_merge2.columns)
cols[1], cols[2] = 'r_day_tks', 'r_all_tks'
subs_merge2.columns = cols
In [13]:
# Display the merged per-day score table.
subs_merge2
Out[13]:
In [97]:
# Restrict the merged table to days in the closed window [905, 1505].
# Disabled alternative, kept for reference (keeps days <= 1405 or >= 1505):
#subs_merge2c = subs_merge2[np.logical_or(subs_merge2.day <= 905 + 500, subs_merge2.day >= 905 + 600)]
in_window = (subs_merge2.day >= 905 + 0) & (subs_merge2.day <= 905 + 600)
subs_merge2c = subs_merge2[in_window]
In [98]:
# Mean of every 'r_day' column: full table vs. the day-window subset.
r_day_cols = sorted(col for col in subs_merge2.columns[1:] if 'r_day' in col)
for col in r_day_cols:
    print(col, subs_merge2[col].mean(), subs_merge2c[col].mean())
In [99]:
# Mean of every 'r_all' column: full table vs. the day-window subset.
r_all_cols = sorted(col for col in subs_merge2.columns[1:] if 'r_all' in col)
for col in r_all_cols:
    print(col, subs_merge2[col].mean(), subs_merge2c[col].mean())
In [20]:
# One value per timestamp of `train`: the sum of squared deviations of y
# from that timestamp's own mean.
daily_denom = [
    np.sum((day_rows.y - day_rows.y.mean()) ** 2.0)
    for _, day_rows in train.groupby('timestamp')
]
In [29]:
# Convert the per-day list to an array, then take an independent copy of
# everything from position 906 onwards.
# NOTE(review): 906 is a position in the groupby output, not a timestamp
# value - the two coincide only if timestamps run 0, 1, 2, ... without gaps.
daily_denom = np.array(daily_denom)
daily_denom_valid = daily_denom[906:].copy()
In [36]:
# Per-day weights: the raw per-day denominators (same array object, no copy).
# The trailing multiplication by the mean is commented out and has no effect.
share = daily_denom_valid #* np.mean(daily_denom_valid)
In [50]:
# For every 'r_all' column print the plain mean next to the average weighted
# by each day's share of the summed per-day denominators.
# NOTE(review): the product pairs rows of `subs_merge2` with entries of
# `share` by position - confirm both cover the same days in the same order.
weights = share / np.sum(share)
r_all_cols = sorted(col for col in subs_merge2.columns[1:] if 'r_all' in col)
for col in r_all_cols:
    print(col, subs_merge2[col].mean(), (subs_merge2[col] * weights).sum())
In [101]:
# Overall rscore per submission: all rows vs. only rows with timestamp < 1505.
for sub_name, sub in subs.items():
    scut = sub[sub.timestamp < 1505]
    score_full = rscore(sub.y, sub.y_hat)
    score_cut = rscore(scut.y, scut.y_hat)
    print(sub_name, score_full, score_cut)
In [51]:
# Take two: rebuild the score from per-day numerators and denominators.
In [88]:
# For one submission, collect per timestamp:
#   (squared error, sum of squares about the submission-wide mean of y,
#    sum of squares about that day's own mean of y)
k = 'kernel-tks-results'
s = subs[k]
fullavg = s.y.mean()
#df_nd = pd.DataFrame(columns=['num', 'denom_full', 'denom_local'])
results = []
for _, day_rows in s.groupby('timestamp'):
    actual, predicted = day_rows.y, day_rows.y_hat
    sq_err = np.sum((predicted - actual) ** 2.0)
    ss_full = np.sum((actual - fullavg) ** 2.0)
    ss_local = np.sum((actual - actual.mean()) ** 2.0)
    results.append((sq_err, ss_full, ss_local))
In [89]:
# Array form of the per-day tuples, plus an independent copy of the rows from
# position 906 onwards.
# NOTE(review): 906 is a row position in `results`, not a timestamp; confirm
# that the submission's days start at timestamp 0 so the two line up.
resultsa = np.array(results)
resultsc = resultsa[906:].copy()
In [96]:
# Pool the per-day sums (column 0: squared error, column 1: sum of squares
# about the submission-wide mean) and apply the same signed-root formula
# that rscore uses.
num = resultsc[:, 0].sum()
denom = resultsc[:, 1].sum()
ra = 1 - (num / denom)
r = np.sign(ra) * np.sqrt(np.abs(ra))
r
Out[96]:
In [91]:
# rscore over every row of submission `k`; presumably shown for comparison
# with the pooled per-day value computed in the cell above.
rscore(subs[k].y, subs[k].y_hat)
Out[91]:
In [ ]: