In [ ]:
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb
import math
import time
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from operator import itemgetter
localrun = True
usepublic = True
vmode = False
all_features = False  # use with vmode for feature selection; run twice, cutting num_boost_round down to the peak round
# This is taken from Frans Slothouber's post on the contest discussion forum.
# https://www.kaggle.com/slothouber/two-sigma-financial-modeling/kagglegym-emulation
def r_score(y_true, y_pred, sample_weight=None, multioutput=None):
    r2 = r2_score(y_true, y_pred, sample_weight=sample_weight,
                  multioutput=multioutput)
    r = np.sign(r2) * np.sqrt(np.abs(r2))
    if r <= -1:
        return -1
    else:
        return r
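# Quick sanity check of the metric (illustrative addition, not in the original
# kernel): a perfect fit scores 1.0, and the score is floored at -1 no matter
# how bad the fit gets, since sign(R^2) * sqrt(|R^2|) is unbounded below.
assert r_score(np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0])) == 1.0
assert r_score(np.array([1.0, 2.0, 3.0]), np.array([100.0, -100.0, 50.0])) == -1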
# From the xgboost script (along with the param settings)
# Function XGBOOST ########################################################
def xgb_obj_custom_r(y_pred, dtrain):
    y_true = dtrain.get_label()
    y_mean = np.mean(y_true)
    y_median = np.median(y_true)
    c1 = y_true
    #c1 = y_true - y_mean
    #c1 = y_true - y_median
    grad = 2 * (y_pred - y_true) / (c1 ** 2)
    hess = 2 / (c1 ** 2)
    return grad, hess
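# Note: xgb_obj_custom_r above is defined but never passed to xgb.train in
# this kernel (the runs below use the built-in 'reg:linear' objective). It is
# the gradient/hessian pair of a relative squared error,
# sum((y_pred - y_true)**2 / c1**2), which blows up for labels near zero;
# the commented-out lines are alternative choices for the denominator c1.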
def xgb_eval_custom_r(y_pred, dtrain):
    #y_pred = np.clip(y_pred, -0.075, .075)
    # y_pred[y_pred > .075] = .075
    # y_pred[y_pred < -.075] = -.075
    y_true = dtrain.get_label()
    ybar = np.sum(y_true) / len(y_true)
    ssres = np.sum((y_true - y_pred) ** 2)
    sstot = np.sum((y_true - ybar) ** 2)
    r2 = 1 - ssres / sstot
    error = np.sign(r2) * np.absolute(r2) ** 0.5
    return 'error', error
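# xgb_eval_custom_r reproduces the competition metric from r_score above,
# minus the floor at -1. It is plugged into xgb.train below as feval, with
# maximize=True so early stopping tracks the largest (not smallest) score.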
env = kagglegym.make()
o = env.reset()
#excl = [env.ID_COL_NAME, env.SAMPLE_COL_NAME, env.TARGET_COL_NAME, env.TIME_COL_NAME]
excl = ['id', 'sample', 'y', 'timestamp']
basecols = [c for c in o.train.columns if c not in excl]
#rcol_orig = ['Dtechnical_20', 'y_prev_pred_avg_diff', 'Dtechnical_21', 'technical_43_prev', 'technical_20', 'y_prev_pred_avgT0', 'y_prev_pred_mavg5', 'y_prev_pred_avgT1', 'fundamental_8_prev', 'Dtechnical_40', 'technical_7_prev', 'technical_7', 'fundamental_5', 'Dtechnical_30', 'technical_32_prev', 'technical_14_prev', 'fundamental_1', 'fundamental_43_prev', 'Dfundamental_22', 'Dtechnical_35', 'Dtechnical_6', 'Dtechnical_17', 'Dtechnical_27', 'Dfundamental_42', 'fundamental_1_prev', 'Dtechnical_0', 'technical_40', 'technical_40_prev', 'fundamental_36', 'Dfundamental_33', 'Dfundamental_48', 'technical_27_prev', 'fundamental_62_prev', 'fundamental_41_prev', 'Dfundamental_50', 'fundamental_48', 'derived_2_prev', 'Dtechnical_18', 'fundamental_35', 'Dfundamental_49', 'fundamental_26_prev', 'technical_28_prev', 'Dfundamental_63', 'fundamental_10_prev', 'fundamental_36_prev', 'fundamental_16', 'Dfundamental_8', 'fundamental_32', 'fundamental_40_prev', 'derived_0', 'Dfundamental_32', 'fundamental_17', 'Dtechnical_7', 'fundamental_25', 'technical_35', 'Dtechnical_19', 'technical_35_prev', 'fundamental_8', 'Dtechnical_32', 'Dfundamental_18', 'Dtechnical_37', 'fundamental_33_prev', 'Dtechnical_28', 'fundamental_46', 'Dfundamental_1', 'Dfundamental_45', 'fundamental_18', 'technical_12', 'technical_44', 'fundamental_22', 'Dtechnical_5', 'technical_17_prev', 'Dfundamental_25']
rcol_orig = ['Dtechnical_20', 'technical_43_prev', 'technical_7', 'y_prev_pred_avg_diff', 'y_prev_pred_avgT0', 'y_prev_pred_mavg5', 'technical_7_prev', 'technical_20', 'technical_40_prev', 'y_prev_pred_avgT1', 'Dtechnical_40', 'Dtechnical_30', 'technical_40', 'fundamental_36_prev', 'fundamental_5', 'fundamental_8_prev', 'technical_35', 'Dtechnical_21', 'fundamental_36', 'fundamental_43_prev', 'fundamental_46', 'fundamental_18', 'Dtechnical_35', 'Dtechnical_0', 'Dfundamental_45', 'fundamental_48', 'fundamental_1_prev', 'Dtechnical_27', 'Dfundamental_50', 'Dfundamental_18', 'fundamental_16', 'Dfundamental_48', 'Dtechnical_6', 'fundamental_40_prev', 'fundamental_26_prev', 'Dfundamental_8', 'Dtechnical_19', 'fundamental_25', 'fundamental_8', 'fundamental_10_prev', 'technical_35_prev', 'technical_14_prev', 'fundamental_1', 'Dtechnical_37', 'Dfundamental_49', 'Dtechnical_18', 'Dfundamental_42', 'fundamental_41_prev', 'fundamental_62_prev', 'technical_12', 'technical_17_prev', 'technical_27_prev', 'Dtechnical_17', 'derived_0', 'fundamental_33_prev', 'fundamental_32', 'fundamental_17', 'Dtechnical_32', 'technical_32_prev', 'Dfundamental_22', 'fundamental_22', 'fundamental_35', 'Dfundamental_32', 'Dtechnical_7', 'Dfundamental_1', 'technical_28_prev', 'Dtechnical_28', 'Dfundamental_25', 'Dfundamental_63', 'Dtechnical_5', 'technical_44', 'Dfundamental_33', 'derived_2_prev']
rcol = rcol_orig.copy()
if all_features:
    rcol = []
    for c in basecols:
        rcol.append(c)
        rcol.append(c + '_prev')
        rcol.append('D' + c)
    rcol += ['y_prev_pred_avg_diff', 'y_prev_pred_avgT0', 'y_prev_pred_mavg5', 'y_prev_pred_avgT1']
    rcol_orig = rcol.copy()
backy_fset = ['technical_13', 'technical_20', 'technical_13_prev', 'technical_20_prev', 'technical_30_prev', 'technical_30']
for f in backy_fset:
    if f not in rcol:
        rcol.append(f)
def get_basecols(rcol):
    # map derived column names back to the raw columns they require:
    # 'Dfoo' and 'foo_prev' both need raw column 'foo'
    duse = {}
    for r in rcol:
        if r.startswith('y'):  # engineered y_prev_pred_* features have no raw column
            continue
        if r.startswith('D'):
            duse[r[1:]] = True
        elif r.endswith('_prev'):
            duse[r[:-5]] = True
        elif r in basecols:
            duse[r] = True
    return list(duse.keys())
basecols_touse = get_basecols(rcol)
if vmode:
    train = pd.read_hdf('../../twosigma/input/train.h5')
else:
    train = o.train.copy()
d_mean = o.train[basecols_touse].median(axis=0)  # despite the name, these are column medians
for c in basecols_touse:
    d_mean[c + '_prev'] = d_mean[c]
    d_mean['D' + c] = 0
median = {t[0]:t[1] for t in zip(d_mean.index, d_mean.values)}
median['y'] = 0
print('processed medians')
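# The DataPrep class below streams the data one timestamp at a time (matching
# how kagglegym feeds the test set) and, for each day:
#   * merges in the previous day's raw features by id to build the *_prev columns,
#   * derives the one-day deltas D<col> = <col> - <col>_prev,
#   * if given a yprev_model, predicts the previous day's y from backy_fset and
#     tracks day-level averages of that prediction (mavg5/10/20/40, avgT0/T1, diffs),
#   * draws a 'random' column from a private numpy random state so the
#     train/valid split is reproducible across runs.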
class DataPrep:
    def __init__(self, yprev_model=None, keepinput=True, cols=rcol, seed=2017):
        self.previnput = None
        self.prevavg = 0
        self.cols = cols.copy()
        self.basecols = get_basecols(self.cols)
        self.keepcols = ['y', 'id', 'timestamp'] + self.basecols
        self.allinput = [] if keepinput else None
        self.dayavg = []
        self.yprev_model = yprev_model
        np.random.seed(seed)
        self.random_state = np.random.get_state()

    def procday(self, day_in):
        if 'y' not in day_in and 'y' in self.keepcols:
            self.keepcols.remove('y')
        day = day_in[self.keepcols].copy()
        notinnew = []
        if self.previnput is not None:
            olen = len(day)
            day = pd.merge(day, self.previnput, on='id', how='left', suffixes=['', '_prev'])
            notinnew = self.previnput[~self.previnput.id.isin(day_in.id)].copy()
            #print(day.iloc[0].timestamp, len(notinnew))
        else:
            for c in self.basecols:
                day[c + '_prev'] = np.full_like(day[c], 0, dtype=np.float32)
                #day[c + '_prev'] = np.zeros_like(day[c], dtype=np.float32)
        for c in self.cols:
            if c == 'y_prev_pred':
                continue
            if c[0] == 'D':
                day[c] = day[c[1:]] - day[c[1:] + '_prev']
        self.previnput = day_in[self.keepcols].copy()
        if len(notinnew) > 0:
            # carry forward ids absent today so their last-seen values are
            # still available as _prev when they reappear
            self.previnput = pd.concat([self.previnput, notinnew[self.keepcols]])
        if self.yprev_model:
            day['y_prev_pred'] = self.yprev_model.predict(
                day[backy_fset].fillna(d_mean).values.reshape(-1, len(backy_fset)))
            avg = day.y_prev_pred.mean()
            self.dayavg.append(avg)
            day['y_prev_pred_mavg5'] = np.ma.average(np.array(self.dayavg[-5:]))#, weights=range(1, len(self.dayavg[-10:]) + 1))
            day['y_prev_pred_min5'] = day.y_prev_pred - day.y_prev_pred_mavg5
            day['y_prev_pred_mavg5d'] = avg - np.ma.average(np.array(self.dayavg[-5:]))
            day['y_prev_pred_mavg10'] = np.ma.average(np.array(self.dayavg[-10:]))#, weights=range(1, len(self.dayavg[-10:]) + 1))
            day['y_prev_pred_mavg20'] = np.ma.average(np.array(self.dayavg[-20:]))
            day['y_prev_pred_mavg40'] = np.ma.average(np.array(self.dayavg[-40:]))
            day['y_prev_pred_avgT1'] = self.prevavg
            day['y_prev_pred_avgT0'] = avg
            day['y_prev_pred_avg_diff'] = avg - self.prevavg
            self.prevavg = avg
        np.random.set_state(self.random_state)
        day['random'] = np.random.random(len(day))
        self.random_state = np.random.get_state()
        if self.allinput is not None:
            self.allinput.append(day.copy())
        return day

    def run(self, df):
        assert self.allinput is not None
        for g in df.groupby('timestamp'):
            self.procday(g[1])
        return pd.concat(self.allinput)
yptrain = DataPrep(keepinput=True, cols=backy_fset).run(train)
#yptrain_prep = tmp.run(yptrain)
yptrain.sort_values(['id', 'timestamp'], inplace=True)
ypmodel = LinearRegression(n_jobs=-1)
low_y_cut = -0.0725
high_y_cut = 0.075
mask = np.logical_and(yptrain.y > low_y_cut, yptrain.y < high_y_cut)
for f in backy_fset:
    mask = np.logical_and(mask, ~yptrain[f].isnull())
yptraina = yptrain[mask]
ypmodel.fit(yptraina[backy_fset].values.reshape(-1,len(backy_fset)), yptraina.y_prev.fillna(0))
print(len(yptraina), ypmodel.coef_, ypmodel.intercept_)
#630210 [ 4.94327753 10.22880098 -4.53049511 -9.34886941 8.94329596
# -9.83007277] -2.68988841901e-05
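# Aside: these six features (technical_13/20/30, current and previous day)
# appear to exploit the structure, widely discussed in this competition,
# that technical_20 and technical_30 encode decayed running averages of past
# returns; that is what lets a plain linear model back out a good estimate of
# the previous day's y (y_prev).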
d_mean['y'] = 0
start = time.time()
train = DataPrep(keepinput = True, yprev_model = ypmodel).run(train)
print('train proc')
endt = time.time()
print(endt - start)
dcol = [c for c in train.columns if c not in excl]
if usepublic:
    data_all = pd.read_hdf('../input/train.h5')
    #public = data_all[data_all.timestamp > 905]
    allpublic = DataPrep(yprev_model=ypmodel, keepinput=True).run(data_all)
    public = DataPrep(yprev_model=ypmodel, keepinput=True).run(data_all[data_all.timestamp > 905])
print(r_score(train.y_prev.fillna(0), train.y_prev_pred))
if usepublic:
    print(r_score(public.y_prev.fillna(0), public.y_prev_pred))
train.sort_values(['id', 'timestamp'], inplace=True)
print('prepping xgb now')
#xtrain, xvalid = train_test_split(train, test_size = 0.2, random_state = 2017)
rthreshold = 1.0
xtmask = train.random <= rthreshold
xtmask = np.logical_and(xtmask, train['y'] > -.0189765)
xtmask = np.logical_and(xtmask, train['y'] < .0189765)
xtrain = train[xtmask]
xvalid = train[~xtmask]
xvalidb = train[train.random >= rthreshold]
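# With rthreshold = 1.0 the random mask keeps every row, so xtrain is just the
# rows with |y| < 0.0189765 and xvalid is its complement (the extreme-y tail);
# xvalidb is empty since np.random.random() is always < 1.0. Training on the
# low-|y| core while watching the tail and public evals presumably keeps the
# trees from chasing outliers.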
cols_to_use = [c for c in rcol if c in xtrain.columns and c in rcol_orig]
# Convert to XGB format
to_drop = ['timestamp', 'y']
train_xgb = xgb.DMatrix(data=xtrain[cols_to_use], label=xtrain['y'])
valid_xgb = xgb.DMatrix(data=xvalid[cols_to_use], label=xvalid['y'])
evallist = [(valid_xgb, 'valid'), (train_xgb, 'train')]
if usepublic:
    public_xgb = xgb.DMatrix(data=public[cols_to_use], label=public['y'])
    evallist = [(train_xgb, 'train'), (valid_xgb, 'xvalid'), (public_xgb, 'public')]
print('xtrain+valid')
params = {
    'objective': 'reg:linear',
    'eta': 0.03,
    'max_depth': 3,
    'subsample': 0.9,
    #'colsample_bytree': 1,
    'min_child_weight': 2**11,
    #'gamma': 100,
    'seed': 10,
    #'base_score': xtrain.y.mean()
}
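# Heavily regularized settings: shallow trees, a slow learning rate, and
# min_child_weight = 2**11 = 2048 (with a squared-error objective the hessian
# is 1 per row, so every leaf must cover at least 2048 rows), presumably to
# cope with the very low signal-to-noise of the target. 'seed' is overwritten
# per model in the loop below.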
model = []
for seed in [31338]:
    params['seed'] = seed
    model.append(xgb.train(params,
                           dtrain=train_xgb,
                           num_boost_round=241,  # 240 was best_ntree_limit from a train/public split run
                           evals=evallist,
                           early_stopping_rounds=50,
                           maximize=True,
                           verbose_eval=10,
                           feval=xgb_eval_custom_r,
                           ))
if not localrun:
    del train_xgb
    del valid_xgb
    if usepublic:
        del public_xgb
print('xgb done, linear now')
lin_features = ['Dtechnical_20', 'technical_20', 'Dtechnical_21']
def prep_linear(df, c=lin_features):
    df_tmp = df.fillna(d_mean)
    m2mat = np.zeros((len(df), len(c)))
    for i in range(len(c)):
        m2mat[:, i] = df_tmp[c[i]].values
    return m2mat
# Observed with histograms:
# https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
low_y_cut = -0.075
high_y_cut = 0.075
traincut = train[np.logical_and(train.y > low_y_cut, train.y < high_y_cut)][['y'] + lin_features].copy().fillna(d_mean)
model2 = LinearRegression(n_jobs=-1)
model2.fit(prep_linear(traincut), traincut.y)
print('linear done')
def update_model(m, params_in, cols_to_use):
    params = params_in.copy()
    params.update({'process_type': 'update',
                   'updater': 'refresh',
                   'refresh_leaf': False})
    m_train = xgb.train(params, train_xgb, m.best_ntree_limit, xgb_model=m)
    m_test = xgb.train(params, public_xgb, m.best_ntree_limit, xgb_model=m)
    imp = pd.DataFrame(index=cols_to_use)
    imp['train'] = pd.Series(m_train.get_score(importance_type='gain'), index=cols_to_use)
    # imp['valid'] = pd.Series(m_valid.get_score(importance_type='gain'), index=cols_to_use)
    imp['test'] = pd.Series(m_test.get_score(importance_type='gain'), index=cols_to_use)
    imp = imp.fillna(0)
    return m_train, m_test, imp
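# update_model relies on xgboost's 'refresh' updater: process_type='update'
# with refresh_leaf=False keeps the tree structure and leaf values fixed and
# only recomputes node statistics (and hence gain importances) against the
# given DMatrix. Scoring the same trees against train and the public slice,
# then keeping features with a positive gain product (see the all_features
# block below), acts as a cheap train/test stability filter for feature
# selection.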
if vmode:
    preds_xgb = model[0].predict(valid_xgb, ntree_limit=model[0].best_ntree_limit)
    preds_linear = model2.predict(prep_linear(xvalid))
    preds = (preds_xgb * 0.7) + (preds_linear * 0.3)
    #preds = preds_xgb
    rs = kagglegym.r_score(xvalid.y, preds)
    ID = 'expv-{0}.pkl'.format(int(rs * 10000000))
    print(rs, ID)
    #ID = 'subv-203172.pkl'  # if actual submission
    output = xvalid[['id', 'timestamp', 'y']].copy()
    output['y_hat'] = preds
    output['y_hat_xgb'] = preds_xgb
    output['y_hat_linear'] = preds_linear
    output.to_pickle(ID)
if all_features:
    m = model[0]
    fs = m.get_fscore()
    fsl = [(f, fs[f]) for f in fs.keys()]
    fsl = sorted(fsl, key=itemgetter(1), reverse=True)
    print(len(fsl))
    #print('rcol =', [f[0] for f in fsl])
    m_train, m_test, imp = update_model(model[0], params, cols_to_use)
    imp['train_test'] = imp.train * imp.test
    imp = imp.sort_values('train_test', ascending=False)
    imp.train = imp.train / imp.train.max()
    imp.test = imp.test / imp.test.max()
    impr = imp[imp.train_test > 0]
    print('rcol_orig =', list(impr.index))
    #len(impr), len(imp)
start = time.time()
dprep = DataPrep(yprev_model = ypmodel, keepinput=localrun)
if localrun:
    env = kagglegym.make()
    o = env.reset()
while True:
    test_preproc = o.features.copy()
    #test_preproc.fillna(d_mean, inplace=True)
    test = dprep.procday(test_preproc)
    #test.fillna(0, inplace=True)
    test_xgb = xgb.DMatrix(data=test.drop(['id', 'timestamp'], axis=1)[cols_to_use])
    xgbpreds = np.zeros(len(test), dtype=np.float64)
    for m in model:
        xgbpreds += m.predict(test_xgb, ntree_limit=m.best_ntree_limit)
    xgbpreds /= len(model)
    test_y = (xgbpreds * 0.7) + (model2.predict(prep_linear(test)).clip(low_y_cut, high_y_cut) * 0.3)
    o.target['y'] = test_y
    target = o.target
    timestamp = o.features["timestamp"].iloc[0]
    if timestamp % 100 == 0:
        print("Timestamp #{0} {1}".format(timestamp, time.time() - start))
        start = time.time()
    # We perform a "step" by making our prediction and getting back an updated "observation":
    o, reward, done, info = env.step(target)
    if done:
        print("Public score: {}".format(info["public_score"]))
        break
In [ ]: