In [1]:
import os
import gc
import time
import pickle
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit
In [21]:
def get_20_cv_splits(data):
    # Build 20 repeated 5-fold assignments, stratified on the order of magnitude
    # of the target (int(log10(target))) read from the raw train file.
    # stratify_classes = y
    train = pd.read_csv(os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target'])
    stratify_classes = train.target.apply(lambda x: int(np.log10(x)))
    splits = {}
    for random_state in range(20):
        # For each repetition, store the fold id (0-4) of every row.
        column = np.zeros(data.shape[0])
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
        for i, (_, test_index) in enumerate(sss.split(data, stratify_classes)):
            column[test_index] = i
        splits["split{}".format(random_state)] = column
    pd.DataFrame(splits, index=data.index).to_csv(os.path.join(PATH_TO_DATA, 'folds/cv_splits_cleandata_stat_bin_red.csv'))
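In [ ]:
# Quick sanity check on the splits written above (an illustrative sketch, not part
# of the original run; cv_splits_check is an assumed name). Each "splitN" column
# holds a fold id 0-4 per row; because StratifiedShuffleSplit draws five independent
# 20% test sets rather than a strict partition, the counts are only approximately equal.
cv_splits_check = pd.read_csv(os.path.join(PATH_TO_DATA, 'folds/cv_splits_cleandata_stat_bin_red.csv'), index_col=0)
print(cv_splits_check['split0'].value_counts(normalize=True).sort_index())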
In [15]:
# Expand the 20 split columns into 20 x 5 = 100 (train_idx, val_idx) pairs
# and cache them as a pickle.
def create_folds_from_cv_splits(in_path):
    cv_splits = pd.read_csv(os.path.join(PATH_TO_DATA, in_path))
    folds_list = []
    for ind, i in enumerate(cv_splits.columns[1:]):  # skip the index column
        folds = list(set(cv_splits[i].values))
        folds_list.append([])
        for m in folds:
            val_idx = list(cv_splits[cv_splits[i] == m].index)
            train_idx = list(set(list(cv_splits.index)) - set(val_idx))
            folds_list[ind].append((train_idx, val_idx))
    with open(os.path.join(PATH_TO_DATA, 'folds/custom_cv.pkl'), 'wb') as f:
        pickle.dump(folds_list, f)
    return folds_list
In [7]:
LOAD_CV = True
if LOAD_CV:
    with open(os.path.join(PATH_TO_DATA, 'folds/custom_cv.pkl'), 'rb') as f:
        cv_folds = pickle.load(f)
else:
    get_20_cv_splits(train_df)
    cv_folds = create_folds_from_cv_splits(in_path='folds/cv_splits_cleandata_stat_bin_red.csv')
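In [ ]:
# Structural check on the loaded folds (illustrative sketch): 20 repetitions,
# each holding 5 (train_idx, val_idx) pairs covering roughly 80% / 20% of the rows.
print(len(cv_folds), len(cv_folds[0]))
train_idx, val_idx = cv_folds[0][0]
print(len(train_idx), len(val_idx))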
In [8]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 40,
        "max_depth": 8,  # -1,
        "learning_rate": 0.005,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.1,  # 0.6,
        "bagging_freq": 6,
        "bagging_seed": 44,
        "verbosity": -1,
        "num_threads": 4,
        "seed": 44
    }
    start_time = time.time()
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 5000,
                      valid_sets=[lgtrain, lgval],
                      early_stopping_rounds=100,
                      verbose_eval=150)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    # Test predictions are mapped back from log space; OOF predictions stay in log space.
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    pred_oof_log = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, pred_oof_log, model
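In [ ]:
# Presumed target setup (a sketch; not shown in this notebook section, variable name
# assumed). run_lgb is trained on log1p(target): test predictions are inverted with
# expm1 above while OOF predictions stay in log space, so the RMSE reported later is
# effectively an RMSLE.
y = np.log1p(pd.read_csv(os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target'])['target'].values)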
In [9]:
def run_calculations(X, test, big_cv_folds, func_name=None):
    # Note: relies on a globally defined target vector `y` (log1p-transformed),
    # matching the log-space OOF predictions returned by run_lgb.
    if not func_name:
        return print('The function to run is not defined')
    else:
        y_oof_20_preds = []
        fold_errors_20_preds = []
        avg_test_pred_20_preds = []
        for ind, cv_folds in enumerate(big_cv_folds):
            print('Fitting big fold', ind + 1, 'out of', len(big_cv_folds))
            y_oof = np.zeros((y.shape[0]))
            fold_errors = []
            pred_test_list = []
            for i, (train_index, val_index) in enumerate(cv_folds):
                print('Fitting sub fold', i + 1, 'out of', len(cv_folds))
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y[train_index], y[val_index]
                # part to include additional functions
                if func_name == 'lgb':
                    pred_test_y, pred_oof_log, clf = run_lgb(X_train, y_train, X_val, y_val, test)
                else:
                    return print('The function to run is not correct')
                y_oof[val_index] = pred_oof_log
                curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
                print(f'Fold error {curr_fe}')
                fold_errors.append(curr_fe)
                pred_test_list.append(list(pred_test_y))
            print('Total error', np.sqrt(mean_squared_error(y, y_oof)))
            total_fe_std = round(np.std(fold_errors), 5)
            print(f'Total std {total_fe_std}')
            avg_test_pred = np.mean(pred_test_list, axis=0)
            avg_test_pred_20_preds.append(avg_test_pred)
            fold_errors_20_preds.append(fold_errors)
            y_oof_20_preds.append(y_oof)
        return y_oof_20_preds, avg_test_pred_20_preds, fold_errors_20_preds
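In [ ]:
# Presumed invocation (a hedged reconstruction: the cells below reference
# y_oof_lgb and pred_test_list_lgb, so the run most likely looked like this;
# X and test are the prepared feature frames).
y_oof_lgb, pred_test_list_lgb, fold_errors_lgb = run_calculations(X, test, cv_folds, func_name='lgb')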
In [ ]:
print('Length of test predictions:', len(pred_test_list_lgb))
avg_pred_test_list_lgb = np.mean(pred_test_list_lgb, axis=0)
print('Length of avg test predictions:', len(avg_pred_test_list_lgb))
In [ ]:
# ERRORS
# errors = pd.DataFrame(fold_errors)
# errors.to_csv(os.path.join(PATH_TO_DATA, 'output/tenich_20_fold_errors_1dconvnn_cv1620_std0037.pkl'), index=False, header=False)
# 20x oof train preds
with open(os.path.join(PATH_TO_DATA, 'output/tenich_20folds_train_1dconvnn_cv1561_std0021.pkl'), 'wb') as f:
    pickle.dump(y_oof_lgb, f)
# 20x test preds
with open(os.path.join(PATH_TO_DATA, 'output/tenich_20folds_test_1dconvnn_cv1561_std0021.pkl'), 'wb') as f:
    pickle.dump(pred_test_list_lgb, f)
In [ ]:
import pandas as pd
import pickle
In [ ]:
with open('../../santander_data/output/tenich_20folds_test_1dconvnn_cv1561_std0021.pkl', 'rb') as fin:
    test_preds = pickle.load(fin)
with open('../../santander_data/output/tenich_20folds_train_1dconvnn_cv1561_std0021.pkl', 'rb') as fin:
    train_preds = pickle.load(fin)
In [ ]:
len(test_preds), test_preds[0].shape
In [ ]:
len(train_preds), train_preds[0].shape
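In [ ]:
# A typical next step (sketch; the plain averaging scheme and variable names are
# assumptions): collapse the 20 repeated runs into a single OOF train vector and a
# single test vector, e.g. for stacking or for building a submission.
import numpy as np
final_train_oof = np.mean(train_preds, axis=0)
final_test_pred = np.mean(test_preds, axis=0)
print(final_train_oof.shape, final_test_pred.shape)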