In [1]:
import os
import gc
import time
import pickle

import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

CV Folds


In [21]:
def get_20_cv_splits(data, n_repeats=20, n_splits=5):
    """Build repeated stratified K-fold assignments and save them as a CSV table.

    For every seed in ``range(n_repeats)`` each row of ``data`` is assigned a
    fold id in ``0 .. n_splits - 1``; the ids of one repeat are stored in a
    column named ``split<seed>``. Rows are stratified by the order of magnitude
    of the target, i.e. ``log10(target)`` truncated to an integer, where
    ``target`` is read from ``input/train.csv`` under ``PATH_TO_DATA``.

    Parameters
    ----------
    data : pandas.DataFrame
        Only its number of rows and its index are used here.
        NOTE(review): assumed to be row-aligned with ``input/train.csv`` -
        confirm against the caller.
    n_repeats : int, default 20
        Number of differently seeded K-fold partitions (one CSV column each).
    n_splits : int, default 5
        Number of folds per partition.

    Returns
    -------
    None
        The table is written to
        ``folds/cv_splits_cleandata_stat_bin_red.csv`` under ``PATH_TO_DATA``.
    """
    target = pd.read_csv(
        os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target']
    )['target']
    # Truncated log10 = coarse magnitude bin used as the stratification class.
    stratify_classes = np.log10(target).astype(int)

    splits = {}
    for random_state in range(n_repeats):
        # StratifiedKFold yields disjoint test sets that together cover every
        # row exactly once. StratifiedShuffleSplit draws each test set
        # independently, so test sets overlap and rows that are never drawn
        # would silently remain in fold 0.
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        column = np.zeros(data.shape[0])
        for fold_id, (_, test_index) in enumerate(skf.split(data, stratify_classes)):
            column[test_index] = fold_id

        splits["split{}".format(random_state)] = column

    pd.DataFrame(splits, index=data.index).to_csv(
        os.path.join(PATH_TO_DATA, 'folds/cv_splits_cleandata_stat_bin_red.csv')
    )

In [15]:
# Turn the split table written by get_20_cv_splits into explicit
# (train_idx, val_idx) lists: one list of folds per split column.
def create_folds_from_cv_splits(in_path):
    """Convert a fold-assignment CSV into nested train/validation index lists.

    The CSV at ``PATH_TO_DATA/in_path`` is read with a default integer index.
    Its first column is skipped (it holds the index saved by ``to_csv``);
    every remaining column is treated as one repeat whose values are fold ids.

    Parameters
    ----------
    in_path : str
        Path of the split table, relative to ``PATH_TO_DATA``.

    Returns
    -------
    list[list[tuple[list[int], list[int]]]]
        ``folds_list[r][k]`` is ``(train_idx, val_idx)`` for fold ``k`` of
        repeat ``r``. Both lists are in ascending row order. The same
        structure is also pickled to ``folds/custom_cv.pkl`` under
        ``PATH_TO_DATA``.
    """
    cv_splits = pd.read_csv(os.path.join(PATH_TO_DATA, in_path))
    row_ids = cv_splits.index

    folds_list = []
    for split_name in cv_splits.columns[1:]:
        assignment = cv_splits[split_name]
        repeat_folds = []
        # Sort the fold ids and use boolean masks so that neither the fold
        # order nor the row order depends on set iteration order. NaN is not
        # a fold label, so it is dropped before sorting.
        for fold_id in sorted(assignment.dropna().unique()):
            in_val = (assignment == fold_id).to_numpy()
            val_idx = row_ids[in_val].tolist()
            train_idx = row_ids[~in_val].tolist()
            repeat_folds.append((train_idx, val_idx))
        folds_list.append(repeat_folds)

    with open(os.path.join(PATH_TO_DATA, 'folds/custom_cv.pkl'), 'wb') as f:
        pickle.dump(folds_list, f)
    return folds_list

In [7]:
# Switch: True reuses the pickled fold lists, False rebuilds them from scratch.
LOAD_CV = True

if not LOAD_CV:
    # Regenerate the split table, then derive (and re-pickle) the fold lists.
    get_20_cv_splits(train_df)
    cv_folds = create_folds_from_cv_splits(in_path='folds/cv_splits_cleandata_stat_bin_red.csv')
else:
    # Fast path: load the folds produced by an earlier run.
    with open(os.path.join(PATH_TO_DATA, 'folds/custom_cv.pkl'), 'rb') as f:
        cv_folds = pickle.load(f)

LightGBM


In [8]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train one LightGBM regressor with early stopping and predict.

    Parameters
    ----------
    train_X, train_y
        Training features and labels.
    val_X, val_y
        Validation features and labels, passed as a validation set together
        with the training set.
    test_X
        Features to predict once training has finished.

    Returns
    -------
    pred_test_y
        ``expm1`` of the model output on ``test_X``.
        NOTE(review): this presumes the labels are ``log1p``-transformed -
        confirm against the caller.
    pred_oof_log
        Raw (untransformed) model output on ``val_X``.
    model
        The trained ``lightgbm.Booster``.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 40,
        "max_depth": 8,  # alternative noted by the author: -1
        "learning_rate": 0.005,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.1,  # alternative noted by the author: 0.6
        # The LightGBM parameter is `bagging_freq` (alias `subsample_freq`).
        # The unknown key "bagging_frequency" left bagging switched off, so
        # `bagging_fraction` and `bagging_seed` had no effect.
        "bagging_freq": 6,
        "bagging_seed": 44,
        "verbosity": -1,
        "num_threads": 4,
        "seed": 44,
    }

    start_time = time.time()
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword arguments
    # were removed from `lgb.train` in LightGBM 4.0; switch to the
    # `lgb.early_stopping` / `lgb.log_evaluation` callbacks when upgrading.
    model = lgb.train(params, lgtrain, 5000,
                      valid_sets=[lgtrain, lgval],
                      early_stopping_rounds=100,
                      verbose_eval=150)
    print('Model training done in {} seconds.'.format(time.time() - start_time))

    # Both predictions use the best iteration found by early stopping.
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    pred_oof_log = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, pred_oof_log, model

In [9]:
def run_calculations(X, test, big_cv_folds, func_name=None, target=None):
    """Run repeated K-fold training and collect out-of-fold and test predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    test
        Test features, forwarded unchanged to the model runner.
    big_cv_folds : list
        One entry per repeat; each entry is a list of
        ``(train_index, val_index)`` pairs of positional row indices.
    func_name : str, optional
        Which model runner to use. Only ``'lgb'`` is supported.
    target : array-like, optional
        Training labels aligned with the rows of ``X``. When omitted, the
        notebook-level variable ``y`` is used, as before.

    Returns
    -------
    tuple or None
        ``(y_oof_per_repeat, avg_test_pred_per_repeat, fold_errors_per_repeat)``,
        each a list with one element per repeat. ``None`` is returned, after
        printing a message, when ``func_name`` is missing or unsupported.
    """
    # Validate the runner name before any work is done.
    if not func_name:
        return print('The function to run is not defined')
    if func_name not in ('lgb',):
        return print('The function to run is not correct')

    # Use a plain array so the labels are indexed positionally, consistent
    # with `X.iloc[...]` and with the positional indices stored in the folds.
    labels = np.asarray(y if target is None else target)

    y_oof_20_preds = []
    fold_errors_20_preds = []
    avg_test_pred_20_preds = []

    for ind, cv_folds in enumerate(big_cv_folds):
        print('Fitting big fold', ind+1, 'out of', len(big_cv_folds))
        y_oof = np.zeros(labels.shape[0])
        fold_errors = []
        pred_test_list = []

        for i, (train_index, val_index) in enumerate(cv_folds):
            print('Fitting sub fold', i+1, 'out of', len(cv_folds))
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = labels[train_index], labels[val_index]

            # Extension point: add further model runners here and to the
            # validation tuple above.
            if func_name == 'lgb':
                pred_test_y, pred_oof_log, clf = run_lgb(X_train, y_train, X_val, y_val, test)

            y_oof[val_index] = pred_oof_log
            curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
            print(f'Fold error {curr_fe}')
            fold_errors.append(curr_fe)
            pred_test_list.append(list(pred_test_y))

        # RMSE over all out-of-fold predictions of this repeat.
        print('Total error', np.sqrt(mean_squared_error(labels, y_oof)))
        total_fe_std = round(np.std(fold_errors), 5)
        print(f'Total std {total_fe_std}')
        # Average the test predictions of this repeat's folds element-wise.
        avg_test_pred = np.mean(pred_test_list, axis=0)

        avg_test_pred_20_preds.append(avg_test_pred)
        fold_errors_20_preds.append(fold_errors)
        y_oof_20_preds.append(y_oof)

    return y_oof_20_preds, avg_test_pred_20_preds, fold_errors_20_preds


In [ ]:
# NOTE(review): `pred_test_list_lgb` is not assigned in any visible cell -
# presumably it is the second value returned by run_calculations(...); confirm.
print('Length of test predictions:', len(pred_test_list_lgb))
# Element-wise mean along the first axis of the collected test predictions.
avg_pred_test_list_lgb = np.mean(pred_test_list_lgb, axis=0)
print('Length of avg test predictions:', len(avg_pred_test_list_lgb))

In [ ]:
# ERRORS (export currently disabled)
# errors = pd.DataFrame(fold_errors)
# errors.to_csv(os.path.join(PATH_TO_DATA, 'output/tenich_20_fold_errors_1dconvnn_cv1620_std0037.pkl'), index=False, header=False)

# Persist the 20x out-of-fold train predictions and the 20x test predictions.
# NOTE(review): the file names say "1dconvnn" although the pickled variables
# carry the `_lgb` suffix - confirm the names are intentional.
for out_name, payload in (
    ('output/tenich_20folds_train_1dconvnn_cv1561_std0021.pkl', y_oof_lgb),
    ('output/tenich_20folds_test_1dconvnn_cv1561_std0021.pkl', pred_test_list_lgb),
):
    with open(os.path.join(PATH_TO_DATA, out_name), 'wb') as f:
        pickle.dump(payload, f)

In [ ]:


In [ ]:
import pandas as pd
import pickle

In [ ]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as fin:
        return pickle.load(fin)


# Reload the saved test and out-of-fold train predictions.
test_preds = _read_pickle('../../santander_data/output/tenich_20folds_test_1dconvnn_cv1561_std0021.pkl')
train_preds = _read_pickle('../../santander_data/output/tenich_20folds_train_1dconvnn_cv1561_std0021.pkl')

In [ ]:
# Display the number of stored test prediction arrays and the shape of the first.
len(test_preds), test_preds[0].shape

In [ ]:
# Display the number of stored train prediction arrays and the shape of the first.
len(train_preds), train_preds[0].shape