In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

from collections import Counter

import joblib

In [2]:
# Directory holding the raw Home Credit Default Risk CSV files.
DATA_DIR = '../../hcdr_data/'
# Cached stratified-CV fold indices shared across experiments.
FOLDS_PATH = './folds.pkl'

In [3]:
# Load only the main application tables; the auxiliary tables are not used
# by this baseline and are kept commented out to save memory and load time.
application_train = pd.read_csv(os.path.join(DATA_DIR, 'application_train.csv'))
application_test = pd.read_csv(os.path.join(DATA_DIR, 'application_test.csv'))

# bureau = pd.read_csv('bureau.csv')
# bureau_balance = pd.read_csv(os.path.join(DATA_DIR, 'bureau_balance.csv'))
# pos_cash_balance = pd.read_csv(os.path.join(DATA_DIR, 'POS_CASH_balance.csv'))
# credit_card_balance = pd.read_csv(os.path.join(DATA_DIR, 'credit_card_balance.csv'))
# previous_application = pd.read_csv(os.path.join(DATA_DIR, 'previous_application.csv'))
# installments_payments = pd.read_csv(os.path.join(DATA_DIR, 'installments_payments.csv'))

In [4]:
# Build (or reuse) 10-fold stratified CV indices so every experiment in this
# project is evaluated on identical splits.
#
# Bug fix: the original guard was `if FOLDS_PATH is None:`, which can never be
# true because FOLDS_PATH is assigned './folds.pkl' in the config cell above.
# The generation branch was therefore dead code, and `joblib.load` would fail
# on a fresh checkout where folds.pkl does not exist yet.  Regenerate the
# folds whenever the cache file is missing instead.
if FOLDS_PATH is None:
    FOLDS_PATH = './folds.pkl'

if not os.path.exists(FOLDS_PATH):
    # Stratify on TARGET so the (imbalanced) class ratio is preserved in
    # every fold; fixed random_state keeps the splits reproducible.
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    folds_idx = [(train_idx, val_idx)
                 for train_idx, val_idx in folds.split(application_train, y=application_train['TARGET'])]

    # Cache the (train_idx, val_idx) pairs so later notebooks reuse them.
    joblib.dump(folds_idx, FOLDS_PATH)

folds_idx = joblib.load(FOLDS_PATH)

In [5]:
# Work directly on the raw application tables (no feature engineering yet).
train_df = application_train
test_df = application_test

# Baseline feature set: every numeric column except the id and the label.
# Fix: the original used `train_df.columns[2:]` to skip SK_ID_CURR and
# TARGET, which silently breaks if the column order ever changes — exclude
# them by name instead.  `is_numeric_dtype` also covers all integer/float
# widths, unlike equality against Python's `int`/`float`.
EXCLUDED_COLS = ('SK_ID_CURR', 'TARGET')
features = [col
            for col in train_df.columns
            if col not in EXCLUDED_COLS
            and pd.api.types.is_numeric_dtype(train_df[col])]

In [6]:
def get_model():
    """Return a fresh baseline LightGBM classifier (default hyperparameters).

    Built once per fold so no fitted state leaks between folds.  Uses the
    documented sklearn-wrapper parameter ``n_jobs`` (4 threads) instead of
    the legacy ``nthread`` alias; behavior is identical.
    """
    return lgb.LGBMClassifier(n_jobs=4)

In [7]:
# Create arrays and dataframes to store results
# oof_preds  : out-of-fold prediction for every training row (overall CV AUC).
# test_preds : raw per-fold test predictions, kept for later blending.
# final_preds: rank-averaged test prediction accumulated over the folds.
oof_preds = np.zeros(train_df.shape[0])
test_preds = []
final_preds = np.zeros(test_df.shape[0])
auc_scores = []
    
for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
    train_x, train_y = train_df[features].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    valid_x, valid_y = train_df[features].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
    
    # Fresh estimator per fold so no fitted state leaks between folds.
    clf = get_model()
    
    # Early stopping monitors the validation fold; train set is in eval_set
    # only so its AUC is logged alongside for an overfitting check.
    clf.fit(train_x, train_y, 
            eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=100)
    
    # Score the held-out fold and the test set at the best iteration.
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    y_pred = clf.predict_proba(test_df[features], num_iteration=clf.best_iteration_)[:, 1]
    # Accumulate ranks rather than probabilities: rank averaging is robust
    # to per-fold differences in probability calibration (AUC is rank-based).
    final_preds += pd.Series(y_pred).rank().values
    test_preds.append(y_pred)
    auc_scores.append(roc_auc_score(valid_y, oof_preds[valid_idx]))
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_scores[-1]))
    
# Mean/std of per-fold AUCs plus the single AUC over all OOF predictions.
print("\n", np.mean(auc_scores), np.std(auc_scores), roc_auc_score(train_df['TARGET'], oof_preds))

# Scale summed ranks into (0, 1]; ordering (hence AUC) is unaffected.
final_preds /= final_preds.max()


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.792876	valid_1's auc: 0.753894
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.792876	valid_1's auc: 0.753894
Fold  1 AUC : 0.753894
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.792793	valid_1's auc: 0.746623
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.792793	valid_1's auc: 0.746623
Fold  2 AUC : 0.746623
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.791736	valid_1's auc: 0.751605
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.791736	valid_1's auc: 0.751605
Fold  3 AUC : 0.751605
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.792275	valid_1's auc: 0.752933
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.792275	valid_1's auc: 0.752933
Fold  4 AUC : 0.752933
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.792586	valid_1's auc: 0.752032
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.792586	valid_1's auc: 0.752032
Fold  5 AUC : 0.752032
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.791195	valid_1's auc: 0.757011
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.791195	valid_1's auc: 0.757011
Fold  6 AUC : 0.757011
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.791961	valid_1's auc: 0.756533
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.791961	valid_1's auc: 0.756533
Fold  7 AUC : 0.756533
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.792759	valid_1's auc: 0.745905
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.792759	valid_1's auc: 0.745905
Fold  8 AUC : 0.745905
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.79192	valid_1's auc: 0.757058
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.79192	valid_1's auc: 0.757058
Fold  9 AUC : 0.757058
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.793092	valid_1's auc: 0.745861
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.793092	valid_1's auc: 0.745861
Fold 10 AUC : 0.745861

 0.7519456456827782 0.004237515231585296 0.751920904144396

In [10]:
# Write the Kaggle submission and persist the OOF/test predictions so this
# model can be combined with others in a later blending notebook.
submission = pd.DataFrame({
    'SK_ID_CURR': test_df['SK_ID_CURR'],
    'TARGET': final_preds,
})
submission.to_csv('./tenich_lgb_cv07519_std_0004.csv', index=None)

for_blending = {'train': oof_preds, 'test': test_preds}
joblib.dump(for_blending, './tenich_lgb_cv07519_std_0004.pkl')


Out[10]:
['./tenich_lgb_cv07519_std_0004.pkl']

In [11]:
!ls


folds.pkl	  submit.csv	    tenich_lgb_cv07519_std_0004.csv
starterkit.ipynb  tenich_lgb_07519  tenich_lgb_cv07519_std_0004.pkl