In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import time
import lightgbm as lgb
import warnings
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_score, recall_score
%matplotlib inline

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import joblib

FOLDS_PATH = './folds/folds.pkl'
folds_idx = joblib.load(FOLDS_PATH)
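
The folds file is assumed to hold a precomputed list of (train_idx, valid_idx) pairs shared across experiments so every model sees identical splits. A minimal sketch of how such a file could be produced (10 splits match the fold count below; shuffle and random_state are assumptions):

# One-time generation of the shared folds file (sketch, not the original code)
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedKFold

y = pd.read_csv('./input/application_train.csv')['TARGET']
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  # assumed settings
folds_idx = list(skf.split(np.zeros(len(y)), y))
joblib.dump(folds_idx, './folds/folds.pkl')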

In [3]:
train_fet = pd.read_csv('./neptune/train_features.csv.gz')
test = pd.read_csv('./neptune/test_features.csv.gz')

In [4]:
def reduce_mem_usage(data, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their range."""
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))
    
    for col in data.columns:
        col_type = data[col].dtype
        
        if col_type != object:
            c_min = data[col].min()
            c_max = data[col].max()
            # All-NaN columns yield NaN min/max; comparing NaN against dtype
            # limits raises "invalid value encountered in less" warnings, so
            # skip them.
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                else:
                    data[col] = data[col].astype(np.int64)
            else:
                # Note: float16 keeps only ~3 significant decimal digits, so
                # this downcast trades precision for memory.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return data
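
One caveat on the float branch: any column whose range fits is downcast to float16, which keeps only about three significant decimal digits. For tree-based models this is usually tolerable, but it is a real precision loss; a quick illustration:

import numpy as np

x = np.float64(0.123456789)
print(np.float16(x))  # ~0.1235: the stored value is the nearest float16, not the original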

In [5]:
train_fet = reduce_mem_usage(train_fet)
test = reduce_mem_usage(test)


Memory usage of dataframe: 2754.35 MB
Memory usage after optimization: 885.66 MB
Decreased by 67.8%
Memory usage of dataframe: 436.60 MB
Memory usage after optimization: 138.06 MB
Decreased by 68.4%

In [6]:
train_gp = reduce_mem_usage(pd.read_csv('./input/train_df_gp.zip'))
test_gp = reduce_mem_usage(pd.read_csv('./input/test_df_gp.zip'))


Memory usage of dataframe: 2043.45 MB
Memory usage after optimization: 909.70 MB
Decreased by 55.5%
Memory usage of dataframe: 323.91 MB
Memory usage after optimization: 144.01 MB
Decreased by 55.5%

In [7]:
application_train = reduce_mem_usage(pd.read_csv('./input/application_train.csv')[['SK_ID_CURR', 'TARGET']])


Memory usage of dataframe: 4.69 MB
Memory usage after optimization: 1.47 MB
Decreased by 68.7%

In [8]:
application_test = reduce_mem_usage(pd.read_csv('./input/application_test.csv')[['SK_ID_CURR']])
test = pd.concat([application_test.SK_ID_CURR, test], axis=1)


Memory usage of dataframe: 0.37 MB
Memory usage after optimization: 0.19 MB
Decreased by 50.0%

In [9]:
train_fet = pd.concat([application_train.SK_ID_CURR, train_fet], axis=1)

In [10]:
train_fet = pd.merge(train_fet, train_gp, how='left', on=['SK_ID_CURR'])
del train_gp
gc.collect()


Out[10]:
28

In [11]:
test = pd.merge(test, test_gp, how='left', on=['SK_ID_CURR'])
del test_gp
gc.collect()


Out[11]:
21

In [12]:
test.shape, train_fet.shape


Out[12]:
((48744, 2045), (307511, 2045))

In [13]:
application_train = reduce_mem_usage(pd.read_csv('./input/application_train.csv')[['TARGET']])

train_fet.drop('TARGET', axis=1, inplace=True)
train_fet = pd.concat([application_train.TARGET, train_fet], axis=1)


train_df = train_fet
test_df = test

features = [col for col in train_df.columns if col != 'TARGET']


Memory usage of dataframe: 2.35 MB
Memory usage after optimization: 0.29 MB
Decreased by 87.5%

In [14]:
del train_fet, test
gc.collect()


Out[14]:
14

In [15]:
del application_train, application_test
gc.collect()


Out[15]:
14

In [46]:
gc.collect()


Out[46]:
729

In [19]:
def get_model():
    # The original dict listed 'reg_alpha' and 'subsample' twice; Python keeps
    # the last occurrence, so only the effective values appear here.
    lgbm_params = {
            'objective': 'binary',
            'nthread': 15,
            'metric': 'auc',
            'learning_rate': .01,
            'num_leaves': 35,
            'colsample_bytree': .2,
            'subsample': 1.0,
            'subsample_freq': 1,
            'max_depth': -1,
            'reg_alpha': 0.0,
            'reg_lambda': 100.0,
            'verbose': 100,
            'max_bin': 277,
            'scale_pos_weight': 1,
            'min_child_samples': 50,
            'min_gain_to_split': 0.5,
    }
    return lgb.LGBMClassifier(**lgbm_params, n_estimators=2500)

In [20]:
#del train_x, train_y, valid_x, valid_y
gc.collect()


Out[20]:
0

In [21]:
oof_preds = np.zeros(train_df.shape[0])
test_preds = []
final_preds = np.zeros(test_df.shape[0])
auc_scores = []
    
for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
    print('Fold', n_fold, 'started at', time.ctime())
    train_x, train_y = train_df[features].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    valid_x, valid_y = train_df[features].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
    
    clf = get_model()
    gc.collect()
    clf.fit(train_x, train_y, 
            eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=77)
    
    # Out-of-fold predictions for this fold's validation slice.
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    # Rank-transform the test predictions before summing (rank-average blend).
    y_pred = clf.predict_proba(test_df[features], num_iteration=clf.best_iteration_)[:, 1]
    final_preds += pd.Series(y_pred).rank().values
    test_preds.append(y_pred)
    auc_scores.append(roc_auc_score(valid_y, oof_preds[valid_idx]))
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_scores[-1]))

print("\n", np.mean(auc_scores), np.std(auc_scores), roc_auc_score(train_df['TARGET'], oof_preds))

# Scale the summed ranks into (0, 1]; AUC depends only on the ordering.
final_preds /= final_preds.max()


Fold 0 started at Tue Aug 28 23:25:07 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[1]	valid_0's auc: 0.793652	valid_1's auc: 0.798035
Fold  1 AUC : 0.798035
Fold 1 started at Tue Aug 28 23:26:23 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[1]	valid_0's auc: 0.794039	valid_1's auc: 0.78372
Fold  2 AUC : 0.783720
Fold 2 started at Tue Aug 28 23:27:39 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[18]	valid_0's auc: 0.794682	valid_1's auc: 0.79023
Fold  3 AUC : 0.790230
Fold 3 started at Tue Aug 28 23:28:58 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[1]	valid_0's auc: 0.792857	valid_1's auc: 0.789631
Fold  4 AUC : 0.789631
Fold 4 started at Tue Aug 28 23:30:13 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[1]	valid_0's auc: 0.792229	valid_1's auc: 0.787238
Fold  5 AUC : 0.787238
Fold 5 started at Tue Aug 28 23:31:29 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[1]	valid_0's auc: 0.792327	valid_1's auc: 0.783957
Fold  6 AUC : 0.783957
Fold 6 started at Tue Aug 28 23:32:45 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[3]	valid_0's auc: 0.792765	valid_1's auc: 0.791011
Fold  7 AUC : 0.791011
Fold 7 started at Tue Aug 28 23:34:01 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[20]	valid_0's auc: 0.794839	valid_1's auc: 0.787506
Fold  8 AUC : 0.787506
Fold 8 started at Tue Aug 28 23:35:23 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[1]	valid_0's auc: 0.794064	valid_1's auc: 0.797604
Fold  9 AUC : 0.797604
Fold 9 started at Tue Aug 28 23:36:38 2018
Training until validation scores don't improve for 77 rounds.
Early stopping, best iteration is:
[1]	valid_0's auc: 0.792445	valid_1's auc: 0.783486
Fold 10 AUC : 0.783486

 0.7892417887832485 0.005000662000601599 0.7682839842247293
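
Test-side blending here is a rank average: each fold's probabilities are converted to ranks before summing, so folds whose outputs sit on different scales contribute equally, and dividing by the maximum afterwards maps the scores into (0, 1]. Since AUC depends only on the ordering, nothing is lost for the evaluation metric. The same transform on toy data:

# Rank-average blending on toy data (illustration only)
import numpy as np
import pandas as pd

fold_preds = [np.array([0.10, 0.80, 0.30]),
              np.array([0.20, 0.90, 0.50])]
blend = np.zeros(3)
for p in fold_preds:
    blend += pd.Series(p).rank().values  # ranks 1..n, ties averaged
blend /= blend.max()                     # scale into (0, 1]
print(blend)                             # [0.3333..., 1.0, 0.6666...]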

In [22]:
sub = pd.DataFrame({'SK_ID_CURR': test_df['SK_ID_CURR'],
                    'TARGET': final_preds})
sub.to_csv('./insaf_nept_gb_lgb_cv07892_std_0050.csv', index=False)

for_blending = {'train': oof_preds,
                'test': test_preds}
joblib.dump(for_blending, './insaf_nept_gb_lgb_cv07892_std_0050.pkl')


Out[22]:
['./insaf_nept_gb_lgb_cv07892_std_0050.pkl']
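
The dumped dictionary keeps both the OOF vector and the per-fold test predictions, so this model can later be averaged or stacked with others. A minimal sketch of how such an artifact might be consumed (the second model's filename is hypothetical):

# Blending two saved artifacts (sketch; './other_model.pkl' is hypothetical)
import numpy as np
import joblib

a = joblib.load('./insaf_nept_gb_lgb_cv07892_std_0050.pkl')
b = joblib.load('./other_model.pkl')  # hypothetical second model

oof_blend = (a['train'] + b['train']) / 2              # blended OOF predictions
test_blend = (np.mean(a['test'], axis=0)
              + np.mean(b['test'], axis=0)) / 2        # fold-mean, then model-mean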

In [ ]: