In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import time
import lightgbm as lgb
import warnings
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_score, recall_score
%matplotlib inline
warnings.simplefilter(action='ignore', category=FutureWarning)
In [2]:
import os
import joblib
from collections import Counter
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
FOLDS_PATH = './folds/folds.pkl'
folds_idx = joblib.load(FOLDS_PATH)
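The fold indices are loaded from disk so that every model in the blending pipeline trains on identical splits. The file itself is produced outside this notebook; a minimal sketch of how such a list of (train_idx, valid_idx) pairs could be generated (the 5 splits, shuffling, seed, and the placeholder X/y are all assumptions, not taken from this notebook):

# Hypothetical generation of folds.pkl; X and y stand for the training
# features and the TARGET column, and the split settings are assumptions.
from sklearn.model_selection import StratifiedKFold
import joblib
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds_idx = [(trn_idx, val_idx) for trn_idx, val_idx in skf.split(X, y)]
joblib.dump(folds_idx, './folds/folds.pkl')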
In [3]:
train_fet = pd.read_csv('./neptune/train_features.csv.gz')
test = pd.read_csv('./neptune/test_features.csv.gz')
In [4]:
def reduce_mem_usage(data, verbose=True):
    # Downcast each numeric column to the smallest dtype that can hold its
    # observed [min, max] range.
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))
    for col in data.columns:
        col_type = data[col].dtype
        if col_type != object:
            c_min = data[col].min()
            c_max = data[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)
    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return data
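One caveat to keep in mind when reusing this routine: the float branch only tests the representable range, so any column whose values fit inside float16 bounds is downcast even though float16 keeps only about three significant decimal digits. A tiny standalone illustration (toy value, not from this pipeline):

# float16 passes the range check but silently rounds the value.
import numpy as np
import pandas as pd
s = pd.Series([0.123456789])
print(s.astype(np.float16).iloc[0])  # ~0.1235, precision lost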
In [5]:
train_fet = reduce_mem_usage(train_fet)
test = reduce_mem_usage(test)
In [6]:
train_gp = reduce_mem_usage(pd.read_csv('./input/train_df_gp.zip'))
test_gp = reduce_mem_usage(pd.read_csv('./input/test_df_gp.zip'))
In [7]:
application_train = reduce_mem_usage(pd.read_csv('./input/application_train.csv')[['SK_ID_CURR', 'TARGET']])
In [8]:
application_test = reduce_mem_usage(pd.read_csv('./input/application_test.csv')[['SK_ID_CURR']])
test = pd.concat([application_test.SK_ID_CURR, test], axis=1)  # relies on both frames sharing application_test's row order
In [9]:
train_fet = pd.concat([application_train.SK_ID_CURR, train_fet], axis=1)
In [10]:
train_fet = pd.merge(train_fet, train_gp, how='left', on=['SK_ID_CURR'])
del train_gp
gc.collect()
In [11]:
test = pd.merge(test, test_gp, how='left', on=['SK_ID_CURR'])
del test_gp
gc.collect()
In [12]:
test.shape, train_fet.shape
In [13]:
# Re-read TARGET from the raw file, move it to the front of the training
# frame, and build the feature list. Note that only TARGET is excluded
# from `features`, so SK_ID_CURR remains a model input.
application_train = reduce_mem_usage(pd.read_csv('./input/application_train.csv')[['TARGET']])
train_fet.drop('TARGET', axis=1, inplace=True)
train_fet = pd.concat([application_train.TARGET, train_fet], axis=1)
train_df = train_fet
test_df = test
features = [col for col in train_df.columns if col != 'TARGET']
In [14]:
del train_fet, test
gc.collect()
In [15]:
del application_train, application_test
gc.collect()
In [46]:
gc.collect()
In [19]:
def get_model():
    lgbm_params = {
        'objective': 'binary',
        'nthread': 15,
        'metric': 'auc',
        #'n_estimators': 10000,
        'learning_rate': .01,
        'num_leaves': 35,
        'colsample_bytree': .2,
        'subsample': 1.0,
        'subsample_freq': 1,
        'max_depth': -1,
        'reg_alpha': 0.0,
        'reg_lambda': 100.0,
        #'silent': -1,
        'verbose': 100,
        'max_bin': 277,
        'scale_pos_weight': 1,
        #'number_boosting_rounds': 777,
        #'early_stopping_rounds': 100,
        'min_child_samples': 50,
        'min_gain_to_split': 0.5
    }
    return lgb.LGBMClassifier(**lgbm_params, n_estimators=2500)
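n_estimators=2500 is only an upper bound on the boosting rounds; the CV loop below relies on early stopping and then predicts at clf.best_iteration_. A quick sanity check of the parameters the sklearn wrapper will actually pass to LightGBM (standard get_params(), nothing model-specific assumed):

# Inspect the effective wrapper parameters.
clf = get_model()
print(clf.get_params()['subsample'], clf.get_params()['reg_lambda'])  # 1.0 100.0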
In [20]:
#del train_x, train_y, valid_x, valid_y
gc.collect()
In [21]:
oof_preds = np.zeros(train_df.shape[0])
test_preds = []
final_preds = np.zeros(test_df.shape[0])
auc_scores = []
for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
    print('Fold', n_fold, 'started at', time.ctime())
    clf = get_model()
    gc.collect()
    clf.fit(train_df[features].iloc[train_idx], train_df['TARGET'].iloc[train_idx],
            eval_set=[(train_df[features].iloc[train_idx], train_df['TARGET'].iloc[train_idx]),
                      (train_df[features].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx])],
            eval_metric='auc', verbose=100, early_stopping_rounds=77)
    # Out-of-fold predictions for the validation slice, at the best iteration.
    oof_preds[valid_idx] = clf.predict_proba(train_df[features].iloc[valid_idx],
                                             num_iteration=clf.best_iteration_)[:, 1]
    # Test predictions are accumulated as ranks for rank-averaged blending.
    y_pred = clf.predict_proba(test_df[features], num_iteration=clf.best_iteration_)[:, 1]
    final_preds += pd.Series(y_pred).rank().values
    test_preds.append(y_pred)
    auc_scores.append(roc_auc_score(train_df['TARGET'].iloc[valid_idx], oof_preds[valid_idx]))
    print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_scores[-1]))
print("\n", np.mean(auc_scores), np.std(auc_scores), roc_auc_score(train_df['TARGET'], oof_preds))
final_preds /= final_preds.max()  # scale summed ranks into (0, 1]
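Note how the folds' test predictions are combined: each fold contributes the ranks of its probabilities rather than the raw values, and the summed ranks are scaled into (0, 1] by dividing by their maximum. Since AUC depends only on the ordering, this blends the folds' orderings while ignoring differences in probability calibration. A toy illustration with invented numbers:

# Toy illustration of the rank-average blend used above.
import numpy as np
import pandas as pd
fold_a = np.array([0.10, 0.80, 0.30])  # fold 1 probabilities (invented)
fold_b = np.array([0.60, 0.20, 0.90])  # fold 2 probabilities (invented)
blend = pd.Series(fold_a).rank().values + pd.Series(fold_b).rank().values
blend /= blend.max()
print(blend)  # [0.6 0.8 1. ]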
In [22]:
sub = pd.DataFrame({'SK_ID_CURR': test_df['SK_ID_CURR'],
                    'TARGET': final_preds})
sub.to_csv('./insaf_nept_gb_lgb_cv07892_std_0050.csv', index=False)
# Persist OOF and per-fold test predictions for downstream blending.
for_blending = {'train': oof_preds,
                'test': test_preds}
joblib.dump(for_blending, './insaf_nept_gb_lgb_cv07892_std_0050.pkl')
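The saved dict (OOF predictions for the train set plus a list of per-fold test predictions) is in a convenient shape for later blending with other models. A minimal sketch of downstream use, assuming a hypothetical second model saved in the same format under './some_other_model.pkl':

# Hypothetical blending step; the second model's file name is an assumption.
import joblib
import numpy as np
import pandas as pd
m1 = joblib.load('./insaf_nept_gb_lgb_cv07892_std_0050.pkl')
m2 = joblib.load('./some_other_model.pkl')
# Mean of per-fold test predictions per model, then a simple rank blend.
t1 = np.mean(m1['test'], axis=0)
t2 = np.mean(m2['test'], axis=0)
blend = pd.Series(t1).rank().values + pd.Series(t2).rank().values
blend /= blend.max()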