In [1]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost as xgb
from scipy.stats import skew
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
# set Jupyter to display ALL output from a cell (not just last output)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# set pandas and numpy options to make print format nicer
pd.set_option("display.width",100)
pd.set_option("display.max_columns",1000)
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.max_rows', 500)
np.set_printoptions(linewidth=120, threshold=5000, edgeitems=50, suppress=True)
seed = 42
In [2]:
%%time
print('Reading: loan_stat542.csv into loans dataframe...')
loans = pd.read_csv('loan_stat542.csv')
print('Loans dataframe:', loans.shape)
test_ids = pd.read_csv('Project3_test_id.csv', dtype={'test1':int,'test2':int, 'test3':int,})
print('ids dataframe:', test_ids.shape)
trains = []
tests = []
labels = []
for i, col in enumerate(test_ids.columns):
    trains.append(loans.loc[~loans.id.isin(test_ids[col]), :])
    tests.append( loans.loc[ loans.id.isin(test_ids[col]), loans.columns!='loan_status'])
    labels.append(loans.loc[ loans.id.isin(test_ids[col]), ['id','loan_status']].copy())   # copy to avoid SettingWithCopyWarning
    labels[i]["y"] = (labels[i].loan_status != 'Fully Paid').astype(int)
    labels[i].drop('loan_status', axis=1, inplace=True)
    print('Fold', i, trains[i].shape, tests[i].shape, labels[i].shape)
print('Writing train, test, labels csv files...')
# fold=0
# _ = trains[fold].to_csv('train.csv', index=False)
# _ = tests [fold].to_csv('test.csv', index=False)
# _ = labels[fold].to_csv('label.csv', index=False)
print('Done!')
In [3]:
def process(data):
    data['emp_length'] = data.emp_length.fillna('Unknown').str.replace('<','LT')
    data['dti'] = data.dti.fillna(0)
    data['revol_util'] = data.revol_util.fillna(0)
    data['mort_acc'] = data.mort_acc.fillna(0)
    data['pub_rec_bankruptcies'] = data.pub_rec_bankruptcies.fillna(0)
    temp = pd.to_datetime(data.earliest_cr_line)
    data['earliest_cr_line'] = temp.dt.year*12 - 1950*12 + temp.dt.month   # months since Jan 1950
    data.drop(['emp_title','title','zip_code','grade','fico_range_high'], axis=1, inplace=True)
    return data

def logloss(y, p):
    loglosses = np.where(y==1, -np.log(p+1e-15), -np.log(1-p+1e-15))
    return np.mean(loglosses)
def prep_train_test(train, test):
    train = process(train)
    X_train = train.drop(['loan_status'], axis=1)
    X_train = pd.get_dummies(X_train)                            # replace categoricals with dummy variables
    X_train = X_train.reindex(sorted(X_train.columns), axis=1)   # sort columns into a fixed sequence
    y_train = (train.loan_status!='Fully Paid').astype(int)
    test = process(test)
    X_test = pd.get_dummies(test)                                # replace categoricals with dummy variables
    all_columns = X_train.columns.union(X_test.columns)          # add columns to test that are in train but not test
    X_test = X_test.reindex(columns=all_columns).fillna(0)
    X_test = X_test.reindex(sorted(X_train.columns), axis=1)     # sort columns into the same sequence as train
    return X_train, y_train, X_test
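Side note (not part of the notebook's pipeline): the union/reindex steps above can also be written with pandas' DataFrame.align, which keeps exactly the training columns and zero-fills any dummy levels missing from the test split. A minimal sketch, assuming the same dummy-encoded frames:
X_train_aligned, X_test_aligned = X_train.align(X_test, join='left', axis=1, fill_value=0)
# join='left' forces X_test into X_train's column set and order, filling absent dummy columns with 0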
In [26]:
%%time
import time
seed=42
models = [
    LogisticRegression(penalty='l1', C=1, solver='liblinear', random_state=seed),   # liblinear/saga are the solvers that support l1
    # GradientBoostingClassifier(max_features='sqrt', learning_rate=0.055, n_estimators=780, max_depth=7,
    #                            min_samples_leaf=2, subsample=0.9, min_samples_split=4,
    #                            min_weight_fraction_leaf=0, random_state=seed),
    xgb.XGBClassifier(learning_rate=0.037, n_estimators=860, min_child_weight=8, max_depth=7, gamma=0.3,
                      subsample=0.52, colsample_bytree=0.92, reg_lambda=0.67, reg_alpha=0.03,
                      objective='binary:logistic', n_jobs=-1, random_state=seed, eval_metric='logloss'),
]
num_models, num_folds = len(models), len(test_ids.columns)
errors = np.zeros([num_models, num_folds])
for fold in range(num_folds):
    np.random.seed(seed=seed)
    train = trains[fold].copy()
    test  = tests[fold].copy()
    label = labels[fold].copy()
    fraction = 1
    if fraction < 1:
        train = train.sample(frac=fraction, random_state=seed)
        test = test.sample(frac=fraction*4, random_state=seed)
    print()
    X_train, y_train, X_test = prep_train_test(train, test)
    # print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    for i, model in enumerate(models):
        start_time = time.time()
        _ = model.fit(X_train, y_train)
        probs = model.predict_proba(X_test)[:,1]
        y_test = pd.merge(tests[fold][['id']], label, how='left', on='id')
        errors[i, fold] = logloss(y_test.y, probs)
        print('{:26.26} Fold={}, Runtime={:8.2f} seconds, logloss={:8.5f}'.format(
            type(model).__name__, fold, round(time.time()-start_time,2), errors[i,fold]))
        # df = pd.DataFrame({'id': test.id, 'prob': probs.round(5)})
        # df.to_csv('mysubmission'+str(i+1)+'.txt', index=False)
        # print('Created mysubmission'+str(i+1)+'.txt, rows=', df.shape[0])
print("\nSUMMARY:")
for i, m in enumerate(models):
    print('{:26.26} mean logloss={:8.5f}'.format(type(m).__name__, errors[i,:].mean()))
In [ ]:
In [17]:
from skopt import gp_minimize, gbrt_minimize
from skopt.plots import plot_convergence
import datetime, warnings
def objective(values):
    index = str(values)
    if index in cache:
        print('GET FROM CACHE:', index, round(cache[index],4))
        return cache[index]
    if model_type == 'LogisticRegression':
        params = {'penalty': values[0], 'C': values[1],}
        model = LogisticRegression(**params, solver='liblinear', random_state=seed)   # liblinear supports the l1 penalty
    # if model_type == 'RandomForestClassifier':
    #     params = {'n_estimators': values[0], 'max_features': values[1], 'max_depth': values[2],}
    #     model = RandomForestClassifier(**params, n_jobs=-1)
    if model_type == 'GradientBoostingClassifier':
        params = {'learning_rate': values[0], 'n_estimators': values[1], 'max_depth': values[2],
                  'min_samples_split': values[3], 'min_samples_leaf': values[4],
                  'min_weight_fraction_leaf': values[5], 'subsample': values[6], 'max_features': values[7]}
        model = GradientBoostingClassifier(**params, random_state=seed)
    if model_type == 'XGBClassifier':
        params = {'learning_rate': values[0], 'n_estimators': int(values[1]), 'min_child_weight': int(values[2]),
                  'max_depth': int(values[3]), 'gamma': values[4], 'subsample': values[5],
                  'colsample_bytree': values[6], 'reg_lambda': values[7], 'reg_alpha': values[8],
                  'eval_metric': 'logloss'}
        model = xgb.XGBClassifier(**params, random_state=seed, n_jobs=-1)
    print(datetime.datetime.now().time().replace(microsecond=0), ', Params', params)
    # scores = -cross_val_score(model, X_train, y_train, scoring="neg_log_loss", cv=5, n_jobs=-1)
    _ = model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)[:,1]
    y_test = pd.merge(test[['id']], label, how='left', on='id')
    cache[index] = np.mean(logloss(y_test.y, probs))
    return cache[index]
In [ ]:
# %%time
import warnings
np.random.seed(seed)
warnings.filterwarnings("ignore", category=UserWarning) # turn off already evaluated errors
params={'LogisticRegression': [
['l1',],
(1e-1,1e+1,'uniform'),
],
'GradientBoostingClassifier': [
(0.04, 0.10, 'uniform'), # learning rate
(500, 900), # n_estimators
(3, 7), # max_depth
(2, 5), # min_samples_split
(2, 5), # min_samples_leaf
(0, 0.3), # min_weight_fraction_leaf
(0.8, 1.0,'uniform'), # subsample
('sqrt',), # max_features
],
'XGBClassifier': [
(0.01, 0.05, 'uniform'), # learning_rate 0.05, 0.3,
(300, 700), # n_estimators
(5, 9), # min_child_weight
(4, 8), # max_depth 3-10
(0, 0.5, 'uniform'), # gamma 0-0.4
(0.4, 1.0, 'uniform'), # subsample 0.5 - 0.99
(0.8, 1.0, 'uniform'), # colsample_bytree 0.5 - 0.99
(0.8, 1.0, 'uniform'), # reg_lambda
(0.0, 0.5, 'uniform'), # reg_alpha
],}
train = trains[0].copy()
test = tests[0].copy()
label = labels[0].copy()
fraction = 1
if fraction < 1:
    train = train.sample(frac=fraction, random_state=seed)
    test = test.sample(frac=fraction, random_state=seed)
print(train.shape)
X_train, y_train, X_test = prep_train_test(train, test)
print(X_train.shape, y_train.shape, X_test.shape)
model_types = params.keys()
model_types = ['GradientBoostingClassifier']
for model_type in model_types:
    cache = {}
    space = params[model_type]
    result = gbrt_minimize(objective, space, n_random_starts=15, n_calls=300, random_state=seed, verbose=True, n_jobs=-1)
    print('\n', model_type, ', Best Params=', result.x, ' Best Score=', round(result.fun,6), '\n')
    _ = plt.figure(figsize=(15,8))
    _ = plot_convergence(result, yscale='log')
warnings.filterwarnings("default", category=UserWarning) # turn on already evaluated errors
In [ ]:
In [ ]:
sorted_d = sorted(cache.items(), key=lambda x: x[1])
temp = []
for i in range(len(sorted_d)):
    temp.append((sorted_d[i][0], round(sorted_d[i][1],5)))
    print('{} {}'.format(round(sorted_d[i][1],5), sorted_d[i][0]))
In [ ]:
In [ ]:
In [75]:
%%time
XGB_params = {
'learning_rate': np.linspace(0.05, 1, 1), # 0.03 to 0.2, tuned to 0.05
'min_child_weight': np.linspace(5, 10, 1, dtype=int), # 1 to 6, tuned to 2
'max_depth': np.linspace(4, 8, 1, dtype=int), # 3 to 10, tuned to 3
'gamma': np.linspace(0.2, 0.4, 1), # 0 to 0.4, tuned to 0
'subsample': np.linspace(1, 1, 1), # 0.6 to 1, tuned to 1.0
'colsample_bytree': np.linspace(0.75, 1, 1), # 0.6 to 1, tuned to 0.6
'reg_lambda': np.linspace(0.25, 0.6, 1), # 0 to 1, tuned to 1.0
'reg_alpha': np.linspace(0.15, 0.5, 1), # 0 to 1, tuned to 0.5
'silent': [1,],
}
train = trains[0].copy()
test = tests[0].copy()
fraction = 0.05
if fraction < 1:
    train = train.sample(frac=fraction, random_state=seed)
    test = test.sample(frac=fraction*4, random_state=seed)
print(train.shape, test.shape)
X_train, y_train, X_test= prep_train_test(train, test)
print(X_train.shape, y_train.shape, X_test.shape)
tune_cv=False # perform CV just to get number of boost rounds/estimators before using GridSearchCV
if tune_cv:
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    cv_params = {'objective': 'binary:logistic', 'learning_rate': 0.05, 'min_child_weight': 5, 'max_depth': 4,
                 'gamma': 0.2, 'subsample': 1.0, 'colsample_bytree': 0.75, 'reg_lambda': 0.25, 'reg_alpha': 0.15}
    cvresult = xgb.cv(cv_params, xgtrain, num_boost_round=1000, early_stopping_rounds=50, nfold=10,
                      metrics='logloss', verbose_eval=10, show_stdv=False)
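    # Hedged aside (not in the original cell): with early stopping, cvresult holds one row per
    # completed boosting round, so its length suggests the n_estimators to feed into GridSearchCV below.
    print('Suggested n_estimators from CV:', cvresult.shape[0])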
else:
    XGBbase = xgb.XGBClassifier(n_estimators=150, objective='binary:logistic', random_state=seed, n_jobs=-1)
    XGBmodel = GridSearchCV(XGBbase, cv=10, n_jobs=-1, param_grid=XGB_params, scoring='neg_log_loss', verbose=5)
    _ = XGBmodel.fit(X_train, y_train)
    print(XGBmodel.best_estimator_)
    print(XGBmodel.best_params_)
    print(-round(XGBmodel.best_score_,6))
# fig, ax = plt.subplots(1,1,figsize=(16,26))
# _ = xgb.plot_importance(model, ax=ax)
In [101]:
# rejected models
# KNeighborsClassifier(500), # tuned on 5% of data
# SVC(gamma='auto', C=1, probability=True, random_state=seed), # quite slow
# DecisionTreeClassifier(max_depth=2, random_state=seed),
# RandomForestClassifier(n_estimators=300, random_state=seed),
# AdaBoostClassifier(random_state=seed),
# GaussianNB(),
# LinearDiscriminantAnalysis(),
In [40]:
temp = X_train.corr()
temp = temp[temp>0.50]
temp.values[np.arange(len(temp.columns)), np.arange(len(temp.columns))] = np.nan   # blank the diagonal (self-correlations)
temp = temp.dropna(how='all', axis=0)
temp = temp.dropna(how='all', axis=1)
temp
Out[40]:
In [19]:
m1 = models[-1]
df = pd.DataFrame({'id': test.id, 'prob': probs.round(11)})
df.to_csv('mysubmission1.txt', index=False)
print('Created mysubmission1.txt, rows=', df.shape[0], ', Model=', type(m1).__name__)
m2 = models[-1]
df = pd.DataFrame({'id': test.id, 'prob': probs.round(11)})
df.to_csv('mysubmission2.txt', index=False)
print('Created mysubmission2.txt, rows=', df.shape[0], ', Model=', type(m2).__name__)
m3 = models[-1]
df = pd.DataFrame({'id': test.id, 'prob': probs.round(11)})
df.to_csv('mysubmission3.txt', index=False)
print('Created mysubmission3.txt, rows=', df.shape[0], ', Model=', type(m3).__name__)
In [ ]:
%%time
loans['emp_title'] = loans.emp_title.fillna('_unknown').str.lower()
loans['emp_title1'] = categorise_emp_title(loans.emp_title)
loans['default'] = (loans.loan_status!='Fully Paid').astype(int)
g1 = loans.groupby(['emp_title','emp_title1'])['default'].agg(['sum','count'])
g1.columns = ['sum1','count1']
g1['rate1'] = g1['sum1'] / g1['count1']
g1 = g1.sort_values("rate1", ascending=False)
In [24]:
features = X_train.columns
importances = model.feature_importances_
indices = np.argsort(importances*-1)
num_features = 40
indices = indices[0:num_features]
_ = plt.figure(figsize=(16,7))
_ = plt.title('XGBoost: Most Important Features')
_ = plt.bar(range(len(indices)), importances[indices], color='steelblue')
_ = plt.xticks(range(len(indices)), [features[i] for i in indices], rotation=85, fontsize = 14)
_ = plt.ylabel('Relative Importance')
_ = plt.xlim(-1,num_features-1)
_ = plt.tight_layout()
_ = plt.savefig('features.png',dpi=100)
In [ ]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
np.random.seed(seed)
# TRAIN ELASTICNET MODEL AND GENERATE SUBMISSION FILE (assumes an rmse() helper is defined elsewhere)
model = ElasticNet(alpha=0.001, l1_ratio=0.4, max_iter=5000, tol=0.0001, random_state=seed)
_ = model.fit(X_train, y_train)
XGBpreds = model.predict(X_test)
XGB_df = pd.DataFrame({'PID': y_test.index, 'Sale_Price': np.expm1(XGBpreds).round(1)})
XGB_df.to_csv('mysubmission1.txt', index=False)
print('Created mysubmission1.txt, rows=', XGB_df.shape[0],
', Model=', type(model).__name__, ', RMSElogPrice =', round( rmse(y_test, XGBpreds),6 ))
# TRAIN GBM MODEL AND GENERATE SUBMISSION FILE
model = GradientBoostingRegressor(learning_rate=0.03, n_estimators=550, max_depth=5, min_samples_split=4,
min_samples_leaf=3, min_weight_fraction_leaf=0, subsample=0.64, max_features='sqrt', random_state=seed)
_ = model.fit(X_train, y_train)
ENet_preds = model.predict(X_test)
ENet_df = pd.DataFrame({'PID': y_test.index, 'Sale_Price': np.expm1(ENet_preds).round(1)})
ENet_df.to_csv('mysubmission2.txt', index=False)
print('Created mysubmission2.txt, rows=', ENet_df.shape[0],
', Model=', type(model).__name__, ', RMSElogPrice =', round( rmse(y_test, ENet_preds),6 ))
# RE-READ SUBMISSION FILES AND CHECK FOR CORRECTNESS
temp = pd.read_csv('mysubmission1.txt')
print('\nChecking mysubmission1 file, RMSE=', round(rmse(np.log1p(temp.Sale_Price), y_test.values),6) )
temp = pd.read_csv('mysubmission2.txt')
print('Checking mysubmission2 file, RMSE=', round(rmse(np.log1p(temp.Sale_Price), y_test.values),6) )
loan_amnt: could round by 100 to 500 as categorical (NO IMPROVE)
term: ok as-is, two values and categorical (object)
int_rate: could round by 0.1 to 0.5 as categorical, Note: 629 values, 5.31 to 30.99 (TINY IMPROVE)
installment: could try log transform but unlikely, Note lots of values from 4.93 to 1714.54
grade: ok as-is, 7 values categorical
sub_grade: ok as-is, 35 values categorical
emp_title: emp_title.fillna('unknown').str.lower() - tuning or consolidation didn't help results
emp_length: emp_length.fillna('unknown'), otherwise ok as-is, 11 values categorical
home_ownership: ok as-is, 6 values categorical
annual_inc: Note: 0 to 9,550,000, could try log1p transform but probably ok as-is
verification_status: ok as-is, 3 values categorical
loan_status: On train only: (train.loan_status != 'Fully Paid').astype(int)
purpose: ok as-is 14 values categorical
title: title.fillna('Unknown').str.lower() - tuning or consolidation didn't help results
zip_code: ok as-is, consolidation made it worse
addr_state: ok as-is, 51 values categorical
dti: is a ratio of debt payment to income, Note: 5362 values from -1 to 999 (only 1 is negative)
earliest_cr_line: maybe convert to an integer, currently mon-yyyy, from apr-1955 to sep-2014
fico_range_low: ok as-is, integer from 625 to 845 in steps of 5 (40 values)
fico_range_high: ok as-is, integer from 629 to 850 in steps of 5 (40 values)
open_acc: ok as-is, integer from 0 to 90, 78 values, but some are rare so could consolidate infreq values
pub_rec: ok as-is, integer from 0 to 86, 30 values, but some are rare so could consolidate infreq values
revol_bal: ok as-is, float from 0 to 2,568,995, could try log1p transform
revol_util: data.revol_util.fillna(0 or 9999), otherwise ok as-is, 1319 values from 0 to 892.3
total_acc: ok as-is, 133 values from 2 to 176
initial_list_status: ok as-is, categorical 2 values
application_type: ok as-is, categorical 2 values
mort_acc: data.mort_acc.fillna(0 or 9999), otherwise ok. 38 values from 0 to 51
pub_rec_bankruptcies: data.pub_rec_bankruptcies.fillna(0 or 999) 10 values from 0 to 12
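The notes above mention a few optional transforms that process() does not apply. A rough sketch of what they could look like (hypothetical helper, assuming a loans-style dataframe; not tuned or validated here):
def extra_transforms(data):
    data['annual_inc'] = np.log1p(data.annual_inc)                        # compress the 0 to 9,550,000 range
    data['revol_bal'] = np.log1p(data.revol_bal)                          # compress the 0 to 2,568,995 range
    data['int_rate'] = ((data.int_rate * 2).round() / 2).astype(str)      # nearest 0.5, as a categorical for get_dummies
    return data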