In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [ ]:
import data_science.lendingclub.dataprep_and_modeling.modeling_utils.data_prep_new as data_prep
import dir_constants as dc
import joblib  # older sklearn (<0.23) exposed this as sklearn.externals.joblib
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import time
from tqdm import tqdm_notebook
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
In [ ]:
def display_all(df):
    with pd.option_context("display.max_rows", 100, "display.max_columns", 1000):
        display(df)
In [ ]:
platform = 'lendingclub'
store = pd.HDFStore(
    dc.home_path + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.format(platform),
    mode='a')  # HDFStore has no append= kwarg; mode='a' opens for appending
loan_info = store['loan_info_clean']
filtered_col_example = store['base_dataset_filtered_columns']
base_cols = filtered_col_example.columns
del filtered_col_example
npv_rois = store['loan_npv_rois']
loan_info['npv_roi_10'] = npv_rois[0.1]
del npv_rois
# loan_info = loan_info[base_cols]
store.close()
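In [ ]:
# Side note: pd.HDFStore is also a context manager, which guarantees the handle is
# closed even if a read raises. A minimal sketch listing the store's keys that way
# (store_path is just the same path built above):
store_path = dc.home_path + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.format(platform)
with pd.HDFStore(store_path, mode='r') as s:
    print(s.keys())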
In [ ]:
# Split the dataset into train and a held-out portion; the held-out portion is split
# in half below into validation and test sets.
sbd = loan_info[loan_info['maturity_time'] >= .98]  # keep only loans at (or very near) maturity
In [ ]:
print(sbd['issue_d'].value_counts(dropna=False).index.max())
# Last issue date is 2014-10-01.
# For 36-month loans: July/Aug/Sep/Oct 2014 become valid/test, and everything before
# is train for normally-paying loans. Assume ~4 months from last payment to
# charge-off for defaulting loans, so July-Oct still goes to valid/test, but the
# last payment date for defaulting loans in train must be on or before February.
# For 60-month loans: same logic, but with the corresponding 2012 dates.
paid_or_current = sbd['loan_status'].isin(['paid', 'current'])
train_mask_36 = ((sbd['term'] == 36) &
                 (((sbd['last_pymnt_d'] <= '2014-06-01') & paid_or_current) |
                  ((sbd['last_pymnt_d'] <= '2014-02-01') & ~paid_or_current)))
train_mask_60 = ((sbd['term'] == 60) &
                 (((sbd['last_pymnt_d'] <= '2012-06-01') & paid_or_current) |
                  ((sbd['last_pymnt_d'] <= '2012-02-01') & ~paid_or_current)))
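In [ ]:
# Quick sanity check on the masks (a sketch; assumes only the columns used above):
# the two term masks are disjoint by construction, and every 36-month training
# loan's last payment falls on or before the June 2014 cutoff.
assert not (train_mask_36 & train_mask_60).any()
assert (sbd.loc[train_mask_36, 'last_pymnt_d'] <= '2014-06-01').all()
print(train_mask_36.sum(), train_mask_60.sum())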
In [ ]:
# NOTE: this assumes sbd is indexed by loan id, so that the 'id' column and the
# index label the same loans (train_idx holds ids, test_idx holds index labels).
train_idx = np.array(pd.concat([sbd[train_mask_36], sbd[train_mask_60]])['id'])
test_idx = np.array(sbd[~sbd['id'].isin(train_idx)].index)
assert (len(train_idx) + len(test_idx)) == len(sbd)
# set np.random.seed here for a reproducible valid/test split
valid_idx = np.random.choice(test_idx, len(test_idx) // 2, replace=False)
va_set = set(valid_idx)
te_set = set(test_idx).difference(va_set)
tr_set = set(train_idx)
assert len(te_set.intersection(va_set)) == 0
test_idx = np.array(list(te_set))
assert (len(train_idx) + len(test_idx) + len(valid_idx)) == len(sbd)
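In [ ]:
# Eyeball the resulting split sizes and proportions (a quick sketch):
for name, idx in [('train', train_idx), ('valid', valid_idx), ('test', test_idx)]:
    print(name, len(idx), '{:.1%}'.format(len(idx) / len(sbd)))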
In [ ]:
sbd = sbd[base_cols]
# Drop post-origination hardship fields (leakage) plus outcome columns other than
# the target we train on here ('target_strict').
cols_to_drop = [
    'hardship_type', 'hardship_reason', 'hardship_status', 'deferral_term',
    'hardship_amount', 'hardship_start_date', 'hardship_end_date',
    'payment_plan_start_date', 'hardship_length', 'hardship_dpd',
    'hardship_loan_status', 'orig_projected_additional_accrued_interest',
    'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
    'maturity_time', 'maturity_paid', 'target_loose', 'installment_amount',
    'npv_roi_10', 'orig_amt_due',
]
sbd.drop(cols_to_drop, axis=1, inplace=True)
sbd.rename({'issue_d': 'issue_date'}, axis=1, inplace=True)
In [ ]:
add_datepart(sbd, 'issue_date')  # expand issue_date into year/month/day/etc. and drop the original
train_cats(sbd)  # convert string columns to pandas categoricals
df, y, nas = proc_df(sbd, 'target_strict')  # numericalize cats, median-fill NAs, split off the target
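In [ ]:
# Note on proc_df: the third return value, nas, maps each NA-filled column to the
# median used for the fill. If data were processed in a separate pass (not needed
# here, since the split happens after proc_df), the same fills should be reused:
# df_new, _, _ = proc_df(sbd_new, 'target_strict', na_dict=nas)  # sbd_new is hypothetical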
In [ ]:
y = pd.DataFrame(y, index=df.index, columns=['target'])
In [ ]:
# save to a temp dir; feather needs a default RangeIndex, hence the reset_index.
# (to_feather returns None, so its result isn't assigned; the frames are loaded
# back in the next cell.)
os.makedirs('tmp', exist_ok=True)
df.loc[train_idx].fillna(0).reset_index(drop=True).to_feather('tmp/X_train')
df.loc[valid_idx].fillna(0).reset_index(drop=True).to_feather('tmp/X_valid')
df.loc[test_idx].fillna(0).reset_index(drop=True).to_feather('tmp/X_test')
y.loc[train_idx].reset_index(drop=True).to_feather('tmp/y_train')
y.loc[valid_idx].reset_index(drop=True).to_feather('tmp/y_valid')
y.loc[test_idx].reset_index(drop=True).to_feather('tmp/y_test')
In [ ]:
# load back in
X_train = pd.read_feather('tmp/X_train')
X_valid = pd.read_feather('tmp/X_valid')
X_test = pd.read_feather('tmp/X_test')
y_train = pd.read_feather('tmp/y_train')
y_valid = pd.read_feather('tmp/y_valid')
y_test = pd.read_feather('tmp/y_test')
In [ ]:
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train, y_train['target'])  # pass a 1-D target to avoid the column-vector warning
m.score(X_train, y_train)
In [ ]:
def rmse(x, y): return mean_squared_error(x, y) ** 0.5  # root-mean-squared error

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
In [ ]:
m = RandomForestClassifier(n_jobs=-1)
%time m.fit(X_train, y_train['target'])
print_score(m)
In [ ]:
# presumably the majority-class count from the value_counts below, as a share of the validation set
103571 / len(y_valid)
In [ ]:
y_valid['target'].value_counts(dropna=False)
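In [ ]:
# The same majority-class share, computed directly rather than by hand:
y_valid['target'].value_counts(normalize=True, dropna=False)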
In [ ]:
# A single shallow tree with no bagging, as a quick diagnostic
# (note: a regressor fit on the binary target).
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train['target'])
print_score(m)
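In [ ]:
# fastai 0.7's structured module (imported above with *) includes a draw_tree
# helper; a sketch for visualizing the single shallow tree fit above (assumes
# graphviz is installed):
draw_tree(m.estimators_[0], X_train, precision=3)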
In [ ]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train['target'])
print_score(m)
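In [ ]:
# fastai 0.7 also ships rf_feat_importance; a sketch ranking features by the
# forest's impurity-based importances:
fi = rf_feat_importance(m, X_train)
fi[:10]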
In [ ]:
regr = RandomForestRegressor()
In [ ]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
In [ ]:
param_dist = {'n_estimators': np.arange(1, 201),
              'criterion': ['mse'],  # renamed 'squared_error' in sklearn >= 1.0; 'mae' also possible
              'max_features': [3, 10, 50, 70, 100, 150, 200],
              'min_samples_split': [20, 200, 2000],
              'min_samples_leaf': [10, 100, 1000],
              'bootstrap': [True],
              'oob_score': [True],
              'n_jobs': [-1],
              'verbose': [10]}
In [ ]:
n_iter = 20
random_search = RandomizedSearchCV(
    regr, param_distributions=param_dist, n_iter=n_iter)
In [ ]:
# NOTE: standardized (features) and eval_cols (targets) come from an earlier
# data-prep step not shown in this notebook.
start = time.time()
random_search.fit(standardized, eval_cols)
print("RandomizedSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % ((time.time() - start), n_iter))
In [ ]:
report(random_search.cv_results_)
# Model with rank: 1
# Mean validation score: 0.078 (std: 0.001)
# Parameters: {'verbose': 10, 'oob_score': True, 'n_jobs': -1, 'n_estimators': 175, 'min_samples_split': 200, 'min_samples_leaf': 10, 'max_features': 70, 'criterion': 'mse', 'bootstrap': True}
# Model with rank: 2
# Mean validation score: 0.078 (std: 0.001)
# Parameters: {'verbose': 10, 'oob_score': True, 'n_jobs': -1, 'n_estimators': 185, 'min_samples_split': 200, 'min_samples_leaf': 10, 'max_features': 100, 'criterion': 'mse', 'bootstrap': True}
# Model with rank: 3
# Mean validation score: 0.077 (std: 0.002)
# Parameters: {'verbose': 10, 'oob_score': True, 'n_jobs': -1, 'n_estimators': 139, 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': 100, 'criterion': 'mse', 'bootstrap': True}
In [ ]:
regr = RandomForestRegressor(
n_estimators=175,
random_state=0,
max_features=70,
min_samples_split=200,
min_samples_leaf=10,
n_jobs=-1,
oob_score=True,
bootstrap=True,
criterion='mse',
)
regr.fit(standardized, eval_cols)
In [ ]:
regr.score(standardized, eval_cols)
In [ ]:
# dump the model (mean_series and std_dev_series come from the earlier standardization step)
os.makedirs('model_dump', exist_ok=True)
joblib.dump(regr, 'model_dump/model_0.2.1.pkl')
joblib.dump((mean_series, std_dev_series), 'model_dump/mean_stddev.pkl')
In [ ]:
now = time.strftime("%Y_%m_%d_%Hh_%Mm_%Ss")
# info to stick in detailed dataframe describing each model
model_info = {'model_version': '0.2.1',
'target': 'npv_roi_10',
'weights': 'None',
'algo_model': 'RF_regr',
'hyperparams': "bootstrap=True, criterion='mse', max_depth=None, max_features=70, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=10, min_samples_split=200, min_weight_fraction_leaf=0.0, n_estimators=175, n_jobs=-1, oob_score=True, random_state=0, verbose=0, warm_start=False",
'cost_func': 'sklearn default (mse for RandomForestRegressor)',
'useful_notes': 'R2 score of .3830649 (regr.score())',
'date': now}
model_info_df = pd.DataFrame(model_info, index = ['0.2.1'])
In [ ]:
store.open()
model_info = store['model_info']
model_info.loc['0.2.1', :] = model_info_df.values
model_info.sort_index(inplace=True)
store.append(
    'model_info',
    model_info,  # write back the merged table, not just the new row
    data_columns=True,
    index=True,
    append=False,
)
store.close()
In [ ]:
store.open()
test = store['test_filtered_columns']
train = store['train_filtered_columns']
loan_npv_rois = store['loan_npv_rois']
default_series = test['target_strict']
results = store['results']
store.close()
In [ ]:
train_X, train_y = data_prep.process_data_test(train)
train_y = train_y['npv_roi_10'].values
test_X, test_y = data_prep.process_data_test(test)
test_y = test_y['npv_roi_10'].values
regr = joblib.load('model_dump/model_0.2.1.pkl')
regr_version = '0.2.1'
test_yhat = regr.predict(test_X)
train_yhat = regr.predict(train_X)
In [ ]:
test_mse = np.sum((test_yhat - test_y)**2)/len(test_y)
train_mse = np.sum((train_yhat - train_y)**2)/len(train_y)
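In [ ]:
# sanity check: the manual computations above should match sklearn's helper
assert np.isclose(test_mse, mean_squared_error(test_y, test_yhat))
assert np.isclose(train_mse, mean_squared_error(train_y, train_yhat))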
In [ ]:
test_mse
In [ ]:
train_mse
In [ ]:
def eval_models(trials, port_size, available_loans, regr, regr_version, test, loan_npv_rois,
                default_series):
    """Simulate picking the top-scoring port_size loans out of available_loans, trials times."""
    results = {}
    pct_default = {}
    test_copy = test.copy()
    for trial in tqdm_notebook(np.arange(trials)):
        loan_ids = np.random.choice(
            test_copy.index.values, available_loans, replace=False)
        loans_to_pick_from = test_copy.loc[loan_ids, :]
        scores = regr.predict(loans_to_pick_from)
        scores_series = pd.Series(dict(zip(loan_ids, scores)))
        scores_series.sort_values(ascending=False, inplace=True)
        picks = scores_series[:port_size].index.values  # take the top port_size loans
        results[trial] = loan_npv_rois.loc[picks, :].mean().to_dict()
        pct_default[trial] = (default_series.loc[picks].sum()) / port_size
    pct_default_series = pd.Series(pct_default)
    results_df = pd.DataFrame(results).T
    results_df['pct_def'] = pct_default_series
    return results_df
In [ ]:
# As with the baseline models: 3000 loans available, pick 900 of them.
trials = 20000
port_size = 900
available_loans = 3000
model_results = eval_models(trials, port_size, available_loans, regr, regr_version,
                            test_X, loan_npv_rois, default_series)
In [ ]:
multi_index = []
for col in model_results.columns.values:
    multi_index.append((str(col), regr_version))
append_results = model_results.copy()
append_results.columns = pd.MultiIndex.from_tuples(multi_index, names=['discount_rate', 'model'])
In [ ]:
multi_index_results = []
for col in results.columns.values:
    multi_index_results.append((str(col[0]), col[1]))
results.columns = pd.MultiIndex.from_tuples(multi_index_results, names=['discount_rate', 'model'])
In [ ]:
try:
    results = results.join(append_results)
except ValueError:
    # this model's columns already exist in results; overwrite them in place
    results.loc[:, (slice(None), slice('0.2.1', '0.2.1'))] = append_results
results.sort_index(axis=1, inplace=True)
In [ ]:
full_results = results
In [ ]:
full_results.describe()
In [ ]:
store.open()
store['results'] = full_results
store.close()