In [ ]:
import modeling_utils.data_prep_new as data_prep
from sklearn.externals import joblib  # on scikit-learn >= 0.23, use `import joblib` instead
import time
import dir_constants as dc
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [ ]:
platform = 'lendingclub'
store = pd.HDFStore(
    dc.home_path + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.format(platform),
    append=True)
In [ ]:
store.open()
train = store['train_filtered_columns']
test = store['test_filtered_columns']
loan_npv_rois = store['loan_npv_rois']
default_series = test['target_strict']
results = store['results']
store.close()
train_ids = set(train.index.values)
test_ids = set(test.index.values)
assert len(train_ids.intersection(test_ids)) == 0
In [ ]:
test_X, test_y = data_prep.process_data_test(test)
test_y = test_y['npv_roi_10'].values
regr = joblib.load('model_dump/model_0.2.1.pkl')
regr_version = '0.2.1'
test_yhat = regr.predict(test_X)
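In [ ]:
# Hedged sanity check (not part of the original flow): simple error metrics for
# the predictions above, comparing test_yhat against the realized npv_roi_10 values.
from sklearn.metrics import mean_squared_error, r2_score

print('MSE: {0:.5f}'.format(mean_squared_error(test_y, test_yhat)))
print('R^2: {0:.5f}'.format(r2_score(test_y, test_yhat)))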
In [ ]:
for col in test_X.columns:
    if len(test_X[test_X[col].isnull()]) > 0:
        print(col)
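In [ ]:
# Equivalent, more concise version of the loop above (a sketch using the same
# test_X): list any columns that still contain missing values after data prep.
test_X.columns[test_X.isnull().any()].tolist()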
In [ ]:
importances = regr.feature_importances_
std = np.std([tree.feature_importances_ for tree in regr.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
columns = test_X.columns.values

# Print the feature ranking
print("Feature ranking:")
for f in range(test_X.shape[1]):
    print("{0}. feature {1}: {2} {3}".format(f + 1, indices[f],
                                             columns[indices[f]],
                                             importances[indices[f]]))

# Plot the feature importances of the regr
plt.figure()
plt.title("Feature importances")
plt.bar(range(test_X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(test_X.shape[1]), indices)
plt.xlim([-1, test_X.shape[1]])
plt.show()
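In [ ]:
# Same ranking as above, collected into a DataFrame for easier inspection
# (a convenience sketch; reuses importances, std, indices, and columns).
feat_imp = pd.DataFrame({
    'feature': columns[indices],
    'importance': importances[indices],
    'std': std[indices]
})
feat_imp.head(10)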
In [ ]:
test['0.2.1_scores'] = test_yhat
In [ ]:
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000
plt.figure(figsize=(12,9))
plt.plot(test['0.2.1_scores'], test['npv_roi_10'], 'o')
plt.show()
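In [ ]:
# Numeric complement to the scatter above (a sketch): rank correlation between
# the 0.2.1 scores and the realized npv_roi_10 values.
test[['0.2.1_scores', 'npv_roi_10']].corr(method='spearman')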
In [ ]:
pos_scores = test[test['0.2.1_scores'] >= 0]
ps_pos_returns = pos_scores[pos_scores['npv_roi_10'] > 0]
In [ ]:
len(ps_pos_returns)/len(pos_scores)
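In [ ]:
# For context (a sketch): the positive-return rate over all test loans, to
# compare against the positive-score subset above.
len(test[test['npv_roi_10'] > 0]) / len(test)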
In [ ]:
percentiles = np.arange(0,100,1)
In [ ]:
def eval_models(trials, port_size, available_loans, test, percentiles):
    results = {}
    pct_default = {}
    test_copy = test.copy()
    for trial in tqdm_notebook(np.arange(trials)):
        loan_ids = np.random.choice(
            test_copy.index.values, available_loans, replace=False)
        loans_to_pick_from = test_copy.loc[loan_ids, :]
        loans_to_pick_from.sort_values('0.2.1_scores', ascending=True, inplace=True)
        chunksize = int(len(loans_to_pick_from) / 100)
        results_dict = {}
        for k, perc in enumerate(percentiles):
            subset = loans_to_pick_from[k * chunksize:(k + 1) * chunksize]
            results_dict[perc] = subset['npv_roi_10'].mean()
        results[trial] = pd.Series(results_dict)
    return pd.DataFrame.from_dict(results).T

# Earlier portfolio-level approach, kept for reference:
# picks = scores_series[:900].index.values
# results[trial] = loan_npv_rois.loc[picks, :].mean().to_dict()
# pct_default[trial] = (default_series.loc[picks].sum()) / port_size
# pct_default_series = pd.Series(pct_default)
# results_df = pd.DataFrame(results).T
# results_df['pct_def'] = pct_default_series
# return results_df
In [ ]:
# As was done with the baseline models: assume 3000 loans are available
# and pick 900 of them.
trials = 20000
port_size = 900
available_loans = 3000
results = eval_models(trials, port_size, available_loans, test, percentiles)
In [ ]:
summaries = results.describe()
In [ ]:
summaries
In [ ]:
plt.figure(figsize=(12,9))
plt.plot(summaries.columns.values, summaries.loc['mean',:], 'o', label='mean')
plt.plot(summaries.columns.values, summaries.loc['25%',:], 'ro', label='25%')
# plt.plot(summaries.columns.values, summaries.loc['50%',:], '-.')
plt.plot(summaries.columns.values, summaries.loc['75%',:], 'ko', label='75%')
plt.title('return per percentile over batches')
plt.legend(loc='best')
plt.xlabel('percentile of 0.2.1_score')
plt.ylabel('npv_roi_10')
plt.show()
In [ ]:
store.open()
store['percentiles_for_0.2.1'] = results
store.close()
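In [ ]:
# Optional round-trip check (a sketch): confirm the new key is present in the
# store after the write above.
store.open()
print(store.keys())
store.close()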
In [ ]: