In [1]:
import numpy as np
import pandas as pd
from transfers.fileio import load_from_pickle
from helpers.preprocessing import process_features, process_payment
from model.validate import actual_IRR
In [2]:
%reload_ext rpy2.ipython
In [3]:
%%R
library(ggplot2)
library(scales)
library(grid)
In [4]:
df_3c = pd.read_csv('data/LoanStats3c_securev1.csv', header=True).iloc[:-2, :]
df_3b = pd.read_csv('data/LoanStats3b_securev1.csv', header=True).iloc[:-2, :]
df_3a = pd.read_csv('data/LoanStats3a_securev1.csv', header=True).iloc[:-2, :]
df_raw = pd.concat((df_3a, df_3b, df_3c), axis=0)
df_raw = df_raw[df_raw['issue_d'].str.contains('20', na=False)]
df_raw['year'] = df_raw['issue_d'].map(lambda x: str(x).split('-')[1])
In [5]:
df_risk = df_3a.copy()
df_risk.rename(columns={'sub_grade': 'subgrade'}, inplace=True)
df_risk['rate'] = df_risk['int_rate'].map(lambda x: float(str(x).strip('%')) / 100)
df_risk['default'] = df_risk['loan_status'].map({
'Fully Paid': 0,
'Does not meet the credit policy. Status:Fully Paid': 0,
'Current': 0,
'Does not meet the credit policy. Status:Current': 0,
'Charged Off': 1,
'Does not meet the credit policy. Status:Charged Off': 1,
'In Grace Period': 0,
'Does not meet the credit policy. Status:In Grace Period': 1,
'Late (16-30 days)': 1,
'Does not meet the credit policy. Status:Late (16-30 days)': 1,
'Late (31-120 days)': 1,
'Does not meet the credit policy. Status:Late (31-120 days)': 1,
'Default': 1})
df_risk = df_risk[df_risk['grade'].isin(['A', 'B', 'C', 'D'])]\
.groupby('subgrade', as_index=False)\
.mean()[['rate', 'default', 'subgrade']]
%Rpush df_risk
In [6]:
%%R -w 800
multiplot = function(..., plotlist=NULL, file, cols=1, layout=NULL) {
require(grid)
# Make a list from the ... arguments and plotlist
plots = c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
p1 = ggplot(df_risk, aes(x = factor(subgrade), y = rate, fill=factor(subgrade))) +
geom_bar(stat = "identity") +
scale_x_discrete(limits = rev(levels(df_risk$subgrade))) +
scale_y_reverse(labels=percent) +
ylab("Interest rate") +
guides(fill=FALSE) +
theme(text = element_text(size=15), axis.title.y = element_blank()) +
coord_flip()
p2 = ggplot(df_risk, aes(x = factor(subgrade), y = default, fill=factor(subgrade))) +
geom_bar(stat = "identity") +
scale_x_discrete(limits = rev(levels(df_risk$subgrade))) +
scale_y_continuous(labels=percent) +
ylab("Default rate") +
guides(fill=FALSE) +
theme(text = element_text(size=15), axis.title.y = element_blank()) +
coord_flip()
multiplot(p1, p2, cols=2)
In [7]:
df_volume = df_raw.groupby('year', as_index=False).sum()[['year', 'loan_amnt']]
%Rpush df_volume
In [8]:
%%R -w 800
volume_formatter = function(x) {h = sprintf('$%d bn',floor(x/1000000000))}
ggplot(df_volume, aes(x = factor(year), y = loan_amnt, fill=factor(year))) +
geom_bar(stat = "identity") +
scale_fill_brewer() +
scale_y_continuous("Total issuance on Lending Club", label=volume_formatter) +
guides(fill=FALSE) +
theme(text = element_text(size=15), axis.title.x = element_blank())
In [9]:
print "Loading model..."
try:
model = load_from_pickle('pickle/model.pkl')
except (OSError, IOError):
print "Model not found. Initializing training process, this might take some time..."
model = initialize_model()
print "Calculating expected IRR of loans that have matured..."
df_expected = process_features(df_3a.copy(), restrict_date=True, current_loans=False)
# To make a fair comparison, the same set of interest rates are used. The
# enclosed set is from Dec 2014.
int_rate_dict = {'A1':0.0603, 'A2':0.0649, 'A3':0.0699, 'A4':0.0749, 'A5':0.0819,
'B1':0.0867, 'B2':0.0949, 'B3':0.1049, 'B4':0.1144, 'B5':0.1199,
'C1':0.1239, 'C2':0.1299, 'C3':0.1366, 'C4':0.1431, 'C5':0.1499,
'D1':0.1559, 'D2':0.1599, 'D3':0.1649, 'D4':0.1714, 'D5':0.1786}
IRR_predict = model.expected_IRR(df_expected, actual_rate=False, rate_dict=int_rate_dict)
print "Calculating actual IRR of loans that have matured..."
df_actual = process_payment(df_3a.copy())
IRR_true = actual_IRR(df_actual, actual_rate=False, rate_dict=int_rate_dict)
print "Collecting results..."
df_result = df_expected[['id', 'sub_grade']].copy()
df_result['IRR_predict'] = IRR_predict
df_result['IRR_true'] = IRR_true
In [10]:
sub_grade_range = ['A1', 'A2', 'A3', 'A4', 'A5',
'B1', 'B2', 'B3', 'B4', 'B5',
'C1', 'C2', 'C3', 'C4', 'C5',
'D1', 'D2', 'D3', 'D4', 'D5']
average_list = []
top_list = []
bottom_list = []
for sub_grade in sub_grade_range:
df_sub_grade = df_result[df_result['sub_grade'] == sub_grade]
df_sub_grade['quartile'] = pd.qcut(df_sub_grade['IRR_predict'], 4, False)
id_top = df_sub_grade[df_sub_grade['quartile'] == 3]['id'].values
id_bottom = df_sub_grade[df_sub_grade['quartile'] == 0]['id'].values
average_IRR = df_sub_grade.mean()['IRR_true']
top_IRR = df_result[df_result['id'].isin(id_top)]['IRR_true'].values.mean()
bottom_IRR = df_result[df_result['id'].isin(id_bottom)]['IRR_true'].values.mean()
average_list.append(average_IRR)
top_list.append(top_IRR)
bottom_list.append(bottom_IRR)
In [11]:
df_strategy = pd.DataFrame({'sub_grade':xrange(60), 'IRR':top_list + average_list + bottom_list})
df_strategy['group'] = df_strategy['sub_grade'].map(lambda x: x/20).map({0:'top quartile', 1:'mean', 2:'bottom quartile'})
df_strategy['sub_grade'] = df_strategy['sub_grade'].map(lambda x: x%20) \
.map({0:'A1', 1:'A2', 2:'A3', 3:'A4', 4:'A5',
5:'B1', 6:'B2', 7:'B3', 8:'B4', 9:'B5',
10:'C1', 11:'C2', 12:'C3', 13:'C4', 14:'C5',
15:'D1', 16:'D2', 17:'D3', 18:'D4', 19:'D5'})
%Rpush df_strategy
In [12]:
%%R -w 800
ggplot(data=df_strategy, aes(x=factor(sub_grade), y=IRR, group=group, color=group)) +
geom_line() +
geom_point() +
scale_y_continuous("Rate of return", breaks=pretty_breaks(5), labels=percent) +
theme(text = element_text(size=15), axis.title.x = element_blank(), legend.position=c(0.9, 0.125)) +
guides(col = guide_legend(reverse = TRUE))
In [13]:
feature_importance = np.zeros(36,)
for grade in ['A', 'B', 'C', 'D']:
for submodel in model.model_dict[grade]:
feature_importance += submodel.feature_importances_
weights = []
features = []
for item in sorted(zip(feature_importance/144, model.features), reverse=True)[:10]:
weights.append(item[0])
features.append(item[1])
df_weights = pd.DataFrame({'weights':weights, 'features':features})
%Rpush df_weights
In [14]:
%%R -w 800
df_weights$features = factor(df_weights$features, levels = df_weights$features[rev(order(df_weights$weights))])
ggplot(df_weights, aes(x = factor(features), y = weights, fill=factor(features))) + geom_bar(stat = "identity") +
scale_y_continuous("Feature importance weights", labels=percent) +
theme(text = element_text(size=15), axis.title.x = element_blank(), legend.position=c(0.885, 0.74), axis.text.x=element_blank())
In [15]:
model.expected_IRR(df_expected.iloc[0:1,:], actual_rate=False, rate_dict=int_rate_dict)
X = df_expected.iloc[:1,:][model.features].values
X_sub_grade = df_expected.iloc[:1,:]['sub_grade'].values
expected_payout = model.get_expected_payout(X, X_sub_grade)
df_probability = pd.DataFrame({'period':xrange(1,37), 'probability':expected_payout.ravel()})
%Rpush df_probability
In [16]:
%%R -w 800
period_formatter = function(x) {h = sprintf('%d mth', x)}
ggplot(df_probability, aes(x=period, y=probability, fill=period, width=0.948)) +
geom_bar(stat="identity") +
scale_y_continuous("Probability of receiving payment") +
scale_x_continuous(breaks=seq(0, 36, 12), label=period_formatter) +
theme(text=element_text(size=15), axis.title.x=element_blank(), legend.position=c(0.95, 0.85))
In [17]:
IRR_predict_mean = list(df_result.groupby('sub_grade', as_index=False).mean()['IRR_predict'].values)
IRR_true_mean = list(df_result.groupby('sub_grade', as_index=False).mean()['IRR_true'].values)
headline_rate = sorted(int_rate_dict.values())
df_compare = pd.DataFrame({'sub_grade':xrange(60), 'IRR':IRR_predict_mean + IRR_true_mean + headline_rate})
df_compare['group'] = df_compare['sub_grade'].map(lambda x: x/20).map({0:'predict', 1:'true', 2:'headline'})
df_compare['sub_grade'] = df_compare['sub_grade'].map(lambda x: x%20).map({0:'A1', 1:'A2', 2:'A3', 3:'A4', 4:'A5',
5:'B1', 6:'B2', 7:'B3', 8:'B4', 9:'B5',
10:'C1', 11:'C2', 12:'C3', 13:'C4', 14:'C5',
15:'D1', 16:'D2', 17:'D3', 18:'D4', 19:'D5'})
%Rpush df_compare
In [18]:
%%R -w 800
ggplot(data=df_compare, aes(x=factor(sub_grade), y=IRR, group=group, color=group)) +
geom_line() +
geom_point() +
scale_y_continuous("Rate of return", breaks=pretty_breaks(6), labels=percent) +
theme(text = element_text(size=15), axis.title.x = element_blank(), legend.position=c(0.925, 0.125))
In [18]: