In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
pd.options.display.max_columns = 999
%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 1.5 * matplotlib.rcParams['savefig.dpi']
In insurance, underwriting means signing for and accepting liability, thereby guaranteeing payment in case loss or damage occurs. Underwriting is typically provided by a large financial services firm such as a bank, insurer, or investment house. (From Wikipedia)
Goal: develop a model to predict the risk of a mortgage.
Data: Fannie Mae Single-Family Home Mortgage data (2000-2014)
In [2]:
# Read in the data:
loan2011 = pd.read_csv('merged_summary_2011.csv')
loan2010 = pd.read_csv('merged_summary_2010.csv')
In [3]:
loan2012 = pd.read_csv('merged_summary_2012.csv')
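If more vintages were added later, the same reads could be done in a loop; a minimal sketch, assuming the merged_summary_<year>.csv naming used above:

# Sketch: read several vintages at once (assumes the merged_summary_<year>.csv naming above).
frames = {year: pd.read_csv('merged_summary_{}.csv'.format(year)) for year in ('2010', '2011', '2012')}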
In [4]:
loan2011.head()
Out[4]:
In [5]:
loan2011['Monthly.Rpt.Prd'] = pd.to_datetime(loan2011['Monthly.Rpt.Prd'], format = '%m/%d/%Y')
loan2011['ORIG_DTE'] = pd.to_datetime(loan2011['ORIG_DTE'], format = '%m/%Y')
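The 2010 and 2012 frames presumably share the same date columns; a hedged sketch applying the same parsing to them as well:

# Sketch: parse the same date columns in the other vintages (assumes identical formats).
for df in (loan2010, loan2012):
    df['Monthly.Rpt.Prd'] = pd.to_datetime(df['Monthly.Rpt.Prd'], format='%m/%d/%Y')
    df['ORIG_DTE'] = pd.to_datetime(df['ORIG_DTE'], format='%m/%Y')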
In [6]:
loan = {'2011':loan2011, '2012': loan2012, '2010': loan2010}
In [7]:
years = []
prepaid = {}
default = {}
loan_num = {}
for key in loan.keys():
    temp_size = len(loan[key])
    # Count terminations by Zero.Bal.Code (1 = prepaid in the Fannie Mae coding).
    temp_count = loan[key].groupby('Zero.Bal.Code')['LOAN_ID'].count()
    loan_num[key] = temp_size
    prepaid[key] = temp_count[1] / temp_size
    # Anything that is neither still active nor prepaid is counted as a default.
    default[key] = 1 - sum(temp_count[0:2]) / temp_size
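The grouping above leans on the Zero.Bal.Code termination codes. As a sketch of the interpretation used throughout this notebook (0 = still active, 1 = prepaid, and 3/6/9 treated as defaults, matching the state breakdown further down), each loan could be labeled explicitly:

# Sketch: outcome label per loan, following this notebook's reading of Zero.Bal.Code.
def label_outcome(code):
    if code == 1:
        return 'Prepaid'    # paid off early (includes selling the house)
    elif code in (3, 6, 9):
        return 'Default'    # termination types counted as defaults below
    return 'Current'        # still active

loan2011['Outcome'] = loan2011['Zero.Bal.Code'].apply(label_outcome)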
In [26]:
def plot_dict_bar(val_dict):
    x_axis = sorted(list(val_dict.keys()))
    y_axis = [val_dict[key] for key in x_axis]
    x = np.arange(3) - 0.175
    y = np.array(y_axis) * 100
    fig, ax = plt.subplots()
    ax.bar(x, y, 0.35)
    ax.set_xticks(np.arange(3))
    ax.set_xticklabels(x_axis)
    ax.set_xlabel("Year")
    ax.set_ylabel("Percentage (%)")
    ax.set_title("Prepaid Ratio")
    plt.show()
plot_dict_bar(prepaid)
Keep in mind that prepayment includes a very common case: if you move to another house, you sell the one you currently live in, and paying off the loan that way also counts as a prepayment.
In [27]:
def plot_dict_bar(val_dict):
    x_axis = sorted(list(val_dict.keys()))
    y_axis = [val_dict[key] for key in x_axis]
    x = np.arange(3) - 0.175
    y = np.array(y_axis) * 100
    fig, ax = plt.subplots()
    ax.bar(x, y, 0.35)
    ax.set_xticks(np.arange(3))
    ax.set_xticklabels(x_axis)
    ax.set_xlabel("Year")
    ax.set_ylabel("Percentage (%)")
    ax.set_title("Default Ratio")
    plt.show()
plot_dict_bar(default)
The default rate is not very high in these three years. The general trend is downward, which makes sense because the economy was recovering.
In [28]:
def plot_dict_bar(val_dict):
    x_axis = sorted(list(val_dict.keys()))
    y_axis = [val_dict[key] for key in x_axis]
    x = np.arange(3) - 0.175
    y = np.array(y_axis)
    fig, ax = plt.subplots()
    ax.bar(x, y, 0.35)
    ax.set_xticks(np.arange(3))
    ax.set_xticklabels(x_axis)
    ax.set_xlabel("Year")
    ax.set_ylabel("Number of loans")
    ax.set_title("Loan Initiation Amount")
    plt.show()
plot_dict_bar(loan_num)
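The same helper is redefined three times with only the title and scaling changing; a parameterized version (just a sketch) would remove the repetition:

# Sketch: one bar-plot helper covering all three charts above.
def plot_outcome_bar(val_dict, title, ylabel="Percentage (%)", scale=100):
    x_axis = sorted(val_dict.keys())
    y = np.array([val_dict[key] for key in x_axis]) * scale
    x = np.arange(len(x_axis)) - 0.175
    fig, ax = plt.subplots()
    ax.bar(x, y, 0.35)
    ax.set_xticks(np.arange(len(x_axis)))
    ax.set_xticklabels(x_axis)
    ax.set_xlabel("Year")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    plt.show()

# plot_outcome_bar(prepaid, "Prepaid Ratio")
# plot_outcome_bar(default, "Default Ratio")
# plot_outcome_bar(loan_num, "Loan Initiation Amount", ylabel="Number of loans", scale=1)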
In [21]:
years = []
loan_to_value = {}
for key in loan.keys():
    temp_ltv = {}
    temp_mean = loan[key].groupby('Zero.Bal.Code')['OLTV'].mean()
    temp_ltv['Current'] = temp_mean[0]
    temp_ltv['Prepaid'] = temp_mean[1]
    temp_ltv['Default'] = np.mean(temp_mean[2:])
    loan_to_value[key] = temp_ltv
result = pd.DataFrame.from_dict(loan_to_value)
result.head()
Out[21]:
It is interesting that defaulted loans tend to have a slightly higher original loan-to-value (OLTV) ratio on average, meaning they tend to start with a smaller down payment.
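A quick worked example with hypothetical numbers makes the link between OLTV and the down payment concrete:

# Hypothetical example: OLTV is the original loan amount divided by the property value.
loan_amount, home_value = 200000, 250000
oltv = 100.0 * loan_amount / home_value    # 80.0
down_payment_pct = 100 - oltv              # 20.0 -> a higher OLTV means a smaller down payment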
In [24]:
years = []
credit_score = {}
for key in loan.keys():
    temp_cscore = {}
    temp_mean = loan[key].groupby('Zero.Bal.Code')['CSCORE_C'].mean()
    temp_cscore['Current'] = temp_mean[0]
    temp_cscore['Prepaid'] = temp_mean[1]
    temp_cscore['Default'] = np.mean(temp_mean[2:])
    credit_score[key] = temp_cscore
result2 = pd.DataFrame.from_dict(credit_score)
result2.head()
Out[24]:
Again, it is interesting that defaulted loans tend to have a slightly lower credit score on average.
In [16]:
# Last one: default counts by state.
years = []
state_res = {}
for key in loan.keys():
    # Codes 3, 6 and 9 are the termination types treated as defaults here.
    temp_state_count = loan[key][(loan[key]['Zero.Bal.Code'] == 3) | (loan[key]['Zero.Bal.Code'] == 6) | (loan[key]['Zero.Bal.Code'] == 9)].groupby('STATE')['LOAN_ID'].count()
    states = list(temp_state_count.index)
    for state in states:
        if state_res.get(state) is None:
            state_res[state] = temp_state_count[state]
        else:
            state_res[state] = state_res[state] + temp_state_count[state]
total_state_res = pd.DataFrame.from_dict(state_res, orient='index')
total_state_res.columns = ['Default']
In [14]:
total_state_res.sort_values(by='Default', ascending=False)
Out[14]:
Big states tend to have more default cases simply because they originate more loans. A better comparison would be the default ratio, normalizing each state's default count by its total number of loans.
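A minimal sketch of that normalization, dividing each state's pooled default count by its pooled loan count (reusing the column names from above):

# Sketch: default ratio by state = defaults / total loans, pooled over the three vintages.
total_loans = {}
for key in loan.keys():
    state_totals = loan[key].groupby('STATE')['LOAN_ID'].count()
    for state in state_totals.index:
        total_loans[state] = total_loans.get(state, 0) + state_totals[state]

ratio = pd.DataFrame.from_dict(total_loans, orient='index')
ratio.columns = ['Total']
ratio['Default'] = total_state_res['Default']
ratio['Default'] = ratio['Default'].fillna(0)
ratio['Ratio'] = ratio['Default'] / ratio['Total']
ratio.sort_values(by='Ratio', ascending=False).head()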