In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import re
import datetime
pd.options.display.max_columns=200
matplotlib.rcParams['figure.figsize'] = (15.0, 12.0)
Column names and data types of the data file:
In [2]:
summaryfile_type = {'Unnamed':int,
'Unnamed2':int,
'LOAN_ID':object,
'ORIG_CHN':object,
'Seller.Name':object,
'ORIG_RT':float,
'ORIG_AMT':float,
'ORIG_TRM':float,
'ORIG_DTE':object,
'FRST_DTE':object,
'OLTV':float,
'OCLTV':float,
'NUM_BO':float,
'DTI':float,
'CSCORE_B':float,
'FTHB_FLG':object,
'PURPOSE':object,
'PROP_TYP':object,
'NUM_UNIT':int,
'OCC_STAT':object,
'STATE':object,
'ZIP_3':object,
'MI_PCT':float,
'Product.Type':object,
'CSCORE_C':float,
'Monthly.Rpt.Prd':object,
'Servicer.Name':object,
'LAST_RT':float,
'LAST_UPB':float,
'Loan.Age':float,
'Months.To.Legal.Mat':float,
'Adj.Month.To.Mat':float,
'Maturity.Date':object,
'MSA':int,
'Delq.Status':object,
'MOD_FLAG':object,
'Zero.Bal.Code':float,
'ZB_DTE':object,
'LPI_DTE':object,
'FCC_DTE':object,
'DISP_DT':object
}
In [3]:
fannie1 = pd.read_csv('./processed/total/total_2012.csv', dtype=summaryfile_type, nrows=10000000)
fannie1.drop(fannie1.columns[:1], axis=1, inplace=True)
fannie1.rename(columns=lambda x: re.sub('[.]','_',x), inplace=True)
fannie1.head()
Out[3]:
In [6]:
# All helper functions!
def extract_status_year(df, col, x):
    '''
    Extract the year (the last '/'-separated field) of the date string in column
    `col` for row label x. Used as a groupby key; splitting the string is cheaper
    than parsing datetime objects here.
    '''
    return df.loc[x, col].split('/')[-1]

def extract_credit_score(x, y):
    '''
    Combine the two borrower credit score columns: fall back to y where x is
    missing, and take the lower of the two where both are available.
    '''
    temp = x.copy()
    temp[x.isnull()] = y.loc[x.isnull()]
    temp[x > y] = y[x > y]
    return temp

def extract_status_default(df, x):
    '''
    Map the zero-balance code of row label x to a loan status for groupby purposes:
    codes 0 and 1 count as 'Healthy', codes 3, 6 and 9 count as 'Default',
    and everything else is 'NA'.
    '''
    if df.loc[x, 'Zero_Bal_Code'] in (0, 1):
        return 'Healthy'
    elif df.loc[x, 'Zero_Bal_Code'] in (3, 6, 9):
        return 'Default'
    else:
        return 'NA'
In [7]:
# Processing the credit score: Take the lower credit score from two scores if both are available
credit = extract_credit_score(fannie1['CSCORE_B'],fannie1['CSCORE_C'])
fannie1.drop(['CSCORE_B','CSCORE_C'],axis = 1, inplace=True)
fannie1['CSCORE'] = credit
In [8]:
# Total number of records
total_num = len(fannie1)
print("Total number of records is: {}".format(total_num))
In [9]:
# Number of prepaid loans
total_prepaid = len(fannie1.loc[fannie1['Zero_Bal_Code'] == 1])
print("Total number of prepaid loans is {}".format(total_prepaid))
print("Total prepaid ratio is {}".format(total_prepaid/total_num))
In [10]:
fannie1_default = fannie1.loc[fannie1['Zero_Bal_Code'].isin([3, 6, 9])]
# Number of defaults
total_default = len(fannie1_default)
print("Total number of defaults is: {}".format(total_default))
print("Total default ratio is {}".format(total_default/total_num))
In [11]:
total_default_by_year = fannie1_default.groupby(lambda x:extract_status_year(fannie1,'Monthly_Rpt_Prd',x))['LOAN_ID'].count()
total_by_year = fannie1.groupby(lambda x:extract_status_year(fannie1,'ORIG_DTE',x))['LOAN_ID'].count()
In [12]:
total_default_by_year.name='Default number'
total_by_year.name='Total number'
default_rate_by_year = pd.concat([total_default_by_year,total_by_year],axis = 1, join='outer')
default_rate_by_year['Default_Rate'] = default_rate_by_year['Default number']/default_rate_by_year['Total number']
default_rate_by_year.plot(y='Default_Rate')
Out[12]:
From the plot above, the default rate appears to peak in 2011. However, this dataset is only a subset of all of Fannie Mae's single-family data, and many loans do not default until they have aged for several years; both factors contribute to the apparent rise through 2011. The dataset also cuts off in 2012, which probably explains the drop after 2011.
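One rough way to control for that seasoning effect is to recompute the default rate using only loans that have been observed for a while. This is a sketch only: the 24-month cutoff is an arbitrary assumption, and it takes `Loan_Age` to be the age in months.

```python
# Sketch: restrict to loans observed for at least 24 months (assumed cutoff),
# then recompute the default share per origination year.
seasoned = fannie1[fannie1['Loan_Age'] >= 24]
seasoned_status = seasoned.groupby(
    [lambda x: extract_status_year(seasoned, 'ORIG_DTE', x),
     lambda x: extract_status_default(seasoned, x)])['LOAN_ID'].count().unstack()
(seasoned_status['Default'] / seasoned_status.sum(axis=1)).plot()
```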
In [13]:
total_applications_by_year = fannie1.groupby(lambda x:extract_status_year(fannie1,'ORIG_DTE',x))['LOAN_ID'].count()
total_applications_by_year.plot(kind='bar')
Out[13]:
In [14]:
rate_by_year = fannie1.groupby(lambda x:extract_status_year(fannie1,'ORIG_DTE',x))['ORIG_RT'].mean()
In [15]:
rate_by_year.plot()
Out[15]:
In [16]:
# Any difference across states?
rate_by_year_state = fannie1.groupby([lambda x:extract_status_year(fannie1,'ORIG_DTE',x),'STATE'])['ORIG_RT'].mean()
In [17]:
rate_by_year_state = rate_by_year_state.unstack(level=0)
i = int(np.ceil(fannie1['STATE'].nunique()/4))
j = 4
fig, axes = plt.subplots(i,j, figsize=(10, 8))
plt.subplots_adjust(wspace=0.5, hspace=0.5)
i2 = 0
j2=0
for state in rate_by_year_state.index:
    axes[i2, j2].plot(rate_by_year_state.loc[state])
    axes[i2, j2].set_title(state)
    axes[i2, j2].get_xaxis().set_visible(False)
    j2 += 1
    if j2 > 3:
        i2 += 1
        i2 = i2 % i
    j2 = j2 % j
In [18]:
default_by_year = pd.concat([total_by_year,total_default_by_year], join='outer', axis=1)
default_by_year['Rate'] = default_by_year['Default number']/default_by_year['Total number']
default_by_year['Rate'].plot(kind='bar')
Out[18]:
In [19]:
default_by_year_state = fannie1_default.groupby([lambda x:extract_status_year(fannie1,'Monthly_Rpt_Prd',x),
'STATE'])['LOAN_ID'].count()
In [20]:
total_by_year_state = fannie1.groupby([lambda x:extract_status_year(fannie1,'Monthly_Rpt_Prd',x),
'STATE'])['LOAN_ID'].count()
In [21]:
default_by_year_state.name='default_cases_by_state'
total_by_year_state.name='total_cases_by_state'
In [22]:
default_stat_by_year_state = pd.concat([default_by_year_state,
total_by_year_state], join='outer', axis=1)
default_stat_by_year_state['Rate'] = default_stat_by_year_state['default_cases_by_state']/default_stat_by_year_state['total_cases_by_state']
default_stat_by_year_state.unstack(level=0).plot(y='Rate',kind='bar',subplots=True)
Out[22]:
In [23]:
default_stat_by_year_state.sort_values(by='Rate',ascending=False)['Rate']
Out[23]:
In [24]:
fannie1['CSCORE'].describe()
Out[24]:
The average credit score across all applications is 743, which is a little surprising. I don't know which credit agency these scores come from, but if they are FICO scores, then the average prospective home owner has an excellent credit score based on the article here. If your credit score is below 700, you are already only at about the 25th percentile!
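A quick sanity check of that percentile claim against this dataset (a small sketch; it only looks at the empirical distribution of `CSCORE` here, not FICO's national distribution):

```python
# Empirical position of a 700 credit score within this dataset.
scores = fannie1['CSCORE'].dropna()
print("Share of loans with a score below 700: {:.1%}".format((scores < 700).mean()))
print("25th percentile score in this data: {}".format(scores.quantile(0.25)))
```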
In [25]:
fannie1['CSCORE'].hist()
Out[25]:
In [26]:
total_by_loan_status = fannie1.groupby([lambda x: extract_status_default(fannie1, x),
lambda x:extract_status_year(fannie1,'ORIG_DTE',x)])['CSCORE'].mean()
In [27]:
total_by_loan_status.unstack(level=0).plot(kind='bar')
Out[27]:
This graph shows some interesting findings:
In [30]:
LTV_by_loan_status = fannie1.groupby([lambda x: extract_status_default(fannie1, x),
lambda x:extract_status_year(fannie1,'ORIG_DTE',x)])['OLTV'].mean()
In [31]:
LTV_by_loan_status.unstack(level=0).plot(kind='bar')
Out[31]:
In [34]:
DTI_by_loan_status = fannie1.groupby([lambda x: extract_status_default(fannie1, x),
lambda x:extract_status_year(fannie1,'ORIG_DTE',x)])['DTI'].mean()
In [35]:
DTI_by_loan_status.unstack(level=0).plot(kind='bar')
Out[35]:
There are a lot of websites out there that tell you how much house you can afford, like this one. The conservative approach uses a DTI of roughly 37% and the aggressive approach uses 41%. Analysis of Fannie Mae's data broadly supports this rule of thumb: default cases generally have a higher DTI than non-default cases. Also, before the sub-prime crisis you could get a loan with a DTI above 40%; since then the general trend has become more and more conservative, with DTI now sitting between 30% and 35%. So the next time you use an online affordability calculator, keep in mind that the conservative approach is really the one to take. Don't bother with the "aggressive" approach...
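To see that tightening directly, here is a small sketch (the 41% cutoff simply mirrors the "aggressive" threshold mentioned above) that plots the share of originations each year with a DTI above 41%:

```python
# Share of originations per year with DTI above the "aggressive" 41% threshold.
high_dti_share = fannie1.groupby(
    lambda x: extract_status_year(fannie1, 'ORIG_DTE', x))['DTI'].apply(
    lambda s: (s > 41).mean())
high_dti_share.plot(kind='bar')
```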
In [38]:
purpose_status = fannie1.groupby([lambda x: extract_status_default(fannie1, x), 'PURPOSE'])['LOAN_ID'].count()
In [39]:
purpose_status = purpose_status.unstack(level=0)
In [40]:
purpose_status['Default_Ratio'] = purpose_status['Default']/purpose_status['Healthy']
purpose_status
Out[40]:
Code | Purpose |
---|---|
C | Cash-out refinance |
P | Purchase |
R | Refinance |
U | Unspecified refinance |
Keep in mind that the overall default rate is 0.0247709. Cash-out refinance has a noticeably higher default rate than the other purpose types. If you wonder why, click the link and read what "cash-out refinance" means and you will see why.
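Note that `Default_Ratio` above divides defaults by healthy loans rather than by the total, so it is not directly comparable with the overall default rate. A small sketch (an addition, not part of the original analysis) of the per-purpose default share on the same basis:

```python
# Default share per purpose (Default / (Default + Healthy)), directly comparable
# with the overall default rate printed earlier.
purpose_status['Default_Share'] = purpose_status['Default'] / (
    purpose_status['Default'] + purpose_status['Healthy'])
purpose_status[['Default_Ratio', 'Default_Share']]
```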
In [41]:
occupancy_status = fannie1.groupby([lambda x: extract_status_default(fannie1, x), 'OCC_STAT'])['LOAN_ID'].count()
occupancy_status = occupancy_status.unstack(level=0)
In [42]:
occupancy_status['Default_Ratio'] = occupancy_status['Default']/occupancy_status['Healthy']
occupancy_status
Out[42]:
In [43]:
location_default = fannie1.groupby([lambda x: extract_status_default(fannie1, x), 'STATE'])['LOAN_ID'].count()
location_total = fannie1.groupby(['STATE'])['LOAN_ID'].count()
location_default_stat = pd.concat([location_default.unstack(level=0),location_total], axis = 1, join='inner')
location_default_stat.columns = ['Default', 'Healthy', 'Total']
location_default_stat['Default_Ratio'] = location_default_stat['Default']/location_default_stat['Total']
In [44]:
location_default_mean = location_default_stat['Default_Ratio'].mean()
location_default_std = location_default_stat['Default_Ratio'].std()
In [45]:
location_default_stat[location_default_stat['Default_Ratio'] > location_default_mean + 1.5*location_default_std]
Out[45]:
I am very surprised by this result. Those four states have a default rate more than 1.5$\sigma$ above the mean. I expected states like CA and TX to lead, but neither appears near the top of the list. In fact, Nevada has the highest default rate, followed by Florida.
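To see the full ranking rather than only the >1.5$\sigma$ outliers, a quick follow-up:

```python
# Full ranking of states by default ratio, highest first.
location_default_stat.sort_values(by='Default_Ratio', ascending=False).head(10)
```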