In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# Use seaborn's plain 'white' theme for every figure in this notebook.
sns.set_style(style='white')
In [2]:
# Load the complete loan export: 887,379 loans in total.
loans = pd.read_csv('../data/loan.csv')
# Store the grade as an ordered categorical so grades can be compared and sorted.
# `Series.astype('category', ordered=True)` is no longer accepted by pandas;
# `pd.Categorical` behaves the same on old and new releases and infers the
# categories as the sorted unique values of the column.
loans['grade'] = pd.Categorical(loans['grade'], ordered=True)
# Parse the last payment date strings into datetimes.
loans['last_pymnt_d'] = pd.to_datetime(loans['last_pymnt_d'])
loans.shape
Out[2]:
In [3]:
# List the distinct loan statuses that occur in the data.
loan_status_column = loans['loan_status']
loan_status_column.unique()
Out[3]:
In [4]:
# Most loans are still current.
# The column is passed as `x=` by keyword: recent seaborn releases do not
# accept a bare positional vector as the x variable.
sns.countplot(x=loans['loan_status'], color='turquoise')
plt.xticks(rotation=90)
plt.savefig('../figures/barplot_loan_statusses.jpg', bbox_inches='tight')
In [5]:
# Excluding loans that are still running leaves 256,939 closed loans (about 30%).
closed_status = [
    'Fully Paid',
    'Charged Off',
    'Does not meet the credit policy. Status:Fully Paid',
    'Does not meet the credit policy. Status:Charged Off',
]
# Take an explicit copy: later cells add columns to `closed_loans`, and writing
# into a filtered slice of `loans` raises SettingWithCopyWarning.
closed_loans = loans[loans['loan_status'].isin(closed_status)].copy()
closed_loans.shape
Out[5]:
In [6]:
# Status counts for the closed loans only.
# The column is passed as `x=` by keyword: recent seaborn releases do not
# accept a bare positional vector as the x variable.
sns.countplot(x=closed_loans['loan_status'], color='turquoise')
plt.xticks(rotation=90)
plt.savefig('../figures/barplot_loan_statusses_closed.jpg', bbox_inches='tight')
In [7]:
# Collapse the closed statuses into two categories: paid / unpaid.
paid_status = ['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid']
# Vectorised membership test instead of a Python-level loop. `assign` returns
# a new frame, so nothing is written into a slice of `loans` and re-running
# this cell gives the same result.
closed_loans = closed_loans.assign(paid=closed_loans['loan_status'].isin(paid_status))
# Pass the column as `x=`; recent seaborn releases reject a positional vector.
sns.countplot(x=closed_loans['paid'])
plt.xticks(rotation=90)
Out[7]:
Total loans: 256,939 Total features: 74
Loan
Borrower
Two borrowers (only in 1 case)
In general, a note goes into Default status when it is 121 or more days past due. When a note is in Default status, Charge Off occurs no later than 150 days past due (i.e. No later than 30 days after the Default status is reached) when there is no reasonable expectation of sufficient payment to prevent the charge off. However, bankruptcies may be charged off earlier based on date of bankruptcy notification.
--> So a loan in Default status is not closed yet; loans with that status were therefore left out.
In [8]:
# Number of closed loans whose requested amount differs from the funded amount (1914).
# NOTE(review): the original note said "bigger than", but the test is inequality;
# the two only agree if loan_amnt is never below funded_amnt -- confirm.
# Series.sum() counts the True values vectorised instead of iterating in Python.
closed_loans['loan_amnt'].ne(closed_loans['funded_amnt']).sum()
Out[8]:
In [9]:
# Number of missing values per feature, keeping only features that have any.
nr_nulls = closed_loans.isnull().sum()
nr_nulls = nr_nulls[nr_nulls != 0]
# Fraction of closed loans missing each feature, largest first. The denominator
# is the actual row count: the previously hard-coded 255720 did not match the
# 256,939 closed loans this notebook reports.
ratio_missing = nr_nulls.sort_values(ascending=False) / len(closed_loans)
ratio_missing.to_csv('../data/missing_ratio.txt', sep='\t')
ratio_missing
Out[9]:
In [10]:
# Histogram of the funded amounts of closed loans: 50 bins, no density curve.
funded_amounts = closed_loans['funded_amnt']
sns.distplot(funded_amounts, bins=50, kde=False)
plt.savefig('../figures/funded_amount.jpg')
In [11]:
# Closed loans: about 20% have a 60-month term.
# All loans: a lot of missing data; of the rest about 30% have a 60-month term.
# Columns are passed as `x=` by keyword: recent seaborn releases do not accept
# a bare positional vector as the x variable.
sns.countplot(x=closed_loans['term'], color='darkblue')
plt.title('closed')
plt.savefig('../figures/term_closed.jpg')
plt.show()
sns.countplot(x=loans['term'])
plt.title('all')
Out[11]:
In [12]:
# A higher interest rate is more attractive to lenders, and a riskier grade is
# charged a higher rate. Open questions:
#   - do riskier grades default more often?
#   - is investing in grades A-C (fewer defaults?) or D-G (more interest) more profitable?
int_rate_ax = sns.distplot(closed_loans['int_rate'], bins=50, kde=False)
# Anchor the x axis at zero and leave the upper limit automatic.
int_rate_ax.set_xlim(0, None)
plt.savefig('../figures/int_rates.jpg')
In [13]:
# Spread of interest rates within each grade.
sns.boxplot(x='grade', y='int_rate', data=closed_loans, color='turquoise')
plt.savefig('../figures/boxplots_intrate_grade.jpg')
In [14]:
# Interest rate of every individual closed loan, per grade.
sns.stripplot(x='grade', y='int_rate', data=closed_loans, color='gray')
Out[14]:
In [15]:
# Profit per loan: everything received back (interest, principal, late fees,
# recoveries) minus the amount that was funded.
# NOTE(review): 'collection_recovery_fee' is not part of the calculation; the
# original cell only mentioned it in a comment -- confirm whether it belongs here.
# `assign` returns a new frame, so the column is not written into a slice of
# `loans` and re-running this cell gives the same result.
closed_loans = closed_loans.assign(
    profit=(
        closed_loans['total_rec_int']
        + closed_loans['total_rec_prncp']
        + closed_loans['total_rec_late_fee']
        + closed_loans['recoveries']
    )
    - closed_loans['funded_amnt']
)

# Total profit per grade.
profits = closed_loans.groupby('grade')['profit'].sum()
sns.barplot(data=profits.reset_index(), x='grade', y='profit', color='gray')
plt.savefig('../figures/profit_grades.jpg')
plt.show()

# Total profit for paid versus unpaid loans.
profits = closed_loans.groupby('paid')['profit'].sum()
sns.barplot(data=profits.reset_index(), x='paid', y='profit')
plt.show()

# Total profit per grade, split into paid and unpaid loans (horizontal bars).
profits = closed_loans.groupby(['grade', 'paid'])['profit'].sum()
sns.barplot(data=profits.reset_index(), x='profit', y='grade', hue='paid', orient='h')
plt.savefig('../figures/profit_grades_paid.jpg')
plt.show()
In [16]:
# Both distributions look sort of normal --> test statistically whether the means differ?
# Overlay the interest-rate distribution of paid loans and of unpaid loans.
for paid_flag in (True, False):
    sns.distplot(closed_loans.loc[closed_loans['paid'] == paid_flag, 'int_rate'])
plt.savefig('../figures/int_rate_paid.jpg')
In [17]:
# Proportion of loans per grade that were not paid back.
grade_paid = closed_loans.groupby(['grade', 'paid'])['id'].count()
# Walk the grades in sorted order so the plot columns are ordered rather than
# following the order in which grades first appear in the data. Missing grades
# are dropped because they have no entry in `grade_paid`.
grades = sorted(closed_loans['grade'].dropna().unique())
risk_grades = pd.DataFrame(
    {
        # unpaid count divided by the total (paid + unpaid) count for the grade
        grade: grade_paid.loc[(grade, False)] / grade_paid.loc[grade].sum()
        for grade in grades
    },
    index=['proportion_unpaid_loans'],
)
sns.stripplot(data=risk_grades, color='darkgray', size=15)
plt.savefig('../figures/proportion_grades.jpg')
In [18]:
# Does the purpose of a loan matter for the chance of it being charged off?
# The column is passed as `x=` by keyword: recent seaborn releases do not
# accept a bare positional vector as the x variable.
sns.countplot(x=closed_loans['purpose'], color='turquoise')
plt.xticks(rotation=90)
plt.show()

# Number of loans per purpose, split into paid and unpaid.
purpose_paid = closed_loans.groupby(['purpose', 'paid'])['id'].count()
# Series.reset_index() already returns a DataFrame with the counts in column 'id'.
sns.barplot(data=purpose_paid.reset_index(), x='purpose', y='id', hue='paid')
plt.xticks(rotation=90)
plt.savefig('../figures/purposes.jpg', bbox_inches='tight')
In [19]:
# Debt-to-income ratio ('dti') compared between paid and unpaid loans.
sns.boxplot(x='paid', y='dti', data=closed_loans)
plt.savefig('../figures/dti.jpg')
The next payment date is not NaN for loans in the 'Does not meet the credit policy' categories. However, their outstanding principal is 0 in all cases (so they are no longer active), and they do indeed seem to be older loans --> they appear to be closed in fact, so they are left in.
In [20]:
# Statuses of the closed loans that still have a next payment date filled in.
# The column is passed as `x=` by keyword: recent seaborn releases do not
# accept a bare positional vector as the x variable.
has_next_payment = closed_loans['next_pymnt_d'].notnull()
sns.countplot(x=closed_loans.loc[has_next_payment, 'loan_status'])
plt.xticks(rotation=90)
plt.savefig('../figures/last_payment_day.jpg', bbox_inches='tight')
plt.show()

print(closed_loans['loan_status'].value_counts())

# Split on the two plain statuses versus the remaining
# 'Does not meet the credit policy' statuses; the mask is computed once.
new_loans = ['Fully Paid', 'Charged Off']
has_new_status = closed_loans['loan_status'].isin(new_loans)

# Last payment dates of the loans with one of the remaining statuses.
sns.countplot(data=closed_loans[~has_new_status], x='last_pymnt_d', hue='loan_status')
plt.xticks([])  # one tick per date is unreadable, so hide the ticks
plt.savefig('../figures/last_payment_day_old.jpg')
plt.show()

# Last payment dates of the 'Fully Paid' / 'Charged Off' loans.
sns.countplot(data=closed_loans[has_new_status], x='last_pymnt_d', hue='loan_status')
plt.xticks([])
plt.savefig('../figures/last_payment_day_new.jpg')
plt.show()

# Distribution of the outstanding principal among closed loans.
closed_loans['out_prncp'].value_counts()
Out[20]:
http://www.lendacademy.com/forum/index.php?topic=2427.msg20813#msg20813 Only loans with policy code 1 are present in this case, so this is not a problem.
In [21]:
# Count how often each policy code occurs among the closed loans.
policy_codes = closed_loans['policy_code']
policy_codes.value_counts()
Out[21]:
In [ ]:
In [ ]: