In [1]:
import matplotlib
import os
import sqlite3
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
matplotlib.rcParams['font.size'] = 16.0
In [2]:
# Data directory, resolved relative to the notebook's working directory.
# The trailing '' component keeps the trailing path separator, so DIR can
# still be prefixed onto relative file names with plain string concatenation.
DIR = os.path.join(os.getcwd(), os.pardir, 'data', '')
DIR
Out[2]:
In [3]:
# Read the raw Lending Club loan export into `df` and preview the first rows.
loan_csv = DIR + 'raw/lending-club-loan-data/loan.csv'
df = pd.read_csv(loan_csv)
df.head()
Out[3]:
In [4]:
# (rows, columns) first, then per-column dtypes and non-null counts.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df.shape)
df.info()
In [5]:
# Peek at the `purpose` column for the first five loans.
df['purpose'].head(n=5)
Out[5]:
See here for a plan:
In [52]:
# Distinct loan grades, in order of first appearance in the data.
pd.unique(df['grade'])
Out[52]:
In [49]:
# Histogram: interest rate, all loans pooled (raw counts, not normalised).
# By grade? By loan status?
plt.hist(df['int_rate'], bins = 30, alpha = 0.5)
plt.xlabel('Interest rate')
plt.ylabel('Freq')
plt.show()
In [87]:
# Interest-rate distribution: one overlaid, normalised histogram per grade.
# NOTE(review): `normed` is the pre-3.1 Matplotlib spelling of `density`.
hist_opts = dict(bins = 20, normed = True, alpha = 0.3)
for grade in df['grade'].unique():
    rates = df.loc[df['grade'] == grade, 'int_rate'].values
    plt.hist(rates, label = grade, **hist_opts)
plt.title('Interest rate: Differences by grade')
plt.xlabel('Interest rate')
plt.ylabel('Freq (normed)')
plt.legend()
plt.show()
In [123]:
# By loan status: one overlaid, normalised interest-rate histogram per status.
for status in df['loan_status'].unique():
    plt.hist(df[df['loan_status'] == status]['int_rate'].values, bins = 50,
             normed = True, alpha = 0.3, label = status)
plt.xlabel('Interest rate')
plt.ylabel('Freq (normed)')
plt.title('Interest rate: Not a big difference by loan status')
plt.legend()
plt.show()
In [124]:
# Interest-rate distribution split by loan term.
term_col = df['term']
for term in term_col.unique():
    in_term = term_col == term
    plt.hist(df['int_rate'][in_term].values,
             bins = 50, normed = True, alpha = 0.3, label = term)
plt.title('Interest rate: By term')
plt.xlabel('Interest rate')
plt.ylabel('Freq (normed)')
plt.legend()
plt.show()
In [90]:
# Does the interest rate move with the inquiry count (`inq_fi`)?
rate, inquiries = df['int_rate'], df['inq_fi']
plt.scatter(rate, inquiries)
plt.ylabel('Number of inquiries')
plt.xlabel('Interest rate')
plt.show()
In [99]:
# Scatter plot: interest rate vs. amount of last payment
# Not helpful!
plt.scatter(df['int_rate'], df['last_pymnt_amnt'], alpha = 0.2)
plt.xlabel('Interest rate')
plt.ylabel('Amount of last payment')
plt.show()
In [103]:
# Last-payment amount: overlaid, normalised histogram for each grade.
for grade in df['grade'].unique():
    is_grade = df['grade'] == grade
    payments = df.loc[is_grade, 'last_pymnt_amnt'].values
    plt.hist(payments, bins = 50, normed = True, alpha = 0.3, label = grade)
plt.title('Last payment amount: Differences by grade')
plt.xlabel('Amount of last payment')
plt.ylabel('Freq (normed)')
plt.legend()
plt.show()
In [104]:
# By loan status: one overlaid, normalised last-payment histogram per status.
for status in df['loan_status'].unique():
    plt.hist(df[df['loan_status'] == status]['last_pymnt_amnt'].values, bins = 50,
             normed = True, alpha = 0.3, label = status)
plt.xlabel('Amount of last payment')
plt.ylabel('Freq (normed)')
plt.title('Last payment amount: Differences by loan status')
plt.legend()
plt.show()
In [127]:
# Interest rate against total current balance; alpha thins out overplotting.
balance = df['tot_cur_bal']
plt.scatter(df['int_rate'], balance, alpha = 0.2)
plt.ylabel('Total current balance')
plt.xlabel('Interest rate')
plt.show()
In [174]:
# Loans per sub-grade, ordered by sub-grade label.
# Sorting the counts Series by its index avoids the deprecated
# DataFrame.sort and the stray `level_0` column that a second
# reset_index() would add.
sub_grade_counts = df['sub_grade'].value_counts().sort_index()

# Name the columns explicitly ('index' = sub-grade label, 0 = count) so that
# later lookups of sgrades['index'] / sgrades[0] do not depend on how the
# installed pandas names the columns of value_counts().reset_index().
sgrades = pd.DataFrame({'index': sub_grade_counts.index,
                        0: sub_grade_counts.values},
                       columns = ['index', 0])

# Parent grade = first character of the sub-grade label.
sgrades['grade'] = sgrades['index'].str[0]
sgrades.head()
Out[174]:
In [187]:
# Bar chart of loan counts per sub-grade, coloured by parent grade
# (all the As share one colour, the Bs another, ...).
colors = {'A': 'red',
          'B': 'blue',
          'C': 'green',
          'D': 'orange',
          'E': 'purple',
          'F': 'gray',
          'G': 'yellow'}

x = np.arange(len(sgrades['index']))
# One colour per bar, looked up from the sub-grade's parent grade.
bar_colors = sgrades['grade'].map(colors).tolist()
plt.bar(x, sgrades[0], width = 0.5, color = bar_colors)
plt.xticks(x, sgrades['index'])
plt.xlabel('Subgrade')
plt.ylabel('Frequency')
plt.show()
In [190]:
# Raw `dti` distribution, before any outlier handling.
dti_raw = df['dti']
plt.hist(dti_raw, bins = 50)
plt.show()
In [198]:
# Data cleaning issue! I think this is just an outlier
# Show how many rows have dti > 100 next to the full frame's shape.
# The %-format keeps the output identical on Python 2 and Python 3.
print('%s %s' % (df[df['dti'] > 100].shape, df.shape))

# Keep only rows with dti <= 100.
# NOTE(review): rows whose dti is NaN fail this comparison and are dropped
# as well - confirm that is intended.
df2 = df[df['dti'] <= 100]
In [200]:
# dti distribution after dropping the > 100 outliers. The histogram is
# normalised, so the y axis is labelled as such.
plt.hist(df2['dti'], bins= 50, normed = True)
plt.xlabel('Debt-to-Income ratio')
plt.ylabel('Frequency (normed)')
plt.show()
In [203]:
# Debt-to-income ratio: overlaid, normalised histogram for each grade.
dti_hist_opts = {'bins': 50, 'normed': True, 'alpha': 0.3}
for grade in df2['grade'].unique():
    grade_dti = df2.loc[df2['grade'] == grade, 'dti'].values
    plt.hist(grade_dti, label = grade, **dti_hist_opts)
plt.ylabel('Frequency (normed)')
plt.xlabel('Debt-to-Income ratio')
plt.legend()
plt.show()
il_util
).
In [209]:
# Summary statistics for `il_util` on the outlier-filtered frame.
il_util = df2['il_util']
il_util.describe()
Out[209]:
In [219]:
# Distribution of `il_util`; missing values are dropped before binning.
# The y axis shows raw counts per bin, so it is labelled as a frequency.
plt.hist(df2['il_util'].dropna().values, bins = 50)
plt.xlabel('Balance to credit limit ratio')
plt.ylabel('Freq')
plt.show()
In [220]:
# `il_util` by grade: overlaid, normalised histograms with NaNs removed.
for grade in df2['grade'].unique():
    grade_rows = df2[df2['grade'] == grade]
    util = grade_rows['il_util'].dropna().values
    plt.hist(util, bins = 50, normed = True, alpha = 0.3, label = grade)
plt.ylabel('Frequency (normed)')
plt.xlabel('Balance-to-Credit Limit ratio')
plt.legend()
plt.show()
In [231]:
# Loan amount distribution (raw counts) on the outlier-filtered frame.
loan_amounts = df2['loan_amnt'].values
plt.hist(loan_amounts, bins = 50)
plt.ylabel('Freq')
plt.xlabel('Loan amount')
plt.show()
In [228]:
# Loan amount by grade: overlaid, normalised histogram for each grade.
grades = df2['grade']
for grade in grades.unique():
    amounts = df2['loan_amnt'][grades == grade].values
    plt.hist(amounts, bins = 50, normed = True, alpha = 0.3, label = grade)
plt.xlabel('Loan amount')
plt.ylabel('Frequency (normed)')
plt.legend()
plt.show()
In [237]:
# Horizontal bar chart of the number of loans in each status.
status_counts = df['loan_status'].value_counts()
status_counts.plot(kind = 'barh')
Out[237]:
In [236]:
# subset of features (placeholder - not used by the heatmap below)
subset = []

# Correlation matrix
corrs = df.corr()

# Plot the blocks along the diagonal of the matrix, 15 variables per panel.
# The panel count is derived from the matrix size rather than hard-coded, so
# every variable is shown and no panel is left empty when the number of
# numeric columns changes.
panel_starts = list(range(0, len(corrs.index), 15))
n_panels = len(panel_starts)

plt.subplots(n_panels, 1, figsize=(15,15))
for i, n in enumerate(panel_starts):
    plt.subplot(n_panels, 1, i + 1)
    # Pin the colour scale to [-1, 1] so the diverging 'seismic' map is
    # centred on zero correlation and colours are comparable across panels.
    plt.pcolor(corrs, alpha=0.75, cmap = 'seismic', vmin = -1, vmax = 1)
    # Ticks sit at cell centres (offset 0.5) and carry the variable names.
    plt.yticks(np.arange(0.5, len(corrs.index), 1), corrs.index)
    plt.xticks(np.arange(0.5, len(corrs.columns), 1), corrs.columns)
    # Zoom onto this panel's 15 x 15 diagonal block.
    plt.xlim([n, n + 15])
    plt.ylim([n, n + 15])
    plt.colorbar()
plt.tight_layout()
In [42]:
# Number of loans per stated purpose, most common first.
purpose_counts = df['purpose'].value_counts()
purpose_counts
Out[42]: