In [1]:
#importing things
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [3]:
# Load the loan dataset into a DataFrame. The path is relative, so
# 'loan_data.csv' is resolved against the kernel's current working directory.
df = pd.read_csv('loan_data.csv')
In [4]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
df.head()
Out[4]:
In [5]:
# Print the column names, dtypes and non-null counts of the frame.
df.info()
In [6]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()
Out[6]:
In [7]:
# Initial exploration: FICO score distribution split by the `credit.policy`
# flag (1 vs 0), drawn as two overlaid, semi-transparent histograms.
plt.figure(figsize=(10,6))
# Attach each label to the histogram it describes so the legend text cannot
# drift out of sync with the plotting order (the previous hand-written legend
# list also misspelled "credit.policy").
plt.hist(df.loc[df['credit.policy']==1,'fico'],bins=30,alpha=0.5,
         edgecolor='black',label='credit.policy=1')
plt.hist(df.loc[df['credit.policy']==0,'fico'],color='red',bins=30,alpha=0.5,
         edgecolor='black',label='credit.policy=0')
plt.legend()
plt.xlabel("FICO")
plt.ylabel("Count")
plt.title("FICO distribution by credit.policy")
Out[7]:
In [8]:
# FICO score distribution split by the `not.fully.paid` flag (1 vs 0),
# drawn as two overlaid, semi-transparent histograms.
plt.figure(figsize=(10,6))
# Label each histogram at the call site; the previous hand-written legend list
# carried a stray leading dot in ".not.fully.paid=1".
plt.hist(df.loc[df['not.fully.paid']==1,'fico'],bins=30,alpha=0.5,
         edgecolor='black',label='not.fully.paid=1')
plt.hist(df.loc[df['not.fully.paid']==0,'fico'],color='red',bins=30,alpha=0.5,
         edgecolor='black',label='not.fully.paid=0')
plt.legend()
plt.xlabel("FICO")
plt.ylabel("Count")
plt.title("FICO distribution by not.fully.paid")
Out[8]:
In [10]:
# Count of loans per `purpose`, with bars split by the `not.fully.paid` flag.
plt.figure(figsize=(12,6))
# Pass column names plus data= rather than positional Series: current seaborn
# accepts only `data` positionally, so a positional Series is no longer read
# as the x variable.
sns.countplot(x='purpose',hue='not.fully.paid',data=df)
Out[10]:
In [12]:
# Joint distribution of FICO score and interest rate.
# x/y are passed by keyword and the figure size via `height`: seaborn renamed
# `size` to `height` in v0.9 and later made x/y keyword-only.
sns.jointplot(x='fico',y='int.rate',data=df,color='purple',height=8,ratio=3,space=0.4)
Out[12]:
In [13]:
# Regression of interest rate on FICO score: one panel per `not.fully.paid`
# value (col=), with separate fits coloured by `credit.policy` (hue=).
# `height` replaces the removed `size` keyword (seaborn >= 0.9), and x/y are
# passed by keyword as required by current seaborn.
sns.lmplot(x='fico',y='int.rate',data=df,hue='credit.policy',col='not.fully.paid',height=8)
Out[13]:
In [14]:
# Modelling section: decision trees.
# Let's set it up by re-checking the column dtypes and non-null counts first.
df.info()
In [16]:
# `purpose` is handled as a categorical feature: expand it into dummy
# (indicator) columns. drop_first=True omits the first level so the remaining
# indicators are not redundant with each other.
cat_feats = ['purpose']
final_data = pd.get_dummies(
    data=df,
    drop_first=True,
    columns=cat_feats,
)
In [17]:
# Let's look at the first rows of the dummy-encoded frame.
final_data.head()
Out[17]:
In [18]:
# Separate the target column from the predictors ahead of the train/test split:
# y is the `not.fully.paid` column, X is every other column of final_data.
y = final_data['not.fully.paid']
X = final_data.drop(columns='not.fully.paid')
In [19]:
# Import the splitter. train_test_split lives in sklearn.model_selection; the
# old sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
In [20]:
# Hold out 30% of the rows for testing (any ratio can be tried here); the
# fixed random_state keeps the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)
In [21]:
# Train a single decision tree on the training split.
from sklearn.tree import DecisionTreeClassifier
# Seed the estimator (same value as the train/test split) so the fitted tree,
# and every metric derived from it, is reproducible on Restart & Run All.
dTree = DecisionTreeClassifier(random_state=101)
dTree.fit(X_train,y_train)
Out[21]:
In [22]:
from sklearn.metrics import classification_report,confusion_matrix

# Predict on the held-out rows with the fitted tree, then print the
# per-class classification report against the true labels.
predictions = dTree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))
In [23]:
# Confusion matrix of the decision-tree predictions on the test split.
print(confusion_matrix(y_true=y_test, y_pred=predictions))
In [24]:
# Let's fit a random forest: an ensemble of 300 trees, to compare against the
# single decision tree.
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (same value as the train/test split) so the fitted ensemble
# and its reported metrics are reproducible between runs.
rfc = RandomForestClassifier(n_estimators=300,random_state=101)
rfc.fit(X_train,y_train)
Out[24]:
In [25]:
# Predict on the held-out rows with the fitted forest, then print the
# per-class classification report against the true labels.
rfc_pred = rfc.predict(X_test)
print(classification_report(y_true=y_test, y_pred=rfc_pred))
In [26]:
# Confusion matrix of the random-forest predictions on the test split.
print(confusion_matrix(y_true=y_test, y_pred=rfc_pred))
In [ ]: