In [13]:
    
import pandas as pd
import numpy as np
    
In [14]:
    
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
    
In [15]:
    
loans = pd.read_csv("loan_data.csv")
    
In [16]:
    
loans.head()
    
    Out[16]:
In [17]:
    
loans.info()
    
    
In [18]:
    
loans.describe()
    
    Out[18]:
In [19]:
    
plt.figure(figsize=(10,6))
loans[loans['credit.policy']==1]['fico'].hist(alpha=0.5,color='blue',
                                              bins=30,label='Credit.Policy=1')
loans[loans['credit.policy']==0]['fico'].hist(alpha=0.5,color='red',
                                              bins=30,label='Credit.Policy=0')
plt.legend()
plt.xlabel('FICO')
    
    Out[19]:
    
In [20]:
    
plt.figure(figsize=(10,6))
loans[loans['not.fully.paid']==1]['fico'].hist(alpha=0.5,color='blue',
                                              bins=30,label='not.fully.paid=1')
loans[loans['not.fully.paid']==0]['fico'].hist(alpha=0.5,color='red',
                                              bins=30,label='not.fully.paid=0')
plt.legend()
plt.xlabel('FICO')
    
    Out[20]:
    
In [22]:
    
sns.countplot(x="purpose", hue="not.fully.paid", data=loans)
    
    Out[22]:
    
In [23]:
    
sns.jointplot(x='fico', y='int.rate', data=loans)
    
    Out[23]:
    
In [24]:
    
# lmplot is figure-level (it creates its own figure), so a preceding
# plt.figure call has no effect; size is set via height/aspect instead
sns.lmplot(y='int.rate', x='fico', data=loans, hue='credit.policy',
           col='not.fully.paid', palette='Set1', height=6)
    
    Out[24]:
    
    
In [25]:
    
loans.info()
    
    
In [28]:
    
cat_feats = ["purpose"]
    
In [30]:
    
final_data = pd.get_dummies(loans, columns = cat_feats, drop_first = True)
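
As a quick sanity check (optional; the exact column names depend on the purpose categories present in loan_data.csv), you can confirm that "purpose" was replaced by one-hot indicator columns:

In [ ]:

final_data.columns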
    
In [31]:
    
from sklearn.model_selection import train_test_split
    
In [32]:
    
X = final_data.drop("not.fully.paid", axis=1)
y = final_data["not.fully.paid"]
# fixed random_state so the split (and the scores below) are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=101)
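
If the classes are imbalanced (far fewer not.fully.paid == 1 rows than 0), an optional variant passes stratify=y so train and test keep the same class ratio; a sketch:

In [ ]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=101)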
    
In [33]:
    
from sklearn.tree import DecisionTreeClassifier
    
In [34]:
    
tree = DecisionTreeClassifier()
    
In [35]:
    
tree.fit(X_train, y_train)
    
    Out[35]:
In [36]:
    
tree_pred = tree.predict(X_test)
    
In [37]:
    
from sklearn.metrics import confusion_matrix, classification_report
    
In [38]:
    
print(confusion_matrix(y_test, tree_pred))
print("\n")
print(classification_report(y_test, tree_pred))
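
For a single headline number alongside the report, scikit-learn's accuracy_score returns the fraction of correct predictions (the trace of the confusion matrix divided by its sum):

In [ ]:

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, tree_pred))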
    
    
In [39]:
    
from sklearn.ensemble import RandomForestClassifier
    
In [40]:
    
rfc = RandomForestClassifier(n_estimators=600)
    
In [41]:
    
rfc.fit(X_train, y_train)
    
    Out[41]:
In [42]:
    
rfc_pred = rfc.predict(X_test)
    
In [43]:
    
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred))
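
Because fully-paid loans dominate the data, accuracy alone can look strong while recall on the not.fully.paid == 1 class stays weak; a threshold-free check (a sketch using the forest's class-1 probabilities) is ROC AUC:

In [ ]:

from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1]))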
    
    
In [44]:
    
##### Finding the optimal number of trees
    
In [45]:
    
error_rates = []
# fit a forest at each candidate tree count and record its test-set error
for i in range(50, 1000, 50):
    c = RandomForestClassifier(n_estimators=i)
    c.fit(X_train, y_train)
    i_pred = c.predict(X_test)
    error = np.mean(y_test != i_pred)  # fraction of misclassified test rows
    error_rates.append(error)
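
The loop above tunes n_estimators against the single held-out test set, so the chosen value is partly fit to that split; a more robust sketch uses 5-fold cross-validation on the training data instead:

In [ ]:

from sklearn.model_selection import cross_val_score

cv_error_rates = []
for i in range(50, 1000, 50):
    scores = cross_val_score(RandomForestClassifier(n_estimators=i),
                             X_train, y_train, cv=5)
    cv_error_rates.append(1 - scores.mean())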
    
In [48]:
    
plt.figure(figsize=(10, 6))
plt.plot(range(50, 1000, 50), error_rates, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title("Error Rate vs. Number of Trees")
plt.xlabel("Number of trees (n_estimators)")
plt.ylabel("Error Rate")
    
    Out[48]:
    
In [49]:
    
# retrain at a tree count suggested by the error-rate sweep above
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred))
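
As an optional follow-up, the fitted forest exposes per-feature importances, which hint at which columns drive its predictions:

In [ ]:

importances = pd.Series(rfc.feature_importances_, index=X.columns)
importances.sort_values(ascending=False).head(10)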
    
    
In [ ]: