In [13]:
import pandas as pd
import numpy as np
In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [15]:
# Load the loan dataset from a CSV file resolved relative to the current working directory.
loans = pd.read_csv("loan_data.csv")
In [16]:
# Preview the first few rows to sanity-check the load.
loans.head()
Out[16]:
In [17]:
# Show column names, dtypes and non-null counts.
loans.info()
In [18]:
# Summary statistics for the frame's columns.
loans.describe()
Out[18]:
In [19]:
# Overlay the FICO score distributions of the two credit.policy groups
# on a single set of axes (semi-transparent so the overlap stays visible).
plt.figure(figsize=(10, 6))
for policy_flag, bar_color, legend_label in (
    (1, 'blue', 'Credit.Policy=1'),
    (0, 'red', 'Credit.Policy=0'),
):
    fico_scores = loans.loc[loans['credit.policy'] == policy_flag, 'fico']
    fico_scores.hist(alpha=0.5, color=bar_color, bins=30, label=legend_label)
plt.legend()
plt.xlabel('FICO')
Out[19]:
In [20]:
# Overlay the FICO score distributions of loans split on the
# `not.fully.paid` column. The legend labels name that column; the
# original labels were copied from the credit.policy plot and
# mislabelled both series.
plt.figure(figsize=(10,6))
loans[loans['not.fully.paid']==1]['fico'].hist(alpha=0.5,color='blue',
                                              bins=30,label='not.fully.paid=1')
loans[loans['not.fully.paid']==0]['fico'].hist(alpha=0.5,color='red',
                                              bins=30,label='not.fully.paid=0')
plt.legend()
plt.xlabel('FICO')
Out[20]:
In [22]:
# Count of loans per purpose, split by `not.fully.paid`.
# Columns are referenced by name with an explicit `data=` frame: current
# seaborn takes `data` as the first parameter and the plot variables as
# keyword-only, so the data Series must not be passed positionally.
sns.countplot(x="purpose", hue="not.fully.paid", data=loans)
Out[22]:
In [23]:
# Joint distribution of FICO score and interest rate.
# `x`/`y` must be given as keywords: seaborn >= 0.12 rejects positional
# data vectors for jointplot with a TypeError.
sns.jointplot(x='fico', y='int.rate', data=loans)
Out[23]:
In [24]:
# Linear fit of interest rate vs. FICO, one facet per `not.fully.paid`
# value and one hue per `credit.policy` value.
# lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) call only produced a stray empty figure and
# never affected the plot size; it has been removed. Use lmplot's
# `height` / `aspect` arguments if a different facet size is wanted.
sns.lmplot(y='int.rate', x='fico', data=loans, hue='credit.policy',
           col='not.fully.paid', palette='Set1')
Out[24]:
In [25]:
# Re-check column dtypes ahead of the categorical encoding step that follows.
loans.info()
In [28]:
# Columns to be one-hot encoded before modelling.
cat_feats = ["purpose"]
In [30]:
# One-hot encode the columns listed in cat_feats; drop_first=True omits the
# first level of each so the resulting dummy columns are not redundant.
final_data = pd.get_dummies(loans, columns = cat_feats, drop_first = True)
In [31]:
from sklearn.model_selection import train_test_split
In [32]:
# Separate features from the `not.fully.paid` target and hold out 30% of
# the rows for testing.
X = final_data.drop("not.fully.paid", axis=1)
y = final_data["not.fully.paid"]
# Fixed random_state so the split (and every metric computed from it) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [33]:
from sklearn.tree import DecisionTreeClassifier
In [34]:
# Single decision tree with default hyper-parameters. random_state is fixed
# because scikit-learn permutes candidate features at each split, so an
# unseeded tree can differ between runs.
tree = DecisionTreeClassifier(random_state=42)
In [35]:
# Train the decision tree on the training split.
tree.fit(X_train, y_train)
Out[35]:
In [36]:
# Predict labels for the held-out test split.
tree_pred = tree.predict(X_test)
In [37]:
from sklearn.metrics import confusion_matrix, classification_report
In [38]:
# Decision-tree evaluation on the test split: confusion matrix, a blank
# gap, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, tree_pred),
    "\n",
    classification_report(y_test, tree_pred),
    sep="\n",
)
In [39]:
from sklearn.ensemble import RandomForestClassifier
In [40]:
# Random forest with 600 trees. random_state is fixed so the bootstrap
# samples and feature subsets, and therefore the reported metrics, are
# reproducible.
rfc = RandomForestClassifier(n_estimators=600, random_state=42)
In [41]:
# Train the random forest on the training split.
rfc.fit(X_train, y_train)
Out[41]:
In [42]:
# Predict labels for the held-out test split with the random forest.
rfc_pred = rfc.predict(X_test)
In [43]:
# Random-forest evaluation on the test split: confusion matrix, a blank
# gap, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, rfc_pred),
    "\n",
    classification_report(y_test, rfc_pred),
    sep="\n",
)
In [44]:
##### Finding optimal number of trees
In [45]:
# Sweep the forest size from 50 to 950 trees (step 50) and record the
# misclassification rate on the test split for each size.
# NOTE(review): choosing n_estimators from test-set error makes the later
# test metrics optimistic; a separate validation split or the forest's
# out-of-bag score would be a cleaner selection criterion.
error_rates = []
for i in range(50, 1000, 50):
    # Same seed for every size so the curve reflects the tree count rather
    # than run-to-run sampling noise; n_jobs=-1 trains trees in parallel
    # without changing the fitted model.
    c = RandomForestClassifier(n_estimators=i, random_state=42, n_jobs=-1)
    c.fit(X_train, y_train)
    i_pred = c.predict(X_test)
    error = np.mean(y_test != i_pred)  # fraction of misclassified test rows
    error_rates.append(error)
In [48]:
# Plot the recorded error rate against the number of trees; the x values
# mirror the range used when error_rates was filled.
plt.figure(figsize=(10, 6))
plt.plot(
    range(50, 1000, 50),
    error_rates,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.title("Error Rate vs n trees")
plt.xlabel("n trees")
plt.ylabel("Error Rate")
Out[48]:
In [49]:
# Refit the random forest with 100 trees (presumably the size picked from
# the error-rate curve; confirm against the plot) and report its test-set
# performance. random_state is fixed so the printed metrics are reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred))
In [ ]: