Credit Risk Model


In [2]:
## Frame - Predict Default
import numpy as np
import pandas as pd

In [5]:
# Acquire the Data
df = pd.read_csv("../../loanize.csv")

In [6]:
# Refine the Data
df.years.fillna(df.years.median(), inplace = True)

In [18]:
# Transform the Data - Log Normalize
df['log_age'] = np.log(df.age)
df['log_income'] = np.log(df.income)
df['log_amount'] = np.log(df.amount)
df['log_years'] = np.log(df.years + 0.00001)

In [37]:
# Transform - Encoding Label Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
grade_encoder = LabelEncoder().fit(df.grade)
df['grade_label'] = grade_encoder.transform(df.grade)
ownership_encoder = LabelEncoder().fit(df.ownership)
df['ownership_label'] = ownership_encoder.transform(df.ownership)

In [65]:
# Model - Features & Target
X = df.iloc[:,7:13]
y = df.iloc[:,0]

In [69]:
# Model Creation
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1234, max_depth=5)

In [70]:
# Model Selection
from sklearn.model_selection import cross_val_score
score = cross_val_score(clf, X, y, scoring="roc_auc", cv=5, n_jobs=-1)
np.mean(score)


Out[70]:
0.6843924134801082

In [79]:
# Model Tuning
from sklearn.model_selection import GridSearchCV
parameters = {"n_estimators": [10, 50 ,100, 200, 300, 400], "max_depth": [5, 10, 15]}
clf2 = GridSearchCV(clf, parameters, return_train_score=True, cv=5, verbose=1, n_jobs=-1)
clf2.fit(X,y)
clf2.best_params_, np.mean(clf2.cv_results_['mean_test_score'])


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   45.7s finished
Out[79]:
({'max_depth': 15, 'n_estimators': 100}, 0.6577153703464044)

In [ ]:
GridSearchCV