In [1]:
# Frame - Predict Default
import numpy as np
import pandas as pd
In [2]:
# Acquire Data
# Read the historical loan records from CSV into the working frame `df`.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='credit-risk/data/historical_loan.csv')
In [3]:
# Refine Data
# Replace missing values in the `years` column with that column's median.
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `df.years`: that form mutates an intermediate
# Series, which pandas warns about and which no longer updates `df` once
# Copy-on-Write is active. Plain assignment is also idempotent on re-run.
df['years'] = df['years'].fillna(df['years'].median())
In [4]:
# Transform Data
from sklearn.preprocessing import LabelEncoder

# Swap the `ownership` and `grade` values for integer codes. Each column gets
# its own throwaway encoder, so the fitted label mappings are not retained.
df['ownership'] = LabelEncoder().fit_transform(df['ownership'])
df['grade'] = LabelEncoder().fit_transform(df['grade'])

# Natural-log transform of `income`.
# NOTE(review): np.log yields -inf for 0 and NaN for negatives; presumably
# income is strictly positive in this dataset - confirm.
df['income'] = np.log(df['income'])
In [5]:
# Model - Data & Target
# Target: the first column of the frame (position 0).
# NOTE(review): presumably this is the default label - confirm against the CSV header.
y = df.iloc[:, 0]
# Features: the columns at positions 1 through 7 (the slice end, 8, is exclusive).
X = df.iloc[:, 1:8]
In [6]:
# Model Creation
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap sampling and random feature
# subsets). Pinning `random_state` makes the scores and tuning results below
# reproducible across kernel restarts; all other hyperparameters stay at the
# library defaults.
clf = RandomForestClassifier(random_state=42)
In [7]:
# Model Selection
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the untuned classifier, scored by ROC AUC,
# using every available core (n_jobs=-1).
score = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
# Mean ROC AUC across the folds; shown as the cell output.
score.mean()
Out[7]:
In [8]:
# Model Tuning
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: forest size and maximum tree depth.
parameters = {'n_estimators': [10, 50, 100], 'max_depth': [5, 10]}

# Exhaustive grid search over `parameters`. `scoring` is set to ROC AUC so the
# search optimises the same metric the cross-validation cell reports; without
# it, GridSearchCV falls back to the estimator's default score (accuracy for
# classifiers). `cv=5` mirrors the fold count used there, and `n_jobs=-1`
# evaluates the candidates in parallel.
clf2 = GridSearchCV(
    clf,
    parameters,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
clf2.fit(X, y)

# Best parameter combination found; shown as the cell output.
# Per-candidate mean scores are available in clf2.cv_results_['mean_test_score'].
clf2.best_params_
Out[8]: