Concise ML


In [1]:
# Frame - Predict Default
import numpy as np
import pandas as pd

In [2]:
# Acquire Data
# Read the historical loan CSV (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='credit-risk/data/historical_loan.csv')

In [3]:
# Refine Data
# Impute missing `years` values with the column median.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df.years`: that form mutates an intermediate
# Series (chained assignment), which pandas 2.x warns about and which leaves
# `df` unchanged when Copy-on-Write is enabled.
df['years'] = df['years'].fillna(df['years'].median())

In [4]:
# Transform Data
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes; a fresh encoder is
# fitted per column and discarded afterwards.
for column in ('ownership', 'grade'):
    df[column] = LabelEncoder().fit_transform(df[column])

# Replace income with its natural logarithm.
df['income'] = np.log(df['income'])

In [5]:
# Model - Data & Target
# Target: the first column of `df` (position 0).
y = df.iloc[:, 0]
# Features: the columns at positions 1 through 7 (the slice end, 8, is exclusive).
# NOTE(review): which named columns these are depends on the CSV layout — confirm.
X = df.iloc[:, slice(1, 8)]

In [6]:
# Model Creation
from sklearn.ensemble import RandomForestClassifier
# Random forests are stochastic (bootstrap sampling and feature sub-sampling);
# a fixed seed makes the scores below reproducible across runs.
clf = RandomForestClassifier(random_state=42)

In [7]:
# Model Selection
from sklearn.model_selection import cross_val_score
# Evaluate `clf` with 5-fold cross-validation, scoring each fold by ROC AUC.
score = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
# Show the mean of the per-fold scores as the cell output.
score.mean()


Out[7]:
0.70162528816671299

In [8]:
# Model Tuning
from sklearn.model_selection import GridSearchCV
# Grid of candidate hyper-parameters: forest size and maximum tree depth.
parameters = {'n_estimators':[10, 50, 100], 'max_depth':[5, 10]}
# Tune on the same metric and fold count used for model selection (ROC AUC,
# 5 folds). Without an explicit `scoring`, GridSearchCV falls back to the
# estimator's default score (accuracy for classifiers), so the chosen
# parameters would be optimised for a different metric than the one reported.
clf2 = GridSearchCV(
    clf,
    parameters,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
clf2.fit(X, y)
# Best parameter combination found; per-combination scores are in clf2.cv_results_.
clf2.best_params_


Out[8]:
{'max_depth': 10, 'n_estimators': 100}