In [2]:
## Frame - Predict Default
import numpy as np
import pandas as pd
In [5]:
# Acquire the Data
# Load the loanize CSV (relative path) into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="../../loanize.csv")
In [6]:
# Refine the Data
# Fill missing `years` values with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `df.years`: an in-place call on that intermediate
# Series is chained assignment, which pandas warns about and which leaves
# `df` unchanged when Copy-on-Write is enabled.
df['years'] = df['years'].fillna(df['years'].median())
In [18]:
# Transform the Data - Log Normalize
# Add a natural-log version of each numeric column. The new columns are
# appended in this exact order (age, income, amount, years), which matters
# because the feature matrix is later selected by column position.
for col in ('age', 'income', 'amount'):
    df['log_' + col] = np.log(df[col])

# `years` gets a small offset before the log so that a value of 0 does not
# produce -inf.
df['log_years'] = np.log(df['years'] + 1e-5)
In [37]:
# Transform - Label Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encode `grade` as integer labels; the fitted encoder is kept in
# `grade_encoder` so the mapping can be reused or inverted later.
grade_encoder = LabelEncoder()
df['grade_label'] = grade_encoder.fit_transform(df['grade'])

# Same treatment for `ownership`, with its own encoder.
ownership_encoder = LabelEncoder()
df['ownership_label'] = ownership_encoder.fit_transform(df['ownership'])
In [65]:
# Model - Features & Target
# Features are taken by position: columns 7..12 (six columns).
# NOTE(review): presumably these are the four log_* and two *_label columns
# appended by the transform cells, which requires the CSV to have exactly
# seven original columns - confirm against the data.
X = df.iloc[:, slice(7, 13)]
# Target is the first column of the frame.
y = df.iloc[:, 0]
In [69]:
# Model Creation
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: trees capped at depth 5, with a fixed seed so that
# repeated runs build the same forest.
clf = RandomForestClassifier(max_depth=5, random_state=1234)
In [70]:
# Model Selection
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC AUC for the baseline forest, run on all
# available cores; `score` holds one value per fold.
score = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring="roc_auc", n_jobs=-1)
# Display the average across the folds as the cell output.
score.mean()
Out[70]:
In [79]:
# Model Tuning
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes and tree depths: 6 x 3 = 18 combinations, each
# evaluated with 5-fold cross-validation.
parameters = {"n_estimators": [10, 50, 100, 200, 300, 400], "max_depth": [5, 10, 15]}

# Score candidates with ROC AUC, the same metric used in the Model Selection
# cell, so the tuned and baseline numbers are comparable. Without `scoring`
# the search falls back to the classifier's default score (accuracy).
clf2 = GridSearchCV(
    clf,
    parameters,
    scoring="roc_auc",
    return_train_score=True,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
clf2.fit(X, y)

# Report the winning parameters together with *their* mean cross-validated
# score. Averaging cv_results_['mean_test_score'] would mix in every losing
# candidate and understate what the selected model achieved.
clf2.best_params_, clf2.best_score_
Out[79]:
In [ ]:
# Leftover scratch cell (never executed): evaluating the bare name only echoes
# the GridSearchCV class imported in the Model Tuning cell; it has no effect
# on the analysis.
GridSearchCV