In [1]:
import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the raw credit-default dataset and preview the first rows.
csv_path = "/data/credit-default.csv"
df = pd.read_csv(csv_path)
df.head()
Out[2]:
In [3]:
# Column dtypes and non-null counts — a quick check for missing values
# before building the imputation pipelines below.
df.info()
In [4]:
# Class balance of the target — context for interpreting the accuracy
# scores used by the grid searches below.
df.default.value_counts()
Out[4]:
In [5]:
# Split the frame into features X and an integer-encoded target y.
target = "default"
# Encode the string class labels as 0/1 integers for sklearn estimators.
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(df[target])
# Feature matrix: every column except the target.
X = df.drop(columns=[target])
X.head()
Out[5]:
In [6]:
# Names of the categorical (object-dtype) features.
# select_dtypes is the idiomatic pandas replacement for manually iterating
# over the dtypes mapping.
cat_columns = X.select_dtypes(include="object").columns.tolist()
cat_columns
Out[6]:
In [7]:
# Names of the numeric features: everything that is not object-dtype,
# mirroring the cat_columns selection above.
num_columns = X.select_dtypes(exclude="object").columns.tolist()
num_columns
Out[7]:
In [8]:
# Categorical branch: fill missing values with a sentinel, then one-hot encode.
# NOTE(review): handle_unknown='error' will raise if a CV validation fold
# contains a category unseen in its training fold — if that triggers, consider
# handle_unknown='ignore' (which is incompatible with drop='first').
cat_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='error', drop="first"))
])

# Numeric branch: median-impute, then standardize.
# Fix: the original 'poly' step, PolynomialFeatures(degree=1, include_bias=False),
# is an identity transform (degree 1 adds no new terms), so it has been removed;
# the transformed output is unchanged.
num_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('scaler', preprocessing.StandardScaler()),
])

# Route each branch to its column subset; outputs are concatenated column-wise.
preprocessing_pipe = compose.ColumnTransformer([
    ("cat", cat_pipe, cat_columns),
    ("num", num_pipe, num_columns)
])
In [9]:
# Baseline model: logistic regression on the preprocessed features.
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", linear_model.LogisticRegression(random_state=1, solver="liblinear"))
])

# Fix: the original grid used unseeded np.random.random(10) + 1, which made
# the candidate C values — and therefore the "best" result — different on
# every run. A deterministic grid over the same [1, 2] range keeps
# Restart-and-Run-All reproducible.
param_grid = {
    "est__C": np.linspace(1.0, 2.0, 10)
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5,
                                       verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_,
      "Best parameters: ", gsearch.best_params_)
In [10]:
# Hard-voting ensemble over three diverse base classifiers.
# C=1.53 for the logistic model comes from the earlier grid search.
log_clf = linear_model.LogisticRegression(C=1.53, solver="liblinear", random_state=1)
rnd_clf = ensemble.RandomForestClassifier(max_depth=6, n_estimators=30, random_state=1)
svm_clf = svm.SVC(C=1.0, gamma=0.15, random_state=1)

voting_members = [('lr', log_clf), ('rf', rnd_clf), ('svm', svm_clf)]
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.VotingClassifier(voting="hard", estimators=voting_members))
])

# Tune only the SVM's regularization strength inside the ensemble.
param_grid = {
    "est__svm__C": np.linspace(1.0, 20, 10)
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5,
                                       verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)
In [11]:
# AdaBoost with a logistic-regression base estimator.
# NOTE(review): the `base_estimator` positional/param-grid naming and
# algorithm="SAMME.R" match older sklearn releases; newer versions renamed
# the parameter to `estimator` — confirm against the pinned sklearn version.
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.AdaBoostClassifier(
        linear_model.LogisticRegression(random_state=1, solver="liblinear"),
        n_estimators=200,
        algorithm="SAMME.R",
        learning_rate=0.051)
    )
])

# Fix: replace the unseeded np.random.random(10) + 1 grid (irreproducible
# across runs) with a deterministic grid spanning the same [1, 2] interval.
param_grid = {
    "est__base_estimator__C": np.linspace(1.0, 2.0, 10)
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5,
                                       verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)
In [12]:
# Bagged decision trees on 50% bootstrap samples.
# Fix: the original set no random_state, so the bootstrap sampling — and
# therefore the CV scores and "best" depth — changed on every run. Both the
# bagger and the base tree are now seeded, consistent with the rest of the
# notebook (random_state=1).
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.BaggingClassifier(
        tree.DecisionTreeClassifier(random_state=1),
        max_samples=0.5,
        n_estimators=50,
        bootstrap=True,
        oob_score=True,   # out-of-bag score is computed but only the CV score is reported
        random_state=1)
    )
])
param_grid = {
    "est__base_estimator__max_depth": np.arange(5, 15)
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5,
                                       verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)
In [18]:
# Gradient boosting with a 2-D grid over tree depth and learning rate.
# 7 depths x 10 learning rates x 5 folds = 350 fits — this is the slow cell.
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.GradientBoostingClassifier(random_state=1))
])

depth_grid = np.arange(3, 10)
lr_grid = np.linspace(0.01, 1, 10)
param_grid = {
    "est__max_depth": depth_grid,
    "est__learning_rate": lr_grid
}
gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv=5,
                                       verbose=1, n_jobs=8, scoring="accuracy")
gsearch.fit(X, y)
print("Best score: ", gsearch.best_score_,
      "Best parameters: ", gsearch.best_params_)
In [23]:
# Full cross-validation results of the last (gradient boosting) search,
# one row per parameter combination.
scores = pd.DataFrame(gsearch.cv_results_)
scores.head()
Out[23]:
In [24]:
# Row(s) with the best mean test score (rank 1) from the CV results table.
scores[scores.rank_test_score == 1]
Out[24]:
In [ ]: