In [1]:
# !python3 -m pip install pandas
# !python3 -m pip install matplotlib
# !python3 -m pip install numpy
# !python3 -m pip install sklearn
# !python3 -m pip install scipy
# !python3 -m pip install mlflow

In [2]:
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import ElasticNet

# Data
df = pd.read_csv('https://raw.githubusercontent.com/fclesio/learning-space/master/Datasets/02%20-%20Classification/default_credit_card.csv')

# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(df)

# The predicted column is "quality" which is a scalar from [3, 9]
train_x = train.drop(["DEFAULT"], axis=1)
test_x = test.drop(["DEFAULT"], axis=1)
train_y = train[["DEFAULT"]]
test_y = test[["DEFAULT"]]

# Convert Pandas datasets to numpy arrays 
train_x = train_x.as_matrix()
test_x = test_x.as_matrix()
train_y = train_y.as_matrix()
test_y = test_y.as_matrix()

# Standardize the data attributes
train_x = preprocessing.scale(train_x)
test_x = preprocessing.scale(test_x)

# Set seed
np.random.seed(12345)


/usr/local/lib/python3.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/usr/local/lib/python3.7/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

In [3]:
# RandomForestClassifier
n_estimators = [500, 1000, 3000]
min_samples_splits = [50, 100, 500]
max_depths = [10, 20, 50]

for n_estimator in n_estimators:
    for min_samples_split in min_samples_splits:    
        for depth in max_depths:
            with mlflow.start_run():
                
                lr = RandomForestClassifier(n_estimators=n_estimator, 
                                            min_samples_split=min_samples_split,
                                            max_depth=depth)
            
                lr.fit(train_x, train_y)
                accuracy = lr.score(test_x, test_y)

                predicted_qualities = lr.predict(test_x)


                print("Random Forest model (n_estimators=%f, min_samples_split=%f, max_depth=%f):" % (n_estimator,min_samples_split,depth))
                print("Accuracy: %s" % round((accuracy * 100),2))

                # Logging in mlflow to appears in UI
                mlflow.log_param("n_estimators", n_estimator)
                mlflow.log_param("min_samples_split", min_samples_split)
                mlflow.log_param("max_depth", depth)
                
                mlflow.log_metric("accuracy", accuracy)
                mlflow.sklearn.log_model(lr, "model_random_forests")


Random Forest model (n_estimators=500.000000, min_samples_split=50.000000, max_depth=10.000000):
Accuracy: 82.67
Random Forest model (n_estimators=500.000000, min_samples_split=50.000000, max_depth=20.000000):
Accuracy: 82.63
Random Forest model (n_estimators=500.000000, min_samples_split=50.000000, max_depth=50.000000):
Accuracy: 82.6
Random Forest model (n_estimators=500.000000, min_samples_split=100.000000, max_depth=10.000000):
Accuracy: 82.81
Random Forest model (n_estimators=500.000000, min_samples_split=100.000000, max_depth=20.000000):
Accuracy: 82.76
Random Forest model (n_estimators=500.000000, min_samples_split=100.000000, max_depth=50.000000):
Accuracy: 82.72
Random Forest model (n_estimators=500.000000, min_samples_split=500.000000, max_depth=10.000000):
Accuracy: 82.73
Random Forest model (n_estimators=500.000000, min_samples_split=500.000000, max_depth=20.000000):
Accuracy: 82.67
Random Forest model (n_estimators=500.000000, min_samples_split=500.000000, max_depth=50.000000):
Accuracy: 82.75
Random Forest model (n_estimators=1000.000000, min_samples_split=50.000000, max_depth=10.000000):
Accuracy: 82.83
Random Forest model (n_estimators=1000.000000, min_samples_split=50.000000, max_depth=20.000000):
Accuracy: 82.69
Random Forest model (n_estimators=1000.000000, min_samples_split=50.000000, max_depth=50.000000):
Accuracy: 82.69
Random Forest model (n_estimators=1000.000000, min_samples_split=100.000000, max_depth=10.000000):
Accuracy: 82.75
Random Forest model (n_estimators=1000.000000, min_samples_split=100.000000, max_depth=20.000000):
Accuracy: 82.72
Random Forest model (n_estimators=1000.000000, min_samples_split=100.000000, max_depth=50.000000):
Accuracy: 82.77
Random Forest model (n_estimators=1000.000000, min_samples_split=500.000000, max_depth=10.000000):
Accuracy: 82.77
Random Forest model (n_estimators=1000.000000, min_samples_split=500.000000, max_depth=20.000000):
Accuracy: 82.75
Random Forest model (n_estimators=1000.000000, min_samples_split=500.000000, max_depth=50.000000):
Accuracy: 82.69
Random Forest model (n_estimators=3000.000000, min_samples_split=50.000000, max_depth=10.000000):
Accuracy: 82.71
Random Forest model (n_estimators=3000.000000, min_samples_split=50.000000, max_depth=20.000000):
Accuracy: 82.73
Random Forest model (n_estimators=3000.000000, min_samples_split=50.000000, max_depth=50.000000):
Accuracy: 82.71
Random Forest model (n_estimators=3000.000000, min_samples_split=100.000000, max_depth=10.000000):
Accuracy: 82.77
Random Forest model (n_estimators=3000.000000, min_samples_split=100.000000, max_depth=20.000000):
Accuracy: 82.75
Random Forest model (n_estimators=3000.000000, min_samples_split=100.000000, max_depth=50.000000):
Accuracy: 82.79
Random Forest model (n_estimators=3000.000000, min_samples_split=500.000000, max_depth=10.000000):
Accuracy: 82.79
Random Forest model (n_estimators=3000.000000, min_samples_split=500.000000, max_depth=20.000000):
Accuracy: 82.76
Random Forest model (n_estimators=3000.000000, min_samples_split=500.000000, max_depth=50.000000):
Accuracy: 82.76

In [4]:
# ExtraTreesClassifier
n_estimators = [1000, 3000]
max_depths = [20, 30]
min_samples_leafs = [20, 50]

for n_estimator in n_estimators:
    for min_samples_leaf in min_samples_leafs:    
        for depth in max_depths:
            with mlflow.start_run():
                
                lr = ExtraTreesClassifier(n_estimators=n_estimator, 
                                          min_samples_leaf=min_samples_leaf,
                                          max_depth=depth)
            
                lr.fit(train_x, train_y)
                accuracy = lr.score(test_x, test_y)

                predicted_qualities = lr.predict(test_x)

                print("Gradient Boosting model (n_estimators=%f, min_samples_leaf=%f, max_depth=%f):" % (n_estimator, min_samples_leaf, depth))
                print("Accuracy: %s" % round((accuracy * 100),2))

                # Logging in mlflow to appears in UI
                mlflow.log_param("n_estimators", n_estimator)
                mlflow.log_param("min_samples_leaf", min_samples_leaf)
                mlflow.log_param("max_depth", depth)
                
                mlflow.log_metric("accuracy", accuracy)
                mlflow.sklearn.log_model(lr, "model_extratrees_classifier")


Gradient Boosting model (n_estimators=1000.000000, min_samples_leaf=20.000000, max_depth=20.000000):
Accuracy: 82.0
Gradient Boosting model (n_estimators=1000.000000, min_samples_leaf=20.000000, max_depth=30.000000):
Accuracy: 81.88
Gradient Boosting model (n_estimators=1000.000000, min_samples_leaf=50.000000, max_depth=20.000000):
Accuracy: 81.09
Gradient Boosting model (n_estimators=1000.000000, min_samples_leaf=50.000000, max_depth=30.000000):
Accuracy: 81.05
Gradient Boosting model (n_estimators=3000.000000, min_samples_leaf=20.000000, max_depth=20.000000):
Accuracy: 81.96
Gradient Boosting model (n_estimators=3000.000000, min_samples_leaf=20.000000, max_depth=30.000000):
Accuracy: 81.96
Gradient Boosting model (n_estimators=3000.000000, min_samples_leaf=50.000000, max_depth=20.000000):
Accuracy: 81.0
Gradient Boosting model (n_estimators=3000.000000, min_samples_leaf=50.000000, max_depth=30.000000):
Accuracy: 81.01

In [5]:
# LogisticRegression
penalties  = ['l2']
fit_intercepts = [False, True]
Cs = [0.03, 0.02, 0.01, 0.005, 0.004, 0.003, 0.002, 0.001]
solvers = ['newton-cg', 'sag', 'saga', 'lbfgs']

for penalty in penalties:
    for fit_intercept in fit_intercepts:    
        for C in Cs:
            for solver in solvers:
                with mlflow.start_run():
                    lr = LogisticRegression(penalty=penalty,
                                        fit_intercept=fit_intercept,
                                        C=C,
                                        solver=solver)
            
                    lr.fit(train_x, train_y)
                    accuracy = lr.score(test_x, test_y)

                    predicted_qualities = lr.predict(test_x)

                    print("Accuracy: %s" % round((accuracy * 100),2))

                    # Logging in mlflow to appears in UI
                    mlflow.log_param("penalty", penalty)
                    mlflow.log_param("fit_intercept", fit_intercept)
                    mlflow.log_param("C", C)

                    mlflow.log_metric("accuracy", accuracy)
                    mlflow.sklearn.log_model(lr, "model_logistic_regression")


Accuracy: 58.73
Accuracy: 58.73
Accuracy: 58.72
Accuracy: 58.73
Accuracy: 58.69
Accuracy: 58.71
Accuracy: 58.71
Accuracy: 58.69
Accuracy: 58.57
Accuracy: 58.59
Accuracy: 58.59
Accuracy: 58.57
Accuracy: 58.33
Accuracy: 58.33
Accuracy: 58.33
Accuracy: 58.33
Accuracy: 58.29
Accuracy: 58.31
Accuracy: 58.31
Accuracy: 58.29
Accuracy: 58.23
Accuracy: 58.23
Accuracy: 58.23
Accuracy: 58.23
Accuracy: 58.2
Accuracy: 58.2
Accuracy: 58.2
Accuracy: 58.2
Accuracy: 57.92
Accuracy: 57.92
Accuracy: 57.92
Accuracy: 57.92
Accuracy: 81.71
Accuracy: 81.71
Accuracy: 81.69
Accuracy: 81.71
Accuracy: 81.67
Accuracy: 81.67
Accuracy: 81.67
Accuracy: 81.67
Accuracy: 81.71
Accuracy: 81.71
Accuracy: 81.71
Accuracy: 81.71
Accuracy: 81.65
Accuracy: 81.65
Accuracy: 81.65
Accuracy: 81.65
Accuracy: 81.69
Accuracy: 81.69
Accuracy: 81.69
Accuracy: 81.69
Accuracy: 81.67
Accuracy: 81.67
Accuracy: 81.67
Accuracy: 81.67
Accuracy: 81.63
Accuracy: 81.63
Accuracy: 81.63
Accuracy: 81.63
Accuracy: 81.29
Accuracy: 81.29
Accuracy: 81.29
Accuracy: 81.29