Logistic Regression with Hyperparameter Optimization (scikit-learn)

Imports


In [1]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

import itertools
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

Prepare Data


In [2]:
try:
    import wget
except ImportError:
    !pip install wget # you may need pip3
    import wget

In [3]:
# Remote locations of the census training and test CSV files.
train_data_url = "http://s3.amazonaws.com/verta-starter/census-train.csv"
test_data_url = "http://s3.amazonaws.com/verta-starter/census-test.csv"

# Fetch both files (training set first); the value returned by each
# wget.download call is kept as that file's local name.
train_data_filename = wget.download(train_data_url)
test_data_filename = wget.download(test_data_url)

In [4]:
# Read the training CSV from the path reported by the download cell rather
# than a hard-coded name, so the read follows whatever file wget actually saved.
df_train = pd.read_csv(train_data_filename)

# Every column except the last is used as model input; the last column is
# the target passed to fit()/score() in the cells below.
X_train = df_train.iloc[:, :-1].values
y_train = df_train.iloc[:, -1]

# preview the first rows as the cell output
df_train.head()

Prepare Hyperparameters


In [5]:
# Search grid: each key is a LogisticRegression keyword argument and each
# list holds the values to try for it.
hyperparam_candidates = dict(
    C=[1e-4, 1e-1, 1, 10, 1e3],
    solver=['liblinear', 'lbfgs'],
    max_iter=[15, 28],
)

# 5 values of C x 2 solvers x 2 iteration caps = 20 models in total

# Expand the grid into one keyword-argument dict per combination, in
# itertools.product order (the last key varies fastest).
hyperparam_sets = [
    {name: value for name, value in zip(hyperparam_candidates, combination)}
    for combination in itertools.product(*hyperparam_candidates.values())
]

Run Validation


In [6]:
# create validation split: hold out 20% of the training rows for scoring.
# random_state pins the shuffle so the split, and therefore every validation
# accuracy, is reproducible after a kernel restart.
(X_val_train, X_val_test,
 y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                             test_size=0.2,
                                                             shuffle=True,
                                                             random_state=0)

def run_experiment(hyperparams):
    """Fit one logistic regression candidate and report its validation accuracy.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments forwarded to ``linear_model.LogisticRegression``.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out validation split
        (the same value that is printed).
    """
    # create and train model on the training part of the split only;
    # fitting on the full X_train would include the validation rows and
    # make the validation accuracy meaningless
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, y_val_train)

    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    # a single print call keeps each result on one line when several
    # worker processes write to the output at the same time
    print(hyperparams, "Validation accuracy: {:.4f}".format(val_acc))
    return val_acc
    
# Evaluate every hyperparameter set in a pool of worker processes.
# The list returned by map is not kept; each run_experiment call prints its
# own result.
with Pool() as pool:
    pool.map(run_experiment, hyperparam_sets)

Pick the best hyperparameters and train on the full data


In [7]:
# Fill this in with the best-scoring set printed by the validation run;
# an empty dict leaves every LogisticRegression setting at its library default.
best_hyperparams = {}

# Construct the final model exactly like the validated candidates in
# run_experiment. The extra multi_class='auto' argument is dropped: the
# candidates never received it, and newer scikit-learn releases deprecate it.
model = linear_model.LogisticRegression(**best_hyperparams)
model.fit(X_train, y_train)

Calculate Accuracy on Full Training Set


In [8]:
# Fraction of training rows the final model labels correctly, i.e. accuracy
# measured on the same data the model was fitted on.
train_predictions = model.predict(X_train)
train_acc = metrics.accuracy_score(y_train, train_predictions)
print("Training accuracy: {:.4f}".format(train_acc))


In [9]: