In [1]:
# Suppress sklearn's ConvergenceWarning before anything else: the
# hyperparameter grid below deliberately includes small max_iter values
# (15, 28), so many training runs stop before converging and would
# otherwise flood the output with warnings.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# stdlib
import itertools
import time
from multiprocessing import Pool
# third-party
# NOTE(review): `time` and `metrics` appear unused in the visible cells —
# confirm before removing.
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
In [2]:
try:
import wget
except ImportError:
!pip install wget # you may need pip3
import wget
In [3]:
# Fetch the census train/test splits from the Verta starter S3 bucket.
train_data_url = "http://s3.amazonaws.com/verta-starter/census-train.csv"
test_data_url = "http://s3.amazonaws.com/verta-starter/census-test.csv"

# wget.download() returns the local filename it saved to.
train_data_filename = wget.download(train_data_url)
test_data_filename = wget.download(test_data_url)
In [4]:
# Load the training data using the filename wget actually saved to.
# (Hardcoding "census-train.csv" breaks on re-runs: if the file already
# exists, wget saves as "census-train (1).csv" and the hardcoded path
# would silently read the stale copy.)
df_train = pd.read_csv(train_data_filename)
X_train = df_train.iloc[:, :-1].values  # feature matrix as ndarray
y_train = df_train.iloc[:, -1]          # label column (kept as Series)
df_train.head()
In [5]:
# Hyperparameter search space: 5 * 2 * 2 = 20 candidate models.
hyperparam_candidates = {
    'C': [1e-4, 1e-1, 1, 10, 1e3],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [15, 28],
}

# Expand the grid into one {name: value} dict per model configuration.
param_names = list(hyperparam_candidates.keys())
hyperparam_sets = [
    dict(zip(param_names, combo))
    for combo in itertools.product(*hyperparam_candidates.values())
]
In [6]:
# Hold out 20% of the training data for hyperparameter validation.
# random_state makes the split (and thus the reported accuracies)
# reproducible across kernel restarts.
(X_val_train, X_val_test,
 y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                             test_size=0.2,
                                                             shuffle=True,
                                                             random_state=42)

def run_experiment(hyperparams):
    """Train a LogisticRegression with `hyperparams` and report its
    accuracy on the held-out validation split.

    Returns the validation accuracy so callers can collect and compare
    results across the grid.
    """
    model = linear_model.LogisticRegression(**hyperparams)
    # BUG FIX: fit on the training portion of the split only. The original
    # fit on the full X_train, which contains the validation rows — that
    # leaks validation data into training and inflates val_acc.
    model.fit(X_val_train, y_val_train)
    val_acc = model.score(X_val_test, y_val_test)
    print(hyperparams, end=' ')
    print("Validation accuracy: {:.4f}".format(val_acc))
    return val_acc

# Train the 20 candidate models in parallel, one process per CPU core.
with Pool() as pool:
    val_accs = pool.map(run_experiment, hyperparam_sets)
In [7]:
# NOTE(review): best_hyperparams is left empty, so the final model is fit
# with sklearn's defaults — the grid-search results above are never used.
# TODO: fill in the winning hyperparameter set from the validation runs.
best_hyperparams = {}
# Refit on the full training set with the chosen hyperparameters.
model = linear_model.LogisticRegression(multi_class='auto', **best_hyperparams)
model.fit(X_train, y_train)
In [8]:
# Accuracy of the final model on the data it was trained on — an upper
# bound only; compare against held-out/test accuracy before drawing
# conclusions.
train_acc = model.score(X_train, y_train)
print("Training accuracy: {:.4f}".format(train_acc))
In [9]: