Using the GLM for Classification

In this notebook we test the GLM for binary classification on the USPS handwritten digits dataset, and compare it against a selection of classifiers from scikit-learn.


In [1]:
%matplotlib inline
import logging
import numpy as np
from scipy.stats import gamma
import matplotlib.pyplot as pl
pl.style.use('ggplot')
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from revrand import GeneralizedLinearModel, Parameter, Positive
from revrand.utils.datasets import fetch_gpml_usps_resampled_data
from revrand.basis_functions import RandomRBF
from revrand.likelihoods import Bernoulli
from revrand.optimize import AdaDelta, Adam

# Show revrand's INFO-level log output in this notebook
logging.basicConfig(level=logging.INFO)

Settings


In [2]:
# Which digits to classify
dig1 = 3
dig2 = 5

# Algorithm settings
nbases = 800
lenscale = gamma(1, scale=6.)
random_state = 150
nsamples = 50
maxiter = 10000
updater = Adam(epsilon=1e-8)

# Bounded variables
lenscale_init = Parameter(lenscale, Positive())

# Feature Transform
basis = RandomRBF(nbases, 256, lenscale=lenscale_init, random_state=random_state)

# SVC settings
svc_params = {
    'gamma': np.logspace(-1, 2, 10),
    'C': np.linspace(0.1, 10, 5)
}

# Gaussian Process settings
n_restarts = 10
kernel = 1**2 * RBF(length_scale=lenscale.mean(), length_scale_bounds=(1e-2, 1e2)) + WhiteKernel()
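
The gamma(1, scale=6.) prior on the lengthscale has mean shape * scale = 6. The GLM below learns the lengthscale under this prior, whereas the scikit-learn baselines simply fix it at the prior mean. A one-line check (not an original cell of this notebook):


In [ ]:
# Mean of the gamma(a=1, scale=6) prior: a * scale = 6.0.
# This fixed value is what the scikit-learn models use via lenscale.mean().
print(lenscale.mean())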

Get and load data


In [3]:
# Fetch/load
usps_resampled = fetch_gpml_usps_resampled_data()

# Training dataset
ind_fals = usps_resampled.train.targets == dig1
ind_true = usps_resampled.train.targets == dig2
ind_all = np.logical_or(ind_fals, ind_true) 

X = usps_resampled.train.data[ind_all]
Y = (usps_resampled.train.targets[ind_all] == dig2).astype(float)

# Test dataset
ind_fals = usps_resampled.test.targets == dig1
ind_true = usps_resampled.test.targets == dig2
ind_all = np.logical_or(ind_fals, ind_true) 

Xs = usps_resampled.test.data[ind_all]
Ys = (usps_resampled.test.targets[ind_all] == dig2).astype(float)
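
As a quick sanity check (not part of the original notebook), the sizes and class balance of the two splits can be printed; Y and Ys are 1.0 wherever the digit is dig2:


In [ ]:
# Hypothetical sanity check: split sizes and number of positive (dig2) examples
print("train:", X.shape, "positives:", int(Y.sum()))
print("test: ", Xs.shape, "positives:", int(Ys.sum()))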

Classify using revrand


In [4]:
llhood = Bernoulli()
glm = GeneralizedLinearModel(llhood,
                             basis,
                             random_state=random_state,
                             maxiter=maxiter,
                             nsamples=nsamples,
                             updater=updater
                             )
glm.fit(X, Y)

# Predict
pys_l = glm.predict(Xs)                  # predictive p(y = 1) for each test point
pys_l = np.vstack((1 - pys_l, pys_l)).T  # columns: [p(y = 0), p(y = 1)]
Eys_l = pys_l[:, 0] > 0.5                # flipped hard label (True = predicted class 0)


INFO:revrand.glm:Optimising parameters...
INFO:revrand.optimize.decorators:Evaluating random starts...
INFO:revrand.glm:Random starts: Iter -500: ELBO = -5656.793211685731, reg = 0.253406216378982, like_hypers = [], basis_hypers = 7.295168991387261
INFO:revrand.optimize.decorators:Best start found with objective = 1615.2967163964163
INFO:revrand.glm:Iter 0: ELBO = -1767.987652533809, reg = 2.2810477010071613, like_hypers = [], basis_hypers = 6.25412680989254
INFO:revrand.glm:Iter 500: ELBO = -499.0590802366584, reg = 3.5565383048981993, like_hypers = [], basis_hypers = 4.778698299921703
INFO:revrand.glm:Iter 1000: ELBO = -515.209497150816, reg = 4.406957974022703, like_hypers = [], basis_hypers = 4.569620764563347
INFO:revrand.glm:Iter 1500: ELBO = -409.28094979085705, reg = 5.039283430995326, like_hypers = [], basis_hypers = 4.613621946779851
INFO:revrand.glm:Iter 2000: ELBO = -445.357983449804, reg = 5.582208540765181, like_hypers = [], basis_hypers = 4.808277678337811
INFO:revrand.glm:Iter 2500: ELBO = -421.00940104717785, reg = 5.98938432997258, like_hypers = [], basis_hypers = 4.754803526716088
INFO:revrand.glm:Iter 3000: ELBO = -401.9144215460573, reg = 6.386663430860647, like_hypers = [], basis_hypers = 4.780127859587692
INFO:revrand.glm:Iter 3500: ELBO = -466.740884498409, reg = 6.652956853287503, like_hypers = [], basis_hypers = 4.947203275231476
INFO:revrand.glm:Iter 4000: ELBO = -445.61606951961704, reg = 6.923781257016669, like_hypers = [], basis_hypers = 4.5869318258718375
INFO:revrand.glm:Iter 4500: ELBO = -436.4063205669755, reg = 7.157824775332062, like_hypers = [], basis_hypers = 5.158434255067629
INFO:revrand.glm:Iter 5000: ELBO = -448.56370108044405, reg = 7.290201289514519, like_hypers = [], basis_hypers = 4.757076130678274
INFO:revrand.glm:Iter 5500: ELBO = -420.51595975594364, reg = 7.428074004368439, like_hypers = [], basis_hypers = 4.697369789614059
INFO:revrand.glm:Iter 6000: ELBO = -404.282584066576, reg = 7.575276334583827, like_hypers = [], basis_hypers = 4.794932989968227
INFO:revrand.glm:Iter 6500: ELBO = -441.1982688297304, reg = 7.663948191015947, like_hypers = [], basis_hypers = 4.71595527055678
INFO:revrand.glm:Iter 7000: ELBO = -404.518826093686, reg = 7.759813651988363, like_hypers = [], basis_hypers = 4.82248887513812
INFO:revrand.glm:Iter 7500: ELBO = -379.2834821978814, reg = 7.865463399879842, like_hypers = [], basis_hypers = 4.688107565288598
INFO:revrand.glm:Iter 8000: ELBO = -398.7435483353421, reg = 7.923530546021679, like_hypers = [], basis_hypers = 4.349173725330818
INFO:revrand.glm:Iter 8500: ELBO = -552.4936170653793, reg = 7.984965242063296, like_hypers = [], basis_hypers = 4.705995626001421
INFO:revrand.glm:Iter 9000: ELBO = -463.24557726709435, reg = 8.017223142478667, like_hypers = [], basis_hypers = 4.944896410580306
INFO:revrand.glm:Iter 9500: ELBO = -452.1602750977797, reg = 8.095423703572214, like_hypers = [], basis_hypers = 4.583834763557641
INFO:revrand.glm:Iter 9999: ELBO = -405.4033872818201, reg = 8.112221131107157, like_hypers = [], basis_hypers = 4.739208243495937
INFO:revrand.glm:Finished! reg = 8.116604513970342, likelihood_hypers = [], basis_hypers = 4.731641353183766, message: maxiter reached.

Classify using Scikit Learn Logistic regression with bases


In [5]:
lreg = LogisticRegression(penalty='l2', class_weight='balanced')
lreg.fit(basis.transform(X, lenscale.mean()), Y)

pys_r = lreg.predict_proba(basis.transform(Xs, lenscale.mean()))
Eys_r = 1 - lreg.predict(basis.transform(Xs, lenscale.mean()))
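
Note that basis.transform is evaluated three times with the same inputs and the same fixed lengthscale. A behaviour-preserving refactor (a sketch, assuming the cells above) computes the random features once per dataset and reuses them:


In [ ]:
# Cache the random features so the transform runs once per dataset
Phi = basis.transform(X, lenscale.mean())
Phis = basis.transform(Xs, lenscale.mean())

lreg = LogisticRegression(penalty='l2', class_weight='balanced')
lreg.fit(Phi, Y)
pys_r = lreg.predict_proba(Phis)
Eys_r = 1 - lreg.predict(Phis)  # flipped hard label, see the scoring section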

Classify using Scikit Learn SVC


In [6]:
svc = GridSearchCV(SVC(class_weight='balanced', probability=True), svc_params, n_jobs=-1)
svc.fit(X, Y)

pys_s = svc.predict_proba(Xs)
Eys_s = 1 - svc.predict(Xs)
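
GridSearchCV refits the best hyperparameter combination on the full training set; the selected values can be inspected afterwards (not part of the original run):


In [ ]:
# Cross-validated choice of gamma and C, and the corresponding mean CV score
print(svc.best_params_)
print(svc.best_score_)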

Classify using Scikit Learn GPC


In [7]:
gpc = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=n_restarts)
gpc.fit(X, Y)

pys_g = gpc.predict_proba(Xs)
Eys_g = 1 - gpc.predict(Xs)
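
The kernel hyperparameters are tuned by maximising the approximate log marginal likelihood (with n_restarts random restarts); the fitted kernel can be checked afterwards (not part of the original run):


In [ ]:
# Kernel after hyperparameter optimisation and its log marginal likelihood
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())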

Classify using Scikit Learn Random Forest


In [8]:
rfc = RandomForestClassifier(n_estimators=40, class_weight='balanced', random_state=random_state)
rfc.fit(X, Y)

pys_f = rfc.predict_proba(Xs)
Eys_f = 1 - rfc.predict(Xs)
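
Since each of the 256 features is a pixel of a 16 x 16 USPS image, the forest's feature importances can be viewed as an image. A small sketch (assuming the cells above):


In [ ]:
# Which pixels does the forest rely on? Reshape the 256 importances to 16 x 16.
pl.imshow(rfc.feature_importances_.reshape(16, 16), interpolation='none')
pl.title('Random forest pixel importances')
pl.colorbar()
pl.show()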

Score results


In [9]:
def print_scores(alg_name, y_true, y_pred, y_prob):
    # y_pred holds flipped hard labels (1 - predicted class), so accuracy_score
    # here measures the mismatch fraction, i.e. the error rate being printed.
    print("{}: av log loss = {:.4f}, error rate = {:.4f}"
          .format(alg_name,
                  log_loss(y_true, y_prob),
                  accuracy_score(y_true, y_pred)
                 )
         )

print_scores('GLM', Ys, Eys_l, pys_l)
print_scores('Logistic', Ys, Eys_r, pys_r)
print_scores('SVC', Ys, Eys_s, pys_s)
print_scores('GPC', Ys, Eys_g, pys_g)
print_scores('RFC', Ys, Eys_f, pys_f)


GLM: av log loss = 0.1138, error rate = 0.0207
Logistic: av log loss = 0.1734, error rate = 0.0362
SVC: av log loss = 0.1008, error rate = 0.0699
GPC: av log loss = 0.1405, error rate = 0.0259
RFC: av log loss = 0.1368, error rate = 0.0272
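
Because the Eys_* labels are flipped, accuracy_score inside print_scores reports the error rate, as labelled. The same number can be computed more directly, e.g. for the GLM (a sketch, assuming the cells above):


In [ ]:
# Direct error-rate computation for the GLM, without the label flip
glm_pred = (pys_l[:, 1] > 0.5).astype(float)  # predicted label in {0, 1}
print("GLM error rate:", 1 - accuracy_score(Ys, glm_pred))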

Distribution of probabilistic outputs


In [10]:
Ns = len(Xs)
pl.figure(figsize=(15, 10))
pl.hist([pys_l[:, 1], pys_r[:, 1]], 50, label=['GLM', 'logistic'])  # p(y = 1) columns
pl.ylabel('Count')
pl.xlabel('$p(y = 1)$')
pl.title('Histogram of Prediction Probabilities')
pl.legend()
pl.show()


