CHAPTER 4 - LOGISTIC REGRESSION

Create a dummy dataset


In [2]:
%matplotlib inline

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, 
                           class_sep = 2.0, random_state=101)

plt.scatter(X[:, 0], X[:, 1], marker='o', c=y, linewidth=0, edgecolor=None)
plt.show()



In [6]:
y_orig = [0,0,0,0,0,0,1,1,1,1]
y_pred = [0,0,0,0,1,1,1,1,1,0]

Measuring the classifier's performance


In [7]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_orig, y_pred)


Out[7]:
array([[4, 2],
       [1, 3]])
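
In scikit-learn's convention, the rows of the confusion matrix are the true labels and the columns are the predicted ones. As a quick cross-check (a minimal sketch; the unpacked variable names are ours), the four counts can be read off directly:


In [ ]:
# Rows = true labels, columns = predicted labels, so ravel() gives
# tn, fp, fn, tp in that order for a binary problem
tn, fp, fn, tp = confusion_matrix(y_orig, y_pred).ravel()
print(tn, fp, fn, tp)   # expected from the matrix above: 4 2 1 3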

In [8]:
plt.matshow(confusion_matrix(y_orig, y_pred))
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()



In [5]:
from sklearn.metrics import accuracy_score
accuracy_score(y_orig, y_pred)


Out[5]:
0.69999999999999996

In [6]:
from sklearn.metrics import precision_score
precision_score(y_orig, y_pred)


Out[6]:
0.59999999999999998

In [7]:
from sklearn.metrics import recall_score
recall_score(y_orig, y_pred)


Out[7]:
0.75

In [8]:
from sklearn.metrics import f1_score
f1_score(y_orig, y_pred)


Out[8]:
0.66666666666666652
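
All of the scores above follow directly from the confusion-matrix counts; here is a minimal sketch of the same computations done by hand, using tp=3, fp=2 and fn=1 from the matrix above:


In [ ]:
# Recomputing precision, recall and F1 from their definitions
tp, fp, fn = 3, 2, 1
precision = tp / (tp + fp)                           # 3/5 = 0.6
recall = tp / (tp + fn)                              # 3/4 = 0.75
f1 = 2 * precision * recall / (precision + recall)   # harmonic mean, about 0.667
print(precision, recall, f1)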

In [9]:
from sklearn.metrics import classification_report
print(classification_report(y_orig, y_pred))


             precision    recall  f1-score   support

          0       0.80      0.67      0.73         6
          1       0.60      0.75      0.67         4

avg / total       0.72      0.70      0.70        10


In [21]:
# R squared is a regression score: it is computed here only to show that it
# is not a suitable measure for a classification problem (it can even be negative)
from sklearn.metrics import r2_score
r2_score(y_orig, y_pred)


Out[21]:
-0.25

Fitting the classifier


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y.astype(float), 
                                   test_size=0.33, random_state=101)

In [18]:
y_test.dtype


Out[18]:
dtype('float64')

In [19]:
y_test


Out[19]:
array([ 0.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,
        0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  1.,  0.])

In [20]:
from sklearn.linear_model import LinearRegression

# Fitting a plain linear regression on the binary target: the resulting
# predictions are continuous values, not class labels, and are not bounded to [0, 1]
regr = LinearRegression()
regr.fit(X_train, y_train)
regr.predict(X_test)


Out[20]:
array([ 0.06688448,  1.01981921,  1.08597427, -0.15225094,  1.05856628,
        0.8156161 ,  0.04837505,  0.7997539 ,  0.18942251, -0.03658995,
       -0.0462575 , -0.09640911,  1.0253004 , -0.17062754,  1.13642842,
        0.14052848, -0.00703683,  0.90903158,  1.26997191,  0.03606483,
       -0.19047191,  0.22476337, -0.05936491, -0.18559975,  0.28378888,
        0.01139188, -0.03559395,  0.22742328,  0.07485246,  1.24545626,
        0.13924533,  1.09388935,  0.35341582])
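
The linear model returns unbounded continuous values rather than class labels. To classify with it we would have to threshold its output, for instance at 0.5; the following is only a sketch of that idea (the threshold and the variable name are ours), and it motivates replacing the linear output with the sigmoid in the next section:


In [ ]:
# Turning the continuous linear-regression output into class labels
# by thresholding at 0.5 (a sketch, not part of the original workflow)
y_thresholded = (regr.predict(X_test) > 0.5).astype(int)
print(accuracy_score(y_test.astype(int), y_thresholded))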

The sigmoid (logistic) function


In [14]:
import numpy as np

def model(x):
    return 1 / (1 + np.exp(-x))

X_vals = np.linspace(-10, 10, 1000)
plt.plot(X_vals, model(X_vals), color='blue', linewidth=3)
plt.ylabel('sigma(t)')
plt.xlabel('t')

plt.show()
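
A few spot checks make the shape concrete: the sigmoid maps any real number into (0, 1), equals 0.5 at t = 0 and saturates quickly for large |t| (a minimal sketch using the model function defined above):


In [ ]:
# Spot-checking a few values of the sigmoid
for t in (-5, -1, 0, 1, 5):
    print(t, model(t))   # model(0) = 0.5; large |t| saturates towards 0 or 1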


Classification and decision boundaries


In [15]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train.astype(int))
y_clf = clf.predict(X_test)

print(classification_report(y_test, y_clf))


             precision    recall  f1-score   support

        0.0       1.00      0.95      0.98        22
        1.0       0.92      1.00      0.96        11

avg / total       0.97      0.97      0.97        33


In [16]:
# Example based on:
# Code source: Gaël Varoquaux, Modified for documentation by Jaques Grobler, License: BSD 3 clause

h = .02  # step size in the mesh

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.autumn)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', linewidth=0, cmap=plt.cm.Paired)

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()



In [17]:
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1]
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.autumn)

# The coefficient vector is orthogonal to the decision boundary and points
# towards the region where the probability of class 1 increases
ax = plt.gca()
ax.arrow(0, 0, clf.coef_[0][0], clf.coef_[0][1], head_width=0.5, head_length=0.5, fc='k', ec='k')
plt.scatter(0, 0, marker='o', c='k')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.show()



In [18]:
clf.coef_


Out[18]:
array([[-2.22576095, -0.28230719]])
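
The decision function is simply the linear combination of the features with these coefficients plus the intercept, and the predicted probability of class 1 is its sigmoid. A sketch that rebuilds predict_proba by hand for the test set (the intermediate variable names are ours):


In [ ]:
# Rebuilding the classifier's probabilities from coef_ and intercept_;
# this should match clf.predict_proba(X_test)[:, 1]
scores = np.dot(X_test, clf.coef_[0]) + clf.intercept_[0]
probs = 1. / (1. + np.exp(-scores))
print(np.allclose(probs, clf.predict_proba(X_test)[:, 1]))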

In [19]:
%timeit clf.fit(X, y)


1000 loops, best of 3: 577 µs per loop

In [20]:
%timeit clf.predict(X)


10000 loops, best of 3: 48.5 µs per loop

In [21]:
%timeit clf.predict_proba(X)


10000 loops, best of 3: 74.7 µs per loop

In [22]:
# Clean all previous variables
%reset -f

Multiclass Logistic Regression


In [23]:
%matplotlib inline

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=2, 
                           n_classes=3, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, 
                           class_sep = 2.0, random_state=101)

plt.scatter(X[:, 0], X[:, 1], marker='o', c=y, linewidth=0, edgecolor=None)
plt.show()



In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y.astype(float), 
                                   test_size=0.33, random_state=101)

In [25]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train.astype(int))
y_clf = clf.predict(X_test)

In [26]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_clf))


             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        24
        1.0       1.00      1.00      1.00        22
        2.0       1.00      1.00      1.00        20

avg / total       1.00      1.00      1.00        66


In [27]:
import numpy as np

h = .02  # step size in the mesh

x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.autumn)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()



In [28]:
print(X_test[0])
print(y_test[0])
print(y_clf[0])


[ 0.73255032  1.19639333]
0.0
0

In [29]:
clf.predict_proba(X_test[0].reshape(1, -1))


Out[29]:
array([[ 0.72797056,  0.06275109,  0.20927835]])

In [30]:
clf.decision_function(X_test[0].reshape(1, -1))


Out[30]:
array([[ 0.37213503, -2.92299969, -1.58433774]])
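
With three classes the classifier produces one score and one probability per class, and the predicted label is simply the class with the largest value. A minimal sketch of that relation for the same sample (the variable name is ours):


In [ ]:
# The predicted class is the arg-max of the per-class scores/probabilities
sample = X_test[0].reshape(1, -1)
print(np.argmax(clf.predict_proba(sample)))      # expected: 0, i.e. class 0
print(np.argmax(clf.decision_function(sample)))  # expected: 0 as well
print(clf.predict(sample))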

An example


In [31]:
X, y = make_classification(n_samples=10000, n_features=10, 
                           n_informative=10, n_redundant=0, 
                           random_state=101)

Using statsmodels


In [32]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [33]:
Xc = sm.add_constant(X)
logistic_regression = sm.Logit(y,Xc)
fitted_model = logistic_regression.fit()


Optimization terminated successfully.
         Current function value: 0.438685
         Iterations 7

In [34]:
fitted_model.summary()


Out[34]:
                           Logit Regression Results
==============================================================================
Dep. Variable:                      y   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9989
Method:                           MLE   Df Model:                           10
Date:                Mon, 26 Dec 2016   Pseudo R-squ.:                  0.3671
Time:                        10:45:22   Log-Likelihood:                -4386.8
converged:                       True   LL-Null:                       -6931.5
                                        LLR p-value:                     0.000
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.4299      0.039     11.023      0.000         0.353     0.506
x1             0.0671      0.015      4.410      0.000         0.037     0.097
x2            -0.7828      0.019    -41.947      0.000        -0.819    -0.746
x3             0.1221      0.016      7.815      0.000         0.091     0.153
x4             0.2841      0.016     18.150      0.000         0.253     0.315
x5             0.1469      0.014     10.283      0.000         0.119     0.175
x6            -0.3414      0.019    -17.636      0.000        -0.379    -0.303
x7             0.0503      0.014      3.481      0.000         0.022     0.079
x8            -0.1393      0.014     -9.642      0.000        -0.168    -0.111
x9             0.1127      0.014      7.931      0.000         0.085     0.141
x10           -0.4792      0.018    -27.340      0.000        -0.514    -0.445
==============================================================================
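
Since the model is linear in the log-odds, each coefficient is the change in log-odds for a unit increase of its feature; exponentiating it turns it into an odds ratio. A minimal sketch using the fitted_model above:


In [ ]:
# Coefficients are log-odds; exponentiating gives odds ratios
# (for example, exp(-0.7828) is about 0.46 for x2)
np.exp(fitted_model.params)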

In [35]:
import pandas as pd

# As an alternative, this example uses the statsmodels.formula.api module.
# Equivalent to the R formula syntax for linear models, it requires you to specify
# the logistic regression as 'response ~ predictor1 (+ predictor2 + ...)'

Xd = pd.DataFrame(X)
Xd.columns = ['VAR'+str(i+1) for i in range(10)]
Xd['response'] = y

logistic_regression = smf.logit(
    formula = 'response ~ VAR1 + VAR2 + VAR3 + VAR4 + \
               VAR5 + VAR6 + VAR7 + VAR8 + VAR9 + VAR10',
               data=Xd)

fitted_model = logistic_regression.fit()
fitted_model.summary()


Optimization terminated successfully.
         Current function value: 0.438685
         Iterations 7
Out[35]:
                           Logit Regression Results
==============================================================================
Dep. Variable:               response   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9989
Method:                           MLE   Df Model:                           10
Date:                Mon, 26 Dec 2016   Pseudo R-squ.:                  0.3671
Time:                        10:45:22   Log-Likelihood:                -4386.8
converged:                       True   LL-Null:                       -6931.5
                                        LLR p-value:                     0.000
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.4299      0.039     11.023      0.000         0.353     0.506
VAR1           0.0671      0.015      4.410      0.000         0.037     0.097
VAR2          -0.7828      0.019    -41.947      0.000        -0.819    -0.746
VAR3           0.1221      0.016      7.815      0.000         0.091     0.153
VAR4           0.2841      0.016     18.150      0.000         0.253     0.315
VAR5           0.1469      0.014     10.283      0.000         0.119     0.175
VAR6          -0.3414      0.019    -17.636      0.000        -0.379    -0.303
VAR7           0.0503      0.014      3.481      0.000         0.022     0.079
VAR8          -0.1393      0.014     -9.642      0.000        -0.168    -0.111
VAR9           0.1127      0.014      7.931      0.000         0.085     0.141
VAR10         -0.4792      0.018    -27.340      0.000        -0.514    -0.445
==============================================================================

Gradient Descent


In [36]:
from sklearn.preprocessing import StandardScaler
import numpy as np
observations = len(X)
variables = ['VAR'+str(i+1) for i in range(10)]

In [37]:
def random_w(p):
    # Random initialization of the weight vector
    return np.array([np.random.normal() for j in range(p)])

def sigmoid(X, w):
    return 1. / (1. + np.exp(-np.dot(X, w)))

def hypothesis(X, w):
    return np.dot(X, w)

def loss(X, w, y):
    return hypothesis(X, w) - y

def logit_loss(X, w, y):
    # Error term used by logistic regression: sigmoid(Xw) - y
    return sigmoid(X, w) - y

def squared_loss(X, w, y):
    return loss(X, w, y)**2

def gradient(X, w, y, loss_type=squared_loss):
    gradients = list()
    n = float(len(y))
    for j in range(len(w)):
        gradients.append(np.sum(loss_type(X, w, y) * X[:, j]) / n)
    return gradients

def update(X, w, y, alpha=0.01, loss_type=squared_loss):
    return [t - alpha*g for t, g in zip(w, gradient(X, w, y, loss_type))]

def optimize(X, y, alpha=0.01, eta=10**-12, loss_type=squared_loss, iterations=1000):
    # Gradient descent on standardized inputs; convergence is checked on the
    # sum of squared errors and the coefficients are mapped back to the
    # original scale of the features before being returned
    standardization = StandardScaler()
    Xst = standardization.fit_transform(X)
    original_means, original_stds = standardization.mean_, standardization.scale_
    Xst = np.column_stack((Xst, np.ones(observations)))
    w = random_w(Xst.shape[1])
    path = list()
    for k in range(iterations):
        SSL = np.sum(squared_loss(Xst, w, y))
        new_w = update(Xst, w, y, alpha=alpha, loss_type=loss_type)
        new_SSL = np.sum(squared_loss(Xst, new_w, y))
        w = new_w
        if k >= 5 and (new_SSL - SSL <= eta and new_SSL - SSL >= -eta):
            path.append(new_SSL)
            break
        if k % (iterations / 20) == 0:
            path.append(new_SSL)
    unstandardized_betas = w[:-1] / original_stds
    unstandardized_bias = w[-1] - np.sum((original_means / original_stds) * w[:-1])
    return np.insert(unstandardized_betas, 0, unstandardized_bias), path, k

alpha = 0.5
w, path, iterations = optimize(X, y, alpha, eta=10**-5, loss_type=logit_loss, iterations=100000)
print("These are our final coefficients, rescaled to the original features: %s" % w)
print("Reached after %i iterations" % (iterations+1))


These are our final coefficients, rescaled to the original features: [ 0.42991407  0.0670771  -0.78279578  0.12208733  0.28410285  0.14689341
 -0.34143436  0.05031078 -0.1393206   0.11267402 -0.47916908]
Reached after 885 iterations
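
As a sanity check, the coefficients found by our gradient descent can be compared with the maximum-likelihood estimates obtained with statsmodels; they should agree to a few decimal places (a sketch, assuming fitted_model is still in memory):


In [ ]:
# Comparing gradient-descent coefficients with the statsmodels MLE solution
# (both vectors are ordered as [intercept, VAR1, ..., VAR10])
print(np.allclose(w, fitted_model.params, atol=1e-3))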

Scikit-Learn implementations


In [38]:
from sklearn.linear_model import LogisticRegression

# C: Inverse of regularization strength; must be a positive float.
# tol: Tolerance for stopping criteria.

clf = LogisticRegression(C=1E4, tol=1E-25, random_state=101)
clf.fit(X,y)


Out[38]:
LogisticRegression(C=10000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=101,
          solver='liblinear', tol=1e-25, verbose=0, warm_start=False)

In [39]:
coeffs = [clf.intercept_[0]]
coeffs.extend(clf.coef_[0])
coeffs


Out[39]:
[0.42991394845314013,
 0.067077096874709571,
 -0.7827957661488677,
 0.12208730826867403,
 0.28410283693190325,
 0.14689340914475538,
 -0.34143434245188609,
 0.050310756492560116,
 -0.13932059152314755,
 0.11267402173781314,
 -0.47916904027905599]
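
With such a large C (that is, very weak regularization) and a tight tolerance, these values essentially reproduce the plain maximum-likelihood solution. A one-line sketch comparing them with the gradient-descent coefficients w computed earlier:


In [ ]:
# Nearly unregularized fit, so it matches the gradient-descent coefficients
print(np.allclose(coeffs, w, atol=1e-3))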

In [40]:
from sklearn.linear_model import SGDClassifier

# alpha: Constant that multiplies the regularization term.
# n_iter: The number of passes over the training data (aka epochs).


clf = SGDClassifier(loss='log', alpha=1E-4, n_iter=1E2, random_state=101)
clf.fit(X,y)


Out[40]:
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=100.0, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=101, shuffle=True,
       verbose=0, warm_start=False)

In [41]:
coeffs = [clf.intercept_[0]]
coeffs.extend(clf.coef_[0])
coeffs


Out[41]:
[0.42571117875899578,
 0.092754663986175323,
 -0.78381378869544116,
 0.093708745822509432,
 0.16756466505271234,
 0.10596527209458735,
 -0.41091578158018655,
 0.062219832489940376,
 -0.19435965629236057,
 0.23531208244782129,
 -0.48793778455042092]

In [ ]: