Let's take a look at the examples in scikit-learn's LogisticRegression documentation.
The Logistic Regression 3-class Classifier example is credited to
Code source: Gaël Varoquaux
Modified for documentation by Jaques Grobler
License: BSD 3 clause
In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
In [5]:
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # take the first two features. # EY : 20160503 type(X) is numpy.ndarray
Y = iris.target # EY : 20160503 type(Y) is numpy.ndarray
In [9]:
h = .02 # step size in the mesh
In [18]:
print "X shape: %s, Y shape: %s" % X.shape, Y.shape
In [5]:
logreg = linear_model.LogisticRegression(C=1e5)
In [20]:
# we create an instance of the LogisticRegression classifier and fit the data.
logreg.fit(X,Y)
Out[20]:
In [24]:
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max]
x_min, x_max = X[:,0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:,1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])
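To make the grid-prediction pattern above a little more explicit: np.c_[xx.ravel(), yy.ravel()] flattens the two meshgrid arrays and column-stacks them into a single (n_points, 2) array of coordinates, so predict returns one label per grid point. A quick sanity check of the shapes (a sketch, reusing xx and yy from above):
In [ ]:
grid_points = np.c_[xx.ravel(), yy.ravel()]  # one row per mesh point: (x, y)
print("mesh shape: %s, grid points shape: %s" % (xx.shape, grid_points.shape))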
In [29]:
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4,3))
plt.pcolormesh(xx,yy,Z, cmap=plt.cm.Paired)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.show()
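As a quick check of how well this two-feature model fits the iris training data, one can ask for the mean accuracy; a sketch using score, the standard scikit-learn classifier method:
In [ ]:
print("Training accuracy on the two iris features: %.3f" % logreg.score(X, Y))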
In [37]:
import os
print(os.getcwd())
print(os.path.abspath("./")) # find out "where you are" and "where the Data folder is" with these commands
Let's load the data for Exercise 2 of the Machine Learning course taught by Andrew Ng on Coursera.
In [3]:
ex2data1 = np.loadtxt("./Data/ex2data1.txt", delimiter=',') # you may have to change this path if you're running this from a different directory
ex2data2 = np.loadtxt("./Data/ex2data2.txt",delimiter=',')
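A quick look at what was loaded (a sketch): each row holds the two feature columns followed by the 0/1 label, which is why the slicing below splits off the first two columns as X and the last column as Y.
In [ ]:
print("ex2data1 shape: %s, ex2data2 shape: %s" % (ex2data1.shape, ex2data2.shape))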
In [27]:
X_ex2data1 = ex2data1[:,0:2]
Y_ex2data1 = ex2data1[:,2]
X_ex2data2 = ex2data2[:,:2]
Y_ex2data2 = ex2data2[:,2]
In [6]:
logreg.fit(X_ex2data1,Y_ex2data1)
Out[6]:
In [7]:
def trainingdat2mesh(X, marginsize=.5, h=0.2):
    """Build a mesh grid covering the range of each feature, padded by marginsize."""
    rows, features = X.shape
    ranges = []
    for feature in range(features):
        minrange = X[:, feature].min() - marginsize
        maxrange = X[:, feature].max() + marginsize
        ranges.append((minrange, maxrange))
    if len(ranges) == 2:
        xx, yy = np.meshgrid(np.arange(ranges[0][0], ranges[0][1], h), np.arange(ranges[1][0], ranges[1][1], h))
        return xx, yy
    else:
        return ranges
In [8]:
xx_ex2data1, yy_ex2data1 = trainingdat2mesh(X_ex2data1,h=0.2)
In [10]:
Z_ex2data1 = logreg.predict(np.c_[xx_ex2data1.ravel(),yy_ex2data1.ravel()])
In [12]:
Z_ex2data1 = Z_ex2data1.reshape(xx_ex2data1.shape)
plt.figure(2)
plt.pcolormesh(xx_ex2data1,yy_ex2data1,Z_ex2data1)
plt.scatter(X_ex2data1[:, 0], X_ex2data1[:, 1], c=Y_ex2data1, edgecolors='k')
plt.show()
Get the probability estimates; say a student has an Exam 1 score of 45 and an Exam 2 score of 85.
In [22]:
proba = logreg.predict_proba(np.array([[45, 85]])).flatten()
print("The student has a probability of no admission of %s and a probability of admission of %s" % tuple(proba))
Let's change the regularization strength via the C parameter of LogisticRegression. Call this new classifier logreg2.
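In scikit-learn, C is the inverse of the regularization strength, so smaller C means stronger regularization (the opposite sense of the λ used in the Coursera exercise); logreg above used C=1e5, i.e. almost no regularization, while the default is C=1.0. A minimal sketch of how C affects training accuracy on ex2data2 (reusing X_ex2data2 and Y_ex2data2 from above):
In [ ]:
for C in (0.01, 1.0, 1e5):
    clf = linear_model.LogisticRegression(C=C)
    clf.fit(X_ex2data2, Y_ex2data2)
    print("C = %g: training accuracy = %.3f" % (C, clf.score(X_ex2data2, Y_ex2data2)))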
In [26]:
logreg2 = linear_model.LogisticRegression()
In [28]:
logreg2.fit(X_ex2data2,Y_ex2data2)
Out[28]:
In [31]:
xx_ex2data2, yy_ex2data2 = trainingdat2mesh(X_ex2data2, h=0.02)
Z_ex2data2 = logreg2.predict(np.c_[xx_ex2data2.ravel(), yy_ex2data2.ravel()])
In [32]:
Z_ex2data2 = Z_ex2data2.reshape(xx_ex2data2.shape)
plt.figure(3)
plt.pcolormesh(xx_ex2data2,yy_ex2data2,Z_ex2data2)
plt.scatter(X_ex2data2[:, 0], X_ex2data2[:, 1], c=Y_ex2data2, edgecolors='k')
plt.show()
As one can see, the "dataset cannot be separated into positive and negative examples by a straight line through the plot" (cf. ex2.pdf).
We're going to need to map the features onto polynomial terms.
Use this code: cf. the Underfitting vs. Overfitting example in the scikit-learn documentation.
In [33]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
In [34]:
polynomial_features = PolynomialFeatures(degree=6,include_bias=False)
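PolynomialFeatures(degree=6, include_bias=False) maps the two original features onto every monomial of degree 1 through 6 in those features, i.e. x1, x2, x1^2, x1*x2, x2^2, ..., x2^6, which gives 27 columns. A quick sketch to confirm the expanded width (reusing X_ex2data2 from above):
In [ ]:
print("Expanded feature shape: %s" % (polynomial_features.fit_transform(X_ex2data2[:1]).shape,))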
In [35]:
pipeline = Pipeline([("polynomial_features", polynomial_features),("logistic_regression",logreg2)])
In [36]:
pipeline.fit(X_ex2data2,Y_ex2data2)
Out[36]:
In [37]:
Z_ex2data2 = pipeline.predict(np.c_[xx_ex2data2.ravel(),yy_ex2data2.ravel()])
In [38]:
Z_ex2data2 = Z_ex2data2.reshape(xx_ex2data2.shape)
plt.figure(3)
plt.pcolormesh(xx_ex2data2,yy_ex2data2,Z_ex2data2)
plt.scatter(X_ex2data2[:, 0], X_ex2data2[:, 1], c=Y_ex2data2, edgecolors='k')
plt.show()
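To quantify how well the degree-6 model fits the training data, the fitted pipeline exposes the same score (mean training accuracy) and predict_proba methods as the bare classifier; a quick sketch:
In [ ]:
print("Training accuracy of the degree-6 pipeline: %.3f" % pipeline.score(X_ex2data2, Y_ex2data2))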
In [ ]: