Let's take a look at the examples in scikit-learn's LogisticRegression documentation.
The Logistic Regression 3-class Classifier example is credited to
Code source: Gaël Varoquaux
Modified for documentation by Jaques Grobler
License: BSD 3 clause
In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
In [5]:
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # take the first two features. # EY : 20160503 type(X) is numpy.ndarray
Y = iris.target # EY : 20160503 type(Y) is numpy.ndarray
In [9]:
h = .02 # step size in the mesh
In [18]:
print "X shape: %s, Y shape: %s" % X.shape, Y.shape
In [5]:
logreg = linear_model.LogisticRegression(C=1e5)
In [20]:
# we create an instance of the LogisticRegression classifier and fit the data.
logreg.fit(X,Y)
Out[20]:
In [24]:
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max]
x_min, x_max = X[:,0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:,1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])
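To make the grid-prediction pattern above a little more explicit: np.c_[xx.ravel(), yy.ravel()] flattens the two meshgrid arrays and column-stacks them into a single (n_points, 2) array of coordinates, so predict returns one label per grid point. A quick sanity check of the shapes (a sketch, reusing xx and yy from above):
In [ ]:
grid_points = np.c_[xx.ravel(), yy.ravel()]  # one row per mesh point: (x, y)
print("mesh shape: %s, grid points shape: %s" % (xx.shape, grid_points.shape))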
In [29]:
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4,3))
plt.pcolormesh(xx,yy,Z, cmap=plt.cm.Paired)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.show()
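As a quick check of how well this two-feature model fits the iris training data, one can ask for the mean accuracy; a sketch using score, the standard scikit-learn classifier method:
In [ ]:
print("Training accuracy on the two iris features: %.3f" % logreg.score(X, Y))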
In [37]:
import os
print(os.getcwd())
print(os.path.abspath("./")) # find out "where you are" and "where the Data folder is" with these commands
Let's load the data for Exercise 2 of the Machine Learning course taught by Andrew Ng on Coursera.
In [3]:
ex2data1 = np.loadtxt("./Data/ex2data1.txt", delimiter=',') # you may have to change this path if you're running this from a different directory
ex2data2 = np.loadtxt("./Data/ex2data2.txt",delimiter=',')
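A quick look at what was loaded (a sketch): each row holds the two feature columns followed by the 0/1 label, which is why the slicing below splits off the first two columns as X and the last column as Y.
In [ ]:
print("ex2data1 shape: %s, ex2data2 shape: %s" % (ex2data1.shape, ex2data2.shape))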
In [27]:
X_ex2data1 = ex2data1[:,0:2]
Y_ex2data1 = ex2data1[:,2]
X_ex2data2 = ex2data2[:,:2]
Y_ex2data2 = ex2data2[:,2]
In [6]:
logreg.fit(X_ex2data1,Y_ex2data1)
Out[6]:
In [7]:
def trainingdat2mesh(X, marginsize=.5, h=0.2):
    """Build a mesh grid covering the range of each feature, padded by marginsize."""
    rows, features = X.shape
    ranges = []
    for feature in range(features):
        minrange = X[:, feature].min() - marginsize
        maxrange = X[:, feature].max() + marginsize
        ranges.append((minrange, maxrange))
    if len(ranges) == 2:
        xx, yy = np.meshgrid(np.arange(ranges[0][0], ranges[0][1], h), np.arange(ranges[1][0], ranges[1][1], h))
        return xx, yy
    else:
        return ranges
In [8]:
xx_ex2data1, yy_ex2data1 = trainingdat2mesh(X_ex2data1,h=0.2)
In [10]:
Z_ex2data1 = logreg.predict(np.c_[xx_ex2data1.ravel(),yy_ex2data1.ravel()])
In [12]:
Z_ex2data1 = Z_ex2data1.reshape(xx_ex2data1.shape)
plt.figure(2)
plt.pcolormesh(xx_ex2data1,yy_ex2data1,Z_ex2data1)
plt.scatter(X_ex2data1[:, 0], X_ex2data1[:, 1], c=Y_ex2data1, edgecolors='k')
plt.show()
Get the probability estimates; say a student has an Exam 1 score of 45 and an Exam 2 score of 85.
In [22]:
proba = logreg.predict_proba(np.array([[45, 85]])).flatten()
print("The student has a probability of no admission of %s and a probability of admission of %s" % tuple(proba))
Let's change the regularization strength via the C parameter of LogisticRegression. Call this new classifier logreg2.
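In scikit-learn, C is the inverse of the regularization strength, so smaller C means stronger regularization (the opposite sense of the λ used in the Coursera exercise); logreg above used C=1e5, i.e. almost no regularization, while the default is C=1.0. A minimal sketch of how C affects training accuracy on ex2data2 (reusing X_ex2data2 and Y_ex2data2 from above):
In [ ]:
for C in (0.01, 1.0, 1e5):
    clf = linear_model.LogisticRegression(C=C)
    clf.fit(X_ex2data2, Y_ex2data2)
    print("C = %g: training accuracy = %.3f" % (C, clf.score(X_ex2data2, Y_ex2data2)))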
In [26]:
logreg2 = linear_model.LogisticRegression()
In [28]:
logreg2.fit(X_ex2data2,Y_ex2data2)
Out[28]:
In [31]:
xx_ex2data2, yy_ex2data2 = trainingdat2mesh(X_ex2data2, h=0.02)
Z_ex2data2 = logreg2.predict(np.c_[xx_ex2data2.ravel(), yy_ex2data2.ravel()])
In [32]:
Z_ex2data2 = Z_ex2data2.reshape(xx_ex2data2.shape)
plt.figure(3)
plt.pcolormesh(xx_ex2data2,yy_ex2data2,Z_ex2data2)
plt.scatter(X_ex2data2[:, 0], X_ex2data2[:, 1], c=Y_ex2data2, edgecolors='k')
plt.show()
As one can see, the "dataset cannot be separated into positive and negative examples by a straight line through the plot" (cf. ex2.pdf).
We're going to need to map the features onto polynomial terms.
Use this code: cf. the Underfitting vs. Overfitting example in the scikit-learn documentation.
In [33]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
In [34]:
polynomial_features = PolynomialFeatures(degree=6,include_bias=False)
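PolynomialFeatures(degree=6, include_bias=False) maps the two original features onto every monomial of degree 1 through 6 in those features, i.e. x1, x2, x1^2, x1*x2, x2^2, ..., x2^6, which gives 27 columns. A quick sketch to confirm the expanded width (reusing X_ex2data2 from above):
In [ ]:
print("Expanded feature shape: %s" % (polynomial_features.fit_transform(X_ex2data2[:1]).shape,))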
In [35]:
pipeline = Pipeline([("polynomial_features", polynomial_features),("logistic_regression",logreg2)])
In [36]:
pipeline.fit(X_ex2data2,Y_ex2data2)
Out[36]:
In [37]:
Z_ex2data2 = pipeline.predict(np.c_[xx_ex2data2.ravel(),yy_ex2data2.ravel()])
In [38]:
Z_ex2data2 = Z_ex2data2.reshape(xx_ex2data2.shape)
plt.figure(3)
plt.pcolormesh(xx_ex2data2,yy_ex2data2,Z_ex2data2)
plt.scatter(X_ex2data2[:, 0], X_ex2data2[:, 1], c=Y_ex2data2, edgecolors='k')
plt.show()
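To quantify how well the degree-6 model fits the training data, the fitted pipeline exposes the same score (mean training accuracy) and predict_proba methods as the bare classifier; a quick sketch:
In [ ]:
print("Training accuracy of the degree-6 pipeline: %.3f" % pipeline.score(X_ex2data2, Y_ex2data2))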
In [ ]: