Suppose you are the product manager at a fabrication plant and you have the test results for some microchips from two different tests. From these two tests, you would like to determine whether the microchips should be accepted or rejected. To help you make the decision, you have a dataset of test results on past microchips, from which you can build a logistic regression model.
In [532]:
import numpy as np
import scipy as sp
from scipy import special
import scipy.optimize as op
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# import data
data = pd.read_csv('mlclass-ex2/ex2data2.txt', header=None)
In [533]:
data.head(1)
Out[533]:
(table output omitted: the first row shows columns 0 and 1, the two test scores, and column 2, the 0/1 acceptance label)
In [534]:
# separate classes
accepted = data[data[2] == 1]
rejected = data[data[2] == 0]
# visualize the training data
plt.xlabel('Microchip Test 1')
plt.ylabel('Microchip Test 2')
plt.scatter(accepted[0], accepted[1],
            marker='+', color='b', label="y = 1")
plt.scatter(rejected[0], rejected[1],
            marker='o', color='y', label="y = 0")
plt.legend(loc=1)
plt.show()
In [535]:
# initial data vectors
x_1 = data[0].values
x_2 = data[1].values
y = data[2].values
# training sample size
m = data[0].count()
# proper vector shapes
x_1.shape = (m,1)
x_2.shape = (m,1)
y.shape = (m,1)
# design matrix initialized with 1st column a 1's vector
X = np.ones((m,1))
# feature mapping: for each power k up to degree, add x1^k, x2^k, (x1*x2)^k,
# and for k > 1 also x1^k * x2 and x2^k * x1
degree = 6
for x in xrange(1,(degree+1)):
    X = np.hstack((X, x_1**x))
    X = np.hstack((X, x_2**x))
    X = np.hstack((X, (x_1**x * x_2**x)))
    if (x != 1):
        X = np.hstack((X, (x_1**x * x_2)))
        X = np.hstack((X, (x_2**x * x_1)))
# number of features
n = (X.shape[1] - 1)
# Feature Scaling: mean normalization of every feature column (the intercept column 0 is skipped)
for i in np.arange(1,(n+1)):
    x_i = X[:,i]
    xbar = np.mean(x_i)
    s = np.std(x_i)
    X[:,i] = np.true_divide((np.subtract(x_i,xbar)),s)
# theta parameters
theta = np.zeros(((n+1),1))
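One caveat, noted as an aside: the loop above normalizes the training columns in place but discards the column means and standard deviations, while `feature_map` further below builds unscaled features for the boundary grid, so a grid point is not transformed the same way as a training row. A minimal sketch of a scaler that keeps its statistics, under the same design-matrix layout (the function name `normalize_features` and the variables `mu`/`sigma` are illustrative additions, not part of the original code):
def normalize_features(X):
    # return a scaled copy of the design matrix plus the statistics
    # needed to transform new feature vectors the same way
    X_scaled = X.copy()
    mu = np.mean(X[:, 1:], axis=0)      # per-column means (intercept column excluded)
    sigma = np.std(X[:, 1:], axis=0)    # per-column standard deviations
    X_scaled[:, 1:] = (X_scaled[:, 1:] - mu) / sigma
    return X_scaled, mu, sigma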
In [536]:
# hypothesis, sigmoid function
def g(z):
    return special.expit(z)
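A quick sanity check on the sigmoid, added as an aside: $g(0) = 0.5$, and the function saturates toward 1 and 0 for large positive and negative inputs.
# sigmoid sanity check: expect 0.5, then values very close to 1 and 0
print g(0), g(10), g(-10)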
$J(\theta) = \frac{1}{m}\sum\limits_{i=0}^{m-1}[-y^{(i)}\log{(h_{\theta}(x^{(i)}))} - (1-y^{(i)})\log{(1 - h_{\theta}(x^{(i)}))}] + \frac{\lambda}{2m} \sum\limits_{j=1}^{n} \theta_{j}^{2}$
$J(\theta) = \frac{1}{m}\big[-\log(g(X\theta))^{T}y - \log(1 - g(X\theta))^{T}(1 - y)\big] + \frac{\lambda}{2m} \sum\limits_{j=1}^{n} \theta_{j}^{2}$
Do not regularize the parameter $\theta_{0}$.
In [537]:
#lambda_reg = 0
#lambda_reg = 0.5
#lambda_reg = 1
lambda_reg = 1.5
#lambda_reg = 2
#lambda_reg = 5
#lambda_reg = 100
def J(theta):
    # equivalent loop form, kept for reference:
    #summation = 0
    #for i in xrange(0,m):
    #    summation += ((-y[i]) * (sp.log(g(X[i].dot(theta))))) - ((1 - y[i]) * (sp.log(1 - g(X[i].dot(theta)))))
    #return (1.0/m) * summation + ((lambda_reg/(2.0*m)) * (np.sum(theta[1:(n+1)]**2)))
    # vectorized regularized cost; theta_0 is excluded from the penalty via theta[1:(n+1)]
    return ( ((1.0/m) * ((sp.log(g(X.dot(theta))).T.dot(-y)) - (sp.log(1.0 - g(X.dot(theta))).T).dot((1.0 - y)))) + ((lambda_reg/(2.0*m)) * (np.sum(theta[1:(n+1)]**2))) )
#J(theta)
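As a quick sanity check (not part of the original notebook): with $\theta = 0$ every prediction is $g(0) = 0.5$, the penalty term vanishes, and the cost reduces to $\ln{2} \approx 0.693$ regardless of the labels.
# cost at the zero vector: prints a 1x1 array close to ln(2) ~ 0.6931
print J(np.zeros(((n+1),1)))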
The gradient of the cost is a vector of the same length as $\theta$, where the $j$-th element (for $j = 0, 1, \ldots, n$) is defined as:
$\frac{\partial J(\theta)}{\partial\theta_{0}} = \frac{1}{m}\sum\limits_{i=0}^{m-1}(h_{\theta}(x^{(i)}) - y^{(i)}) \cdot x_{j}^{(i)}$ for $j = 0$
$\frac{\partial J(\theta)}{\partial\theta_{j}} = \big(\frac{1}{m}\sum\limits_{i=0}^{m-1}(h_{\theta}(x^{(i)}) - y^{(i)}) \cdot x_{j}^{(i)}\big) + \frac{\lambda}{m} \theta_{j}$ for $j \geq 1$
$\frac{\partial}{\partial\theta_{j}} J(\theta) = \frac{\partial}{\partial\theta_{j}} \Big(\frac{1}{m}\sum\limits_{i=0}^{m-1}\big[-y^{(i)}\log{h_{\theta}(x^{(i)})} - (1 - y^{(i)})\log{(1 - h_{\theta}(x^{(i)}))}\big]\Big)$ (regularization term omitted here)
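Filling in the intermediate step (standard calculus, not spelled out here in the original): using $g'(z) = g(z)(1 - g(z))$ and $h_{\theta}(x) = g(\theta^{T}x)$, the bracketed term differentiates to
$\frac{\partial}{\partial\theta_{j}}\big[-y\log{h_{\theta}(x)} - (1-y)\log{(1-h_{\theta}(x))}\big] = \big(-\frac{y}{h_{\theta}(x)} + \frac{1-y}{1-h_{\theta}(x)}\big)\,h_{\theta}(x)(1-h_{\theta}(x))\,x_{j} = (h_{\theta}(x) - y)\,x_{j}$
Averaging over the $m$ training examples and adding $\frac{\lambda}{m}\theta_{j}$ for $j \geq 1$ gives the gradient components above.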
In [538]:
# delta-vector function for the partial derivatives (regularized gradient; theta_0 is not penalized)
def deltas(theta):
    delta = np.zeros(((n+1),1))
    for j in xrange(0,(n+1)):
        summation = 0
        for i in xrange(0,m):
            summation += ((g(theta.T.dot(X[i])) - y[i]) * X[i][j])
        if j == 0:
            delta[j] = (1.0/m) * summation
        else:
            delta[j] = ( (1.0/m) * summation ) + ((lambda_reg/(1.0*m)) * theta[j])
    return delta.flatten()
#deltas(theta)
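To gain confidence that `deltas` is consistent with `J`, a finite-difference check can be run at a small random point (an added sketch; `eps`, `theta_test`, and `num_grad` are illustrative names, not part of the original code):
# numerical gradient check: compare deltas() against central differences of J()
eps = 1e-4
theta_test = 0.1 * np.random.randn(n+1)
num_grad = np.zeros(n+1)
for j in xrange(0,(n+1)):
    e = np.zeros(n+1)
    e[j] = eps
    num_grad[j] = float(J(theta_test + e) - J(theta_test - e)) / (2.0 * eps)
# the maximum difference should be very small (on the order of 1e-6 or less)
print np.max(np.abs(num_grad - deltas(theta_test)))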
In [539]:
# gradient provided
solution = op.minimize(J, theta, jac=deltas)
#print "\nSOLUTION (gradient provided)"
#print solution
# final values
theta = solution.x
print "\nFinal theta parameters: " + str(theta)
# keep the minimized cost under its own name so the cost function J() is not shadowed
final_cost = solution.fun
print "\nFinal J cost: " + str(final_cost)
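As an added sketch (not in the original), the training-set fit can be summarized by thresholding the fitted probabilities at 0.5 and comparing against the labels:
# training-set accuracy: predict 1 where the fitted probability is at least 0.5
p = (g(X.dot(theta)) >= 0.5).astype(int).reshape(-1, 1)
print "Training accuracy: " + str(np.mean(p == y) * 100) + "%"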
In [540]:
# decision boundary
def feature_map(x_1, x_2):
    # feature vector for a single point, 1st entry the intercept term,
    # built with the same mapping as the training design matrix
    X = [1]
    degree = 6
    for x in xrange(1,(degree+1)):
        X.append(x_1**x)
        X.append(x_2**x)
        X.append(x_1**x * x_2**x)
        if (x != 1):
            X.append(x_1**x * x_2)
            X.append(x_2**x * x_1)
    return X
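A quick consistency check worth adding here: the vector returned by `feature_map` should have as many entries as the design matrix `X` has columns, so the dot product with `theta` in the next cell is well defined.
# the mapped feature vector and the training design matrix should agree on length
print len(feature_map(0.5, 0.5)), X.shape[1]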
In [541]:
u = np.linspace(-1, 1.5, 50)
v = np.linspace(-1, 1.5, 50)
z = np.zeros((len(u),len(v)))
for i in range(len(u)):
    for j in range(len(v)):
        z[i,j] = np.dot(feature_map(u[i], v[j]),theta)
# transpose so rows index v and columns index u, as plt.contour expects
z = z.T
In [542]:
# separate classes
accepted = data[data[2] == 1]
rejected = data[data[2] == 0]
# visualize the training data
plt.xlabel('Microchip Test 1')
plt.ylabel('Microchip Test 2')
plt.scatter(accepted[0], accepted[1],
            marker='+', color='b', label="y = 1")
plt.scatter(rejected[0], rejected[1],
            marker='o', color='y', label="y = 0")
# draw a level curve of the mapped-feature score as the decision boundary
plt.contour(u,v,z,[-0.999])
plt.legend(loc=1)
plt.show()
In [543]:
from mpl_toolkits.mplot3d import axes3d
import matplotlib.cm as cm
# separate classes
accepted = data[data[2] == 1]
rejected = data[data[2] == 0]
# predictions
predictions = list(g(X.dot(theta)))
y_hat = np.empty([m, 1])
i = 0
for p in predictions:
    if round(p,1) >= 0.5:
        y_hat[i] = 1
    else:
        y_hat[i] = 0
    i += 1
# visualize the training data
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(accepted[0], accepted[1],
           marker='+', color='b', label="y = 1")
ax.scatter(rejected[0], rejected[1],
           marker='o', color='y', label="y = 0")
#step = 0.025
#x = np.arange(np.min(data[0]), np.max(data[0]), step)
#y = np.arange(np.min(data[1]), np.max(data[1]), step)
#x, y = np.meshgrid(x,y)
#print x.shape, y.shape
#print X.shape
#Z
#X, Y = np.meshgrid(x,y)
#Z = np.sqrt((X**2 + Y**2))
#ax.contour(X, Y, Z, cmap=cm.coolwarm)
plt.contour(u,v,z,[-0.999])
plt.show()