In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# import data
data = pd.read_csv('mlclass-ex2/ex2data1.txt', header=None)

Visualize the Data


In [2]:
# separate classes
negative = data[data[2] == 0]
positive = data[data[2] == 1]

# visualize the training data
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.scatter(positive[0],positive[1], 
            marker='+', color='b', label="admitted")
plt.scatter(negative[0],negative[1], 
            marker='o', color='y', label="rejected")
plt.legend(loc=1)
plt.show()


Implementation

logistic regression hypothesis

$h_{\theta}(x) = g(\theta^{T}x)$

where $g$ is the sigmoid function, defined as: $g(z) = \frac{1}{1+e^{-z}}$


In [3]:
# hypothesis: the sigmoid function g(z) = 1 / (1 + e^(-z))
from scipy.special import expit

def g(z):
    # expit is a numerically stable equivalent of 1.0 / (1.0 + np.exp(-z))
    return expit(z)

interpretation

$h_{\theta}(x) = P(y = 1 \mid x; \theta)$

$P(y = 0 \mid x; \theta) = 1 - P(y = 1 \mid x; \theta)$

$h_{\theta}(x) \geq 0.5 \rightarrow y = 1$, when $\theta^{T}x \geq 0$

$h_{\theta}(x) < 0.5 \rightarrow y = 0$, when $\theta^{T}x < 0$

$g(z) \geq 0.5$ when $z \geq 0$
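
In code, this decision rule is just a threshold on the hypothesis at 0.5 (equivalently on $\theta^{T}x$ at 0). A minimal sketch, assuming the g defined above and some fitted parameter vector theta; predict is a hypothetical helper, not part of the original exercise:

# hypothetical helper: classify a feature vector x (with x_0 = 1 prepended)
# by thresholding the hypothesis at 0.5, i.e. at theta^T x >= 0
def predict(theta, x):
    return 1 if g(theta.T.dot(x)) >= 0.5 else 0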

data and parameters


In [15]:
# initial data vectors
x_1 = data[0].values
x_2 = data[1].values
y = data[2].values

# training sample size
m = data[0].count()

# number of features
n = 2

# proper vector shapes
x_1.shape = (m,1)
x_2.shape = (m,1)
y.shape = (m,1)

# Feature Scaling
# mean normalization (necessary for minimization not to return NaNs)
xbar_1 = np.mean(x_1)
xbar_2 = np.mean(x_2)

s_1 = np.std(x_1)
s_2 = np.std(x_2)

x_1 = np.true_divide((np.subtract(x_1,xbar_1)),s_1)
x_2 = np.true_divide((np.subtract(x_2,xbar_2)),s_2)

# design matrix
X = np.hstack((np.ones((m,1)),x_1,x_2))

# theta parameters 
theta = np.zeros(((n+1),1))

cost function for logistic regression

$J(\theta) = \frac{1}{m}\sum\limits_{i=0}^{m-1}[-y^{(i)}\log(h_{\theta}(x^{(i)})) - (1 - y^{(i)})\log(1 - h_{\theta}(x^{(i)}))]$

vectorized implementation

$J(\theta) = -\frac{1}{m}[\log(g(X\theta))^{T}y + \log(1 - g(X\theta))^{T}(1 - y)]$


In [5]:
# vectorized cost function
def J(theta):
    h = g(X.dot(theta))
    return (-1.0/m) * ( (np.log(h)).T.dot(y) + (np.log(1.0 - h)).T.dot(1.0 - y) )

gradient of the cost

a vector of the same length as $\theta$, where the $j$-th element (for $j = 0, 1, \ldots, n$) is defined as:
$\frac{\partial J(\theta)}{\partial\theta_{j}} = \frac{1}{m}\sum\limits_{i=0}^{m-1}(h_{\theta}(x^{(i)}) - y^{(i)}) \cdot x_{j}^{(i)}$

$\frac{\partial}{\partial\theta_{j}} J(\theta) = \frac{\partial}{\partial\theta_{j}} \big(-\frac{1}{m}\big[\sum\limits_{i=0}^{m-1}y^{(i)}\log{h_{\theta}(x^{(i)})} + (1 - y^{(i)})\log{(1 - h_{\theta}(x^{(i)}))}\big]\big)$

vectorized implementation

gradient vector:
$\delta = \begin{bmatrix} \delta_{0} \\ \delta_{1} \\ \delta_{2} \\ \vdots \\ \delta_{n} \end{bmatrix}$

$\delta_{j} = \frac{1}{m}\sum\limits_{i=0}^{m-1}[g(\theta^{T}X^{(i)}) - y^{(i)}] \cdot X_{j}^{(i)}$


In [6]:
# delta-vector function for derivatives
def deltas(theta):
    delta = np.zeros(((n+1),1))
    for j in xrange(0,(n+1)):
        summation = 0
        for i in xrange(0,m):
            summation += ((g(theta.T.dot(X[i])) - y[i]) * X[i][j])
        
        delta[j] = (1.0/m) * summation
        
    return delta.flatten()
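
The loops above follow the summation literally. The same gradient can be written fully vectorized as $\delta = \frac{1}{m}X^{T}(g(X\theta) - y)$; a sketch of that equivalent form, assuming the X, y, m, n and g defined above (deltas_vectorized is not part of the original notebook):

# vectorized gradient: delta = (1/m) * X^T (g(X theta) - y)
def deltas_vectorized(theta):
    theta = theta.reshape(((n+1),1))   # ensure a column vector
    errors = g(X.dot(theta)) - y       # shape (m,1)
    return ((1.0/m) * X.T.dot(errors)).flatten()

Passing jac=deltas_vectorized to op.minimize below should give the same solution as jac=deltas, without the Python-level loops.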

Broyden-Fletcher-Goldfarb-Shanno algorithm


In [7]:
# BFGS (op.minimize defaults to BFGS for an unconstrained problem; made explicit here)
import scipy.optimize as op

# gradient estimated numerically by the solver
solution = op.minimize(J, theta, method='BFGS')
print "\nSOLUTION (gradient estimated)"
print solution

# reset theta before the second run
theta = np.zeros(((n+1),1))

# analytic gradient provided via the deltas function
solution = op.minimize(J, theta, method='BFGS', jac=deltas)
print "\nSOLUTION (gradient provided)"
print solution


SOLUTION (gradient estimated)
   status: 0
  success: True
     njev: 20
     nfev: 100
 hess_inv: array([[ 29.40694069,  32.30972939,  30.69199453],
       [ 32.30972939,  81.74339359,  68.18432055],
       [ 30.69199453,  68.18432055,  71.86286667]])
      fun: 0.2034977023502629
        x: array([ 1.71835754,  3.99274776,  3.72493591])
  message: 'Optimization terminated successfully.'
      jac: array([ -1.70059502e-06,   7.15814531e-06,  -8.22544098e-06])

SOLUTION (gradient provided)
   status: 0
  success: True
     njev: 20
     nfev: 20
 hess_inv: array([[ 29.40326034,  32.30420959,  30.68662265],
       [ 32.30420959,  81.73684341,  68.17882459],
       [ 30.68662265,  68.17882459,  71.85824921]])
      fun: 0.2034977023511548
        x: array([ 1.71835728,  3.99274735,  3.7249355 ])
  message: 'Optimization terminated successfully.'
      jac: array([ -1.70629953e-06,   7.15289452e-06,  -8.22913014e-06])

In [8]:
# final values
theta = solution.x
print "\nFinal theta parameters: " + str(theta)

# store the minimized cost under a new name so the cost function J() is not shadowed
J_final = solution.fun
print "\nFinal J cost: " + str(J_final)


Final theta parameters: [ 1.71835728  3.99274735  3.7249355 ]

Final J cost: 0.203497702351

In [9]:
# plot the hypothesis g(X.theta) against X.theta for each training example;
# with the final theta parameters the points trace out the sigmoid curve
plt.scatter(X.dot(theta),g(X.dot(theta)))
plt.show()


Plot the Decision Boundary

$g(z) \geq 0.5$ when $z \geq 0$,
thus $g(\theta^{T}x) \geq 0.5$ whenever $\theta^{T}x \geq 0$

decision boundary equation

$\theta^{T}x = \theta_{0}x_{0} + \theta_{1}x_{1} + \theta_{2}x_{2} = 0$

with $x_{0} = 1$; picking values for $x_{1}$ and solving for $x_{2}$:
$\theta_{0} + \theta_{1}x_{1} + \theta_{2}x_{2} = 0$
$\theta_{1}x_{1} + \theta_{2}x_{2} = -\theta_{0}$
$\theta_{2}x_{2} = -\theta_{1}x_{1} - \theta_{0}$
$x_{2} = \frac{1}{\theta_{2}} (-\theta_{1}x_{1} - \theta_{0})$
$x_{2} = -\frac{1}{\theta_{2}} (\theta_{1}x_{1} + \theta_{0})$


In [10]:
# visualize the training data (in normalized feature space) with the decision boundary
plt.xlabel('Exam 1 score (normalized)')
plt.ylabel('Exam 2 score (normalized)')
for i in xrange(0,m):
    if (y[i] == 1):
        plt.scatter(X[i,1],X[i,2], marker='+', color='b')
    else:
        plt.scatter(X[i,1],X[i,2], marker='o', color='y')
        
x1_dec_bound = np.array([[np.min(X[:,1])], [np.max(X[:,1])]])
x2_dec_bound = (-1.0/theta[2])*(theta[1]*x1_dec_bound + theta[0])
plt.plot(x1_dec_bound, x2_dec_bound, label="dec. bound.", color='r')

plt.legend(loc=1)
plt.show()


Make a Single Prediction


In [11]:
# predict admission of student
x = np.array([[1.0],
              [45],
              [85]])

# mean normalize with the training-set statistics (theta was fitted on scaled features)
x[1] = np.true_divide((np.subtract(x[1],xbar_1)),s_1)
x[2] = np.true_divide((np.subtract(x[2],xbar_2)),s_2)

# compute and display prediction
y_hat = g(theta.T.dot(x))
print "\nAdmission probability if exam1=45 and exam2=85: " + str(y_hat) + "\n"


Admission probability if exam1=45 and exam2=85: [ 0.77624668]
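
The same three steps (prepend the intercept, scale with the training-set mean and standard deviation, apply the hypothesis) can be wrapped into a small helper for arbitrary raw scores. A sketch under the assumptions of this notebook; predict_admission is a hypothetical name, not part of the original exercise:

# hypothetical helper: admission probability for raw (unscaled) exam scores,
# reusing the training statistics xbar_1, xbar_2, s_1, s_2 and the fitted theta
def predict_admission(exam1_score, exam2_score):
    x = np.array([1.0,
                  (exam1_score - xbar_1) / s_1,
                  (exam2_score - xbar_2) / s_2])
    return g(theta.dot(x))

For example, predict_admission(45, 85) should reproduce the probability printed above.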

Test Model (Fitted Parameters) Using Training Set

Correct Predictions


In [12]:
# test/count correct predictions
correct_predictions = 0
for i in xrange(0,m):
    if (np.round(g(theta.T.dot(X[i]))) == y[i]):
        correct_predictions += 1

print "\nTotal training examples: " + str(m)
print "\nTotal correct predictions: " + str(correct_predictions) + " (" + str((correct_predictions * 1.0/m)*100) + "% accurate)"


Total training examples: 100

Total correct predictions: 89 (89.0% accurate)
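
The counting loop above can also be expressed as one vectorized check over the whole design matrix; a minimal sketch of that equivalent computation, assuming the X, y, theta and g defined above:

# vectorized accuracy: fraction of examples where the rounded hypothesis matches the label
predictions = np.round(g(X.dot(theta)))
accuracy = np.mean(predictions == y.flatten())
print "Training accuracy: " + str(accuracy * 100) + "%"

This should report the same 89% accuracy as the loop.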

Show Incorrect Predictions


In [13]:
# get original exam scores
exam1 = data[0].values
exam2 = data[1].values

# display incorrect predictions
for i in xrange(0,m):
    if (np.round(g(theta.T.dot(X[i]))) != y[i]):
        print "Exam1: " + str(exam1[i]) + ", Exam2: " + str(exam2[i])
        plt.scatter(exam1[i],exam2[i], color='r')
        
# visualize
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.show()


Exam1: 75.0247455674, Exam2: 46.5540135412
Exam1: 95.8615550709, Exam2: 38.225278058
Exam1: 69.0701440628, Exam2: 52.7404697302
Exam1: 93.1143887974, Exam2: 38.8006703371
Exam1: 52.0454047683, Exam2: 69.4328601205
Exam1: 33.9155001091, Exam2: 98.8694357422
Exam1: 82.3687537571, Exam2: 40.6182551597
Exam1: 32.5772001681, Exam2: 95.5985476139
Exam1: 82.2266615779, Exam2: 42.7198785372
Exam1: 57.2387063157, Exam2: 59.5142819801
Exam1: 55.34001756, Exam2: 64.9319380069