Suppose that you are the administrator of a university department and you want to determine each applicant's chance of admission based on their results on two exams. You have historical data from previous applicants that you use as a training set for logistic regression. For each training example, you have the applicant's scores on two exams and the admissions decision.
Your task is to build a classification model that estimates an applicant's probability of admission based on the scores from those two exams.
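The model used below is logistic regression; for reference, the hypothesis the code implements is the sigmoid of a linear combination of the two exam scores:
$h_{\theta}(x) = g(\theta^{T}x) = \frac{1}{1 + e^{-\theta^{T}x}}$, interpreted as $P(y = 1 \mid x; \theta)$, the probability of admission.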
In [1]:
import numpy as np
import scipy as sp
import scipy.special  # explicitly import the submodule so sp.special.expit is available below
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# import data
data = pd.read_csv('mlclass-ex2/ex2data1.txt', header=None)
In [2]:
# separate classes
negative = data[data[2] == 0]
positive = data[data[2] == 1]
# visualize the training data
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.scatter(positive[0], positive[1],
            marker='+', color='b', label="admitted")
plt.scatter(negative[0], negative[1],
            marker='o', color='y', label="rejected")
plt.legend(loc=1)
plt.show()
In [3]:
# hypothesis, sigmoid function
def g(z):
    # return np.true_divide((1.0),(1.0 + np.exp(-z)))
    return sp.special.expit(z)
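A quick sanity check (not part of the original notebook) that the sigmoid behaves as expected:
# g(0) should be exactly 0.5; large positive inputs approach 1, large negative inputs approach 0
print g(0)                              # 0.5
print g(np.array([-10.0, 0.0, 10.0]))   # roughly [0.  0.5  1.]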
In [15]:
# initial data vectors
x_1 = data[0].values
x_2 = data[1].values
y = data[2].values
# training sample size
m = data[0].count()
# number of features
n = 2
# proper vector shapes
x_1.shape = (m,1)
x_2.shape = (m,1)
y.shape = (m,1)
# Feature Scaling
# mean normalization (necessary for minimization not to return NaNs)
xbar_1 = np.mean(x_1)
xbar_2 = np.mean(x_2)
s_1 = np.std(x_1)
s_2 = np.std(x_2)
x_1 = np.true_divide((np.subtract(x_1,xbar_1)),s_1)
x_2 = np.true_divide((np.subtract(x_2,xbar_2)),s_2)
# design matrix
X = np.hstack((np.ones((m,1)),x_1,x_2))
# theta parameters
theta = np.zeros(((n+1),1))
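A small verification (added here, not in the original) that the scaling and the design matrix came out as intended:
# each scaled feature column should have (approximately) zero mean and unit standard deviation,
# and the design matrix should have an all-ones intercept column prepended
print np.mean(X[:,1]), np.std(X[:,1])   # ~0.0, ~1.0
print np.mean(X[:,2]), np.std(X[:,2])   # ~0.0, ~1.0
print X.shape                           # (m, 3)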
In [5]:
# cost function
def J(theta):
    return (-1.0/m) * ( ((np.log(g(X.dot(theta)))).T.dot(y)) + (np.log(1.0 - g(X.dot(theta)))).T.dot(1.0 - y) )
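For reference, the cost the code above implements is the (unregularized) logistic regression cost from the exercise:
$J(\theta) = -\frac{1}{m}\big[\sum\limits_{i=0}^{m-1}y^{(i)}\log{h_{\theta}(x^{(i)})} + (1 - y^{(i)})\log{(1 - h_{\theta}(x^{(i)}))}\big]$, with $h_{\theta}(x) = g(\theta^{T}x)$.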
$\frac{\partial}{\partial\theta_{j}} J(\theta) = \frac{\partial}{\partial\theta_{j}} \big(-\frac{1}{m}\big[\sum\limits_{i=0}^{m-1}y^{(i)}\log{h_{\theta}(x^{(i)})} + (1 - y^{(i)})\log{(1 - h_{\theta}(x^{(i)}))}\big]\big) = \frac{1}{m}\sum\limits_{i=0}^{m-1}\big(h_{\theta}(x^{(i)}) - y^{(i)}\big)x_{j}^{(i)}$
In [6]:
# delta-vector function for derivatives
def deltas(theta):
    delta = np.zeros(((n+1),1))
    for j in xrange(0,(n+1)):
        summation = 0
        for i in xrange(0,m):
            summation += ((g(theta.T.dot(X[i])) - y[i]) * X[i][j])
        delta[j] = (1.0/m) * summation
    return delta.flatten()
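Equivalently, the same gradient can be computed in a single vectorized expression; this is a sketch of an alternative (the name deltas_vectorized is mine, not from the exercise), which could be passed as jac= in place of deltas() below:
# vectorized gradient: (1/m) * X^T (g(X theta) - y), flattened to match deltas()
def deltas_vectorized(theta):
    theta = theta.reshape((n+1), 1)   # minimize() passes theta as a flat array
    return ((1.0/m) * X.T.dot(g(X.dot(theta)) - y)).flatten()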
In [7]:
# BFGS
import scipy.optimize as op
# gradient estimated
solution = op.minimize(J, theta)
print "\nSOLUTION (gradient estimated)"
print solution
theta = np.zeros(((n+1),1))
# gradient provided
solution = op.minimize(J, theta, jac=deltas)
print "\nSOLUTION (gradient provided)"
print solution
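One optional way to gain confidence in the analytic gradient is scipy's finite-difference check; scipy.optimize.check_grad returns the norm of the difference between deltas() and a numerical estimate, and should be close to zero (a sketch, evaluated at the zero vector):
# compare analytic vs. finite-difference gradient at theta = 0
print op.check_grad(lambda t: float(J(t)), deltas, np.zeros(n+1))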
In [8]:
# final values
theta = solution.x
print "\nFinal theta parameters: " + str(theta)
J = solution.fun
print "\nFinal J cost: " + str(J)
In [9]:
# visualize the training data as a sigmoid function
# with final theta parameters
plt.scatter(X.dot(theta),g(X.dot(theta)))
plt.show()
$g(z) \geq 0.5$ when $z \geq 0$
thus: $g(\theta^{T}x) \geq 0.5$, whenever $\theta^{T}x \geq 0$
$\theta^{T}x = \theta_{0}x_{0} + \theta_{1}x_{1} + \theta_{2}x_{2} = 0$
[with $x_{0} = 1$; given values for $x_{1}$, solve for $x_{2}$]
$\theta_{0} + \theta_{1}x_{1} + \theta_{2}x_{2} = 0$
$\theta_{1}x_{1} + \theta_{2}x_{2} = -\theta_{0}$
$\theta_{2}x_{2} = -\theta_{1}x_{1} - \theta_{0}$
$x_{2} = \frac{1}{\theta_{2}} (-\theta_{1}x_{1} - \theta_{0})$
$x_{2} = -\frac{1}{\theta_{2}} (\theta_{1}x_{1} + \theta_{0})$
In [10]:
# visualize the training data
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
for i in xrange(0,m):
    if (y[i] == 1):
        plt.scatter(X[i,1],X[i,2], marker='+', color='b')
    else:
        plt.scatter(X[i,1],X[i,2], marker='o', color='y')
x1_dec_bound = np.array([[np.min(X[:,1])], [np.max(X[:,1])]])
x2_dec_bound = (-1.0/theta[2])*(theta[1]*x1_dec_bound + theta[0])
plt.plot(x1_dec_bound, x2_dec_bound, label="dec. bound.", color='r')
plt.legend(loc=1)
plt.show()
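Note that the scatter and boundary above are plotted in mean-normalized units even though the axes are labeled with exam scores. If desired, both can be mapped back to the original score scale using the scaling statistics computed earlier (a sketch, not part of the original):
# undo the mean normalization to draw the boundary in raw exam-score units
exam1_bound = x1_dec_bound * s_1 + xbar_1
exam2_bound = x2_dec_bound * s_2 + xbar_2
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.scatter(positive[0], positive[1], marker='+', color='b', label="admitted")
plt.scatter(negative[0], negative[1], marker='o', color='y', label="rejected")
plt.plot(exam1_bound, exam2_bound, color='r', label="dec. bound.")
plt.legend(loc=1)
plt.show()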
In [11]:
# predict admission of student
x = np.array([[1.0],
              [45],
              [85]])
# mean normalize with the training-set statistics (required, since the model was trained on scaled features)
x[1] = np.true_divide((np.subtract(x[1],xbar_1)),s_1)
x[2] = np.true_divide((np.subtract(x[2],xbar_2)),s_2)
# compute and display prediction
y_hat = g(theta.T.dot(x))
print "\nAdmission probability if exam1=45 and exam2=85: " + str(y_hat) + "\n"
In [12]:
# test/count correct predictions
correct_predictions = 0
for i in xrange(0,m):
    if (np.round(g(theta.T.dot(X[i]))) == y[i]):
        correct_predictions += 1
print "\nTotal training examples: " + str(m)
print "\nTotal correct predictions: " + str(correct_predictions) + " (" + str((correct_predictions * 1.0/m)*100) + "% accurate)"
In [13]:
# get original exam scores
exam1 = data[0].values
exam2 = data[1].values
# display incorrect predictions
for i in xrange(0,m):
    if (np.round(g(theta.T.dot(X[i]))) != y[i]):
        print "Exam1: " + str(exam1[i]) + ", Exam2: " + str(exam2[i])
        plt.scatter(exam1[i],exam2[i], color='r')
# visualize
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.show()