In [3]:
%matplotlib inline
In [4]:
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
In [5]:
import os, sys
os.getcwd()
os.listdir( os.getcwd() ) ;
In [6]:
import numpy as np
import scipy
import scipy.stats  # submodules such as scipy.stats and scipy.io must be imported explicitly
import scipy.io
In [7]:
import pandas as pd
In [7]:
np.random.random(10).shape # testing, playing,
Out[7]:
In [10]:
x = np.array([5,3,0,4])
y = np.array([4,4,1,3])
x.shape
Out[10]:
In [18]:
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x,y)
print( slope, intercept, r_value,p_value,std_err)
In [16]:
( (x-y)*(x-y) ).mean()/2.
Out[16]:
In [17]:
-1.+0.5*4
Out[17]:
cf. Week 1, Linear Algebra Review, Coursera, Machine Learning with Ng
I'll take this opportunity to provide a dictionary between the syntax of linear algebra math and numpy
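As a rough sketch of that dictionary (my own summary, not from the course materials), a few common correspondences between linear algebra / Matlab-Octave notation and numpy:
In [ ]:
import numpy as np

A = np.arange(6.).reshape(2,3)    # a 2x3 matrix A
B = np.arange(12.).reshape(3,4)   # a 3x4 matrix B
x = np.array([1.,2.,3.])          # a vector x in R^3

np.identity(5)                    # I_5, i.e. eye(5) in Matlab/Octave
A.T                               # A^T, i.e. A' in Matlab/Octave
np.dot(A,B)                       # matrix product AB, i.e. A*B in Matlab/Octave
np.dot(A,x)                       # matrix-vector product Ax
np.dot(x,x)                       # inner product x^T x, i.e. x'*x in Matlab/Octave
A*A                               # elementwise (Hadamard) product, i.e. A.*A in Matlab/Octave
np.linalg.inv( np.dot(A,A.T) )    # matrix inverse, i.e. inv(A*A') in Matlab/Octave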
In [ ]:
In [ ]:
cf. Week 2 Programming Assignment:Linear Regression, 1st programming assignment, machine-learning-ex1.zip
In [19]:
np.identity(5) # eye(5) in Matlab/Octave
Out[19]:
In [6]:
os.listdir( './coursera_Ng/machine-learning-ex1/' )
Out[6]:
In [7]:
os.listdir( './coursera_Ng/machine-learning-ex1/ex1' )
Out[7]:
pandas.read_csv
Parameters: header defaults to 0 ("infer") if no names are passed, but we need to set it to None
so that the first row is treated as data rather than as column names.
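For example, with a hypothetical headerless file data.txt (a minimal sketch):
In [ ]:
import pandas as pd
# header defaults to 'infer', so row 0 would become the column names;
# with header=None the first row is kept as data and the columns are labeled 0, 1, ...
df = pd.read_csv('data.txt', header=None)
df.head()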
In [8]:
## ================================== Part 2: Plotting =====================================
print("Plotting Data ... \n")
linregdata = pd.read_csv('./coursera_Ng/machine-learning-ex1/ex1/ex1data1.txt', header=None)
In [9]:
print( linregdata.describe() )
linregdata.head()
Out[9]:
In [10]:
X_linreg = linregdata.as_matrix()[:,0] # pandas.DataFrame.as_matrix convert frame to its numpy-array representation
y_linreg = linregdata.as_matrix()[:,1]
m_linreg = len(y_linreg) # number of training examples
print( X_linreg.shape, type(X_linreg))
print( y_linreg.shape, type(y_linreg))
print( m_linreg )
In [42]:
X_linreg
Out[42]:
In [45]:
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.plot( X_linreg, y_linreg, 'rx', markersize=10)
Out[45]:
cf. machine-learning-ex1/ex1/ex1.m
cf. How to add column to numpy array
"
all_data = np.hstack((my_data, new_col))
#or
all_data = np.concatenate((my_data, new_col), 1)
I believe that the only difference between these three functions (as well as np.vstack) are their default behaviors for when axis is unspecified:
* `concatenate` assumes axis = 0
* `hstack` assumes axis = 1 unless inputs are 1d, then `axis = 0`
* `vstack` assumes axis = 0 after adding an axis if inputs are 1d
* `append` flattens array
"
See also How to add an extra column to an numpy array
"I think a more straightforward solution and faster to boot is to do the following:"
import numpy as np
N = 10
a = np.random.rand(N,N)
b = np.zeros((N,N+1))
b[:,:-1] = a
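A quick check of those default-axis behaviors (my own small demo, not part of the exercise):
In [ ]:
import numpy as np
a = np.ones((3,2))
b = np.zeros((3,2))
print( np.concatenate((a,b)).shape )  # (6, 2) -- concatenate defaults to axis=0
print( np.hstack((a,b)).shape )       # (3, 4) -- hstack joins 2-d arrays along axis=1
print( np.vstack((a,b)).shape )       # (6, 2) -- vstack stacks along axis=0
print( np.append(a,b).shape )         # (12,)  -- append flattens when no axis is given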
In [11]:
# np.hstack( ( np.ones((m_linreg,1)), X_linreg.reshape(-1,1) ) ).shape
input_X_linreg = np.vstack( (np.ones(m_linreg), X_linreg ) ).T
input_X_linreg.shape
Out[11]:
In [64]:
b = np.ones((m_linreg,2))
b.shape
b[:,-1] = X_linreg # fill the last column with the data; the first column stays all 1's
In [12]:
theta = np.zeros((2,1)); print( theta )
In [ ]:
# Some gradient descent settings
iterations = 1500
alpha = 0.01 # learning rate
In [84]:
print( np.dot( input_X_linreg, theta ).shape )
y_linreg.reshape((len(y_linreg),1)).shape
#(np.dot( input_X_linreg, theta) - y_linreg ).shape
Out[84]:
In [90]:
res = np.dot(input_X_linreg,theta) - y_linreg.reshape((len(y_linreg),1))
(res*res).mean()
Out[90]:
In [105]:
print( res.shape)
print( input_X_linreg.shape )
print( X_linreg.shape)
input_X_linreg[:,1].reshape( 97,1).shape
Out[105]:
In [109]:
# ( res * input_X_linreg[:,1].reshape(m_linreg,1) ).shape
dres = ( res * input_X_linreg )
print(dres.shape)
dres.mean(axis=0)
Out[109]:
In [115]:
temp = theta.flatten() - 0.5 * dres.mean(axis=0)
print( temp)
In [118]:
theta - np.vstack( temp)
Out[118]:
The object of linear regression is to minimize the cost function:
$$ J(\theta) = \frac{1}{2m} \sum_{i=1}^m (h_{\theta}(x^{(i)}) - y^{(i)})^2 $$
In [13]:
def computeCost(X,y,theta):
"""
COMPUTECOST Compute cost for linear regression
J = COMPUTECOST(X, y, theta) computes the cost of using theta as the
parameter for linear regression to fit the data points in X and y
"""
# Initialize some useful values
m = len(y) # number of training examples
# You will need to return the following variable correctly
J = 0
# attach a column of 1's to make \theta_0 a feature
input_X_linreg = np.vstack( (np.ones(m), X)).T # take the transpose since the np array is a "row"
# preprocess inputs X,y to make sure numpy array dimensions are correct
target_y_linreg = y.reshape((m,1))
# Compute the cost of a particular choice of theta. Set J to the cost
## assume that theta is a numpy array of dim. 2x1 and not (2,)
predicted_vals = np.dot( input_X_linreg, theta) # X * \theta
res = predicted_vals - target_y_linreg # res for residual
ressq = res * res
J = ressq.mean() * 0.5
return J
In [93]:
computeCost(X_linreg, y_linreg, theta)
Out[93]:
In [ ]:
In [14]:
def gradientDescent(X, y, theta, alpha, num_iters=1500):
"""
GRADIENTDESCENT Performs gradient descent to learn theta
theta = GRADIENTDESCENT(X, y, theta, alpha, num_iters) updates theta by
taking num_iters gradient steps with learning rate alpha
"""
# Initialize some useful values
m = len(y)
J_history = np.zeros(num_iters);
# attach a column of 1's to make \theta_0 a feature
input_X_linreg = np.vstack( (np.ones(m), X)).T
# preprocess inputs X,y to make sure numpy array dimensions are correct
target_y_linreg = y.reshape((m,1))
for iter in range(num_iters):
# perform a single gradient step on the parameter vector theta
## assume that theta is a numpy array of dim. 2x1, and not (2,)
### predicted_vals is h_{\theta}(x^{(i)}), h is the hypothesis
predicted_vals = np.dot( input_X_linreg, theta) # X * \theta
res = predicted_vals - target_y_linreg # res for residual
dres = res * input_X_linreg # it's the partial derivative with respect to each x_j
temp = theta.flatten() - alpha * dres.mean( axis=0)
theta = np.vstack( temp ) # this is effectively numpy's transpose from row to column vector
J_history[iter] = computeCost(X,y,theta)
return theta, J_history
In [120]:
theta
Out[120]:
In [121]:
result_1d_graddesc = gradientDescent(X_linreg, y_linreg,theta, 0.01)
In [15]:
%time result_1d_graddesc = gradientDescent(X_linreg, y_linreg,theta, 0.01)
In [132]:
print( result_1d_graddesc[0] )
print( result_1d_graddesc[0][0,0] )
print( result_1d_graddesc[0][1,0])
In [124]:
plt.plot( result_1d_graddesc[1] )
Out[124]:
cf. 2.2.4 Gradient descent, pp. 7 of ex1.pdf:
"Your final values for $\theta$ will also be used to make predictions on profits in areas of 35,000 and 70,000 people. Note the way that the following lines in ex1.m uses matrix multiplication, rather than explicit summation or looping, to calculate the predictions."
In [149]:
# Plot the linear fit
# cf. http://matthiaseisen.com/pp/patterns/p0170/ Draw a regression line with matplotlib
fig, ax = plt.subplots()
ax.set_xlabel('Population of City in 10,000s')
ax.set_ylabel('Profit in $10,000s')
#plt.plot( X_linreg, y_linreg, 'rx', markersize=10)
ax.scatter(X_linreg, y_linreg, s=40,c='r',marker='x' ) # s is size in points^2, c is color
ax.plot( X_linreg,
result_1d_graddesc[0][0,0] + X_linreg * result_1d_graddesc[0][1,0],'-')
Out[149]:
In [128]:
type(X_linreg); X_linreg.shape
Out[128]:
In [155]:
# Grid over which we will calculate J
theta0_vals = np.arange(-10,10,20./100.)
theta1_vals = np.arange(-1,4,5./100.)
theta0_vals, theta1_vals = np.meshgrid( theta0_vals, theta1_vals )
In [156]:
print( type( theta0_vals ), type( theta1_vals) )
print( theta0_vals.shape, theta1_vals.shape )
In [163]:
# Note to self: computeCost(X_linreg, y_linreg, theta)
# Fill out J_vals
J_vals = np.array( [[computeCost(X_linreg,y_linreg, np.vstack( np.array( [
theta0_vals[i,j],
theta1_vals[i,j] ] ) ) ) for j in range(100) ]
for i in range(100) ] )
In [168]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
In [169]:
surf_fig = plt.figure()
surf_ax = surf_fig.gca(projection='3d')
# Plot the surface
surftheta = surf_ax.plot_surface( theta0_vals, theta1_vals, J_vals,
cmap=cm.coolwarm, linewidth=0, antialiased=False)
# Add a color bar which maps values to colors
surf_fig.colorbar(surftheta, shrink=0.5, aspect=5)
Out[169]:
In [158]:
theta
Out[158]:
In [161]:
np.vstack( np.array( [ theta0_vals[2,3], theta1_vals[2,3] ] ) )
Out[161]:
In [176]:
# Contour plot
plt.figure()
thetacontourplt = plt.contour(theta0_vals, theta1_vals, J_vals,
levels=np.logspace(-2,3,20) )
plt.clabel(thetacontourplt, inline=True, fontsize=10)
plt.xlabel(r'$\theta_0$') # raw string: '\t' in a plain string would be a tab character
plt.ylabel(r'$\theta_1$')
plt.plot( result_1d_graddesc[0][0,0], result_1d_graddesc[0][1,0], 'rx', markersize=10 )
plt.show()
In [18]:
## Load Data
print("Loading data ... \n")
linregdata2 = pd.read_csv('./coursera_Ng/machine-learning-ex1/ex1/ex1data2.txt', header=None)
print( linregdata2.describe() )
linregdata2.head()
Out[18]:
In [11]:
linregdata2.as_matrix([0,1])
Out[11]:
In [19]:
X_linreg = linregdata2.as_matrix([0,1]) # pandas.DataFrame.as_matrix convert frame to its numpy-array representation
y_linreg = linregdata2.as_matrix([2])
m_linreg = len(y_linreg) # number of training examples
print( X_linreg.shape, type(X_linreg))
print( y_linreg.shape, type(y_linreg))
print( m_linreg )
In [15]:
print( X_linreg.mean(axis=0) )
y_linreg.mean()
Out[15]:
In [18]:
print( X_linreg.std(axis=0) )
y_linreg.std()
Out[18]:
In [20]:
normedX_linreg = (X_linreg - X_linreg.mean(axis=0))/X_linreg.std(axis=0)
normedy_linreg = (y_linreg - y_linreg.mean(axis=0))/y_linreg.std(axis=0)
In [25]:
print( normedX_linreg.mean() )
print( normedX_linreg.std() )
print( normedy_linreg.mean() )
normedy_linreg.std()
Out[25]:
In [21]:
def featureNormalize(X):
"""
FEATURENORMALIZE Normalizes the features in X
FEATURENORMALIZE(X) returns a normalized version of X where
the mean value of each feature is 0 and the standard deviation
is 1. This is often a good preprocessing step to do when
working with learning algorithms.
"""
# You need to set these values correctly
X_norm = (X-X.mean(axis=0))/X.std(axis=0)
mu = X.mean(axis=0)
sigma = X.std(axis=0)
return [X_norm, mu, sigma]
In [35]:
print( normedX_linreg.shape )
theta = np.zeros((2,1))
np.zeros((2,1)).shape
Out[35]:
In [38]:
predicted_val = np.dot( normedX_linreg, theta )
In [39]:
res = predicted_val - y_linreg
In [43]:
%timeit np.dot( res.T, res )/ m_linreg
In [44]:
%timeit (res*res).mean()
In [16]:
def computeCostMulti(X,y,theta):
"""
COMPUTECOSTMULTI Compute cost for linear regression with multiple variables
J = COMPUTECOSTMULTI(X, y, theta) computes the cost of using theta as the
parameter for linear regression to fit the data points in X and y
"""
# Initialize some useful values
m = len(y) # number of training examples
# You need to return the following variables correctly
J = 0
# Compute the cost of a particular choice of theta, setting J to the cost
predicted_val = np.dot(X,theta)
res = predicted_val - y
J = np.dot( res.T, res)/m * 0.5
return J
In [53]:
np.dot( np.dot( np.linalg.inv( np.dot( normedX_linreg.T, normedX_linreg) ) , normedX_linreg.T ) , y_linreg)
Out[53]:
In [17]:
def gradientDescentMulti(X, y, theta, alpha, num_iters=1500):
"""
GRADIENTDESCENTMULTI Performs gradient descent to learn theta
theta = GRADIENTDESCENTMULTI(X, y, theta, alpha, num_iters) updates theta by
taking num_iters gradient steps with learning rate alpha
"""
# Initialize some useful values
m = len(y) # number of training examples
J_history = np.zeros((num_iters , 1) )
## assume preprocessing is needed (in the case when using pandas for DataFrame, only column of 1 is needed)
input_X_linreg = np.hstack( (np.ones((m,1)), X))
for iter in range(num_iters):
# perform a single gradient step on the parameter vector theta
predicted_vals = np.dot( input_X_linreg, theta)
res = predicted_vals - y
dres = res * input_X_linreg # it's the partial derivative with respect to each x_j
temp = theta.flatten() - alpha * dres.mean( axis=0)
theta = np.vstack( temp ) # this is effectively numpy's transpose from row to column vector
J_history[iter] = computeCostMulti(input_X_linreg,y,theta)
return theta, J_history
In [22]:
input_X_linreg = np.hstack( ( np.ones((m_linreg,1)), normedX_linreg ) )
In [67]:
predicted_vals_multi = np.dot( input_X_linreg, np.zeros((3,1)) )
predicted_vals_multi.shape
Out[67]:
In [69]:
res_multi = predicted_vals_multi - y_linreg
In [71]:
dres_multi = res_multi * input_X_linreg
dres_multi.shape
Out[71]:
In [73]:
( np.zeros((3,1)).flatten() - 0.5 * dres_multi.mean(axis=0) ).shape
Out[73]:
Bottom line for serial, (batch) gradient descent for multiple variables (multiple features)
In [97]:
[normedX_linreg, mu_multi, sigma_multi] = featureNormalize(X_linreg)
In [26]:
theta = np.zeros((3,1))
In [24]:
alpha = 0.01 # Learning rate
num_iters = 400
In [101]:
[theta, J_history] = gradientDescentMulti(normedX_linreg, y_linreg, theta, alpha, num_iters)
In [27]:
%time [theta, J_history] = gradientDescentMulti(normedX_linreg, y_linreg, theta, alpha, num_iters)
In [106]:
plt.plot( J_history)
plt.show()
In [28]:
theta
Out[28]:
In [108]:
def normalEqn(X,y):
"""
NORMALEQN Computes the closed-form solution to linear regression
NORMALEQN(X,y) computes the closed-form solution to linear
regression using the normal equations.
"""
normX = np.dot( X.T, X)
normXinverse = np.linalg.inv( normX )
theta = np.dot( np.dot( normXinverse, X.T),y)
return theta
In [110]:
# Add intercept term to X
X_linreg_w_intercept = np.hstack( (np.ones((m_linreg,1)), X_linreg))
In [111]:
normalEqn( X_linreg_w_intercept, y_linreg)
Out[111]:
cf. https://www.coursera.org/learn/machine-learning/exam/7pytE/linear-regression-with-multiple-variables
In [113]:
midterm_exampow2 = np.array( [ 7921, 5184, 8836, 4761])
In [119]:
midterm_exampow2min = midterm_exampow2.min()
midterm_exampow2max = midterm_exampow2.max()
midterm_exampow2range = float( midterm_exampow2max - midterm_exampow2min )
midterm_exampow2mean = midterm_exampow2.mean()
In [118]:
midterm_exampow2 / midterm_exampow2range
Out[118]:
In [121]:
(midterm_exampow2 - midterm_exampow2mean) / midterm_exampow2range
Out[121]:
Took the quiz today for Week 2. 20170209
EY : Diabetes Sample data from sci-kit learn, sklearn
In [29]:
# Load the diabetes dataset
diabetes = sklearn.datasets.load_diabetes()
In [30]:
diabetes_X = diabetes.data
diabetes_Y = diabetes.target
In [31]:
#diabetes_X1 = diabetes_X[:,np.newaxis,2]
diabetes_X1 = diabetes_X[:,np.newaxis, 2]
In [32]:
theta = np.zeros((2,1))
In [34]:
%time linreg_diabetes_result = gradientDescent(diabetes_X1.flatten(), diabetes_Y.flatten(),theta,0.01,num_iters=10000)
In [35]:
print(linreg_diabetes_result)
In [36]:
theta = np.zeros((diabetes_X.shape[1]+1,1))
In [42]:
%time linreg_diabetes_result = gradientDescentMulti(diabetes_X, np.vstack(diabetes_Y),theta,0.01,num_iters=10000)
In [38]:
diabetes_X.shape
Out[38]:
In [40]:
theta.shape
Out[40]:
In [41]:
diabetes_Y.shape
Out[41]:
In [43]:
os.listdir( './coursera_Ng/machine-learning-ex2/' )
Out[43]:
In [44]:
os.listdir( './coursera_Ng/machine-learning-ex2/ex2' )
Out[44]:
pandas.read_csv
Parameters: header defaults to 0 ("infer") if no names are passed, but we need to set it to None
so that the first row is treated as data rather than as column names.
In [45]:
## ================================== Part 2: Plotting =====================================
print("Plotting Data ... \n")
logregdata = pd.read_csv('./coursera_Ng/machine-learning-ex2/ex2/ex2data1.txt', header=None)
In [47]:
logregdata.describe()
Out[47]:
In [ ]:
plt.scatter( logregdata.as_matrix([0]) , logregdata.as_matrix([1]) )
In [ ]:
df.loc[df['column_name'] == some_value]
In [51]:
logregdata.head()
Out[51]:
In [52]:
# Find Indices of Positive and Negative Examples
logregdata.loc[ logregdata[2] == 0]
Out[52]:
In [79]:
# First,
# Find Indices of Positive and Negative Examples
# neg, negative examples, y == 0
plt.scatter( logregdata.loc[ logregdata[2] == 0].as_matrix([0]) ,
logregdata.loc[ logregdata[2] == 0].as_matrix([1]), s=35,c='y',marker='o' , label='Not Admitted')
# parameters for scatter: s : size in points^2, c : color, marker : MarkerStyle, default 'o'
# pos. positive examples, y == 1
plt.scatter( logregdata.loc[ logregdata[2] == 1].as_matrix([0]) ,
logregdata.loc[ logregdata[2] == 1].as_matrix([1]), s=75,c='b',marker='+' , label='Admitted')
plt.xlabel("Exam 1 score")
plt.ylabel("Exam 2 score")
plt.legend(loc=5)
plt.show()
cf. sigmoid.m
The sigmoid function is defined as
$$ g(z) = \frac{1}{1 + e^{-z} } $$
In [17]:
def sigmoid(z):
"""
SIGMOID Compute sigmoid function
g = SIGMOID(z) computes the sigmoid of z
"""
g = (1. + np.exp(-z))
g = 1./g
return g
In [64]:
sigmoid( np.array([1,2,3]))
Out[64]:
cf. 1.2.2 Cost function and gradient, pp. 4, ex2.pdf
Remember that, for the logistic regression hypothesis, $$ h_{\theta}(x) = g(\theta^Tx ) $$
with $g$ being the sigmoid function, defined as
$$
g(z) = \frac{1}{ 1 + e^{-z} }
$$
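The (unregularized) cost function and its gradient that costFunction below implements are the standard ones for logistic regression (cf. 1.2.2 of ex2.pdf):
$$ J(\theta) = \frac{1}{m} \sum_{i=1}^m \left[ -y^{(i)} \log{ (h_{\theta}(x^{(i)})) } - (1-y^{(i)}) \log{ (1 - h_{\theta}(x^{(i)})) } \right] $$
$$ \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^m (h_{\theta}(x^{(i)}) - y^{(i)}) x_j^{(i)} $$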
In [91]:
def costFunction(theta, X,y):
"""
COSTFUNCTION Compute cost and gradient for logistic regression
J = COSTFUNCTION(theta, X, y) computes the cost of using theta as the
parameter for logistic regression and the gradient of the cost
w.r.t. to the parameters
RETURNS
=======
[J, grad]
"""
# Initialize some useful values
m = len(y) # number of training examples
# You need to return the following variables correctly
J = 0
grad = np.zeros( len(theta))
# Compute the cost of a particular choice of theta.
# You should set J to the cost.
# Compute the partial derivatives and set grad to the partial
# derivatives of the cost w.r.t. each parameter in theta
## assume preprocessing is needed (in the case when using pandas for DataFrame, only column of 1 is needed)
input_X_linreg = np.hstack( (np.ones((m,1)), X))
predicted_vals = np.dot( input_X_linreg, theta ) # h_{\theta}
predicted_vals = sigmoid( predicted_vals )
interpolation = -y * np.log( predicted_vals ) - (1. - y) * np.log( 1. - predicted_vals )
J = interpolation.mean()
res = predicted_vals - y # res for residual
dJ = res * input_X_linreg # res * x_j^{(i)}
grad = dJ.mean(axis=0)
return [J, grad]
In [92]:
len( np.vstack( np.zeros(3) ) )
Out[92]:
In [70]:
np.log( logregdata.as_matrix([0]) ).mean()
#logregdata.loc[ logregdata[2] == 1].as_matrix([0])
Out[70]:
In [93]:
## ======================= Part 2: Compute Cost and Gradient ==============================
d = logregdata.as_matrix().shape[1] - 1
# Initialize fitting parameters
initial_theta = np.zeros( (d + 1,1) )
In [94]:
X = logregdata.as_matrix( range(d) )
y = logregdata.as_matrix( [d])
# Compute and display initial cost and gradient
[cost, grad] = costFunction( initial_theta , X, y )
In [96]:
print("Cost at initial theta (zeros): %f\n" % cost )
print("Expected cost (approx): 0.693\n")
print("Gradient at initial theta (zeros): \n")
#print(" %f \n" % grad)
print(grad)
print("Expected gradients (approx):\n -0.1000\n -12.0092\n -11.2628\n")
In [97]:
# Compute and display cost and gradient with non-zero theta
test_theta = np.vstack( np.array( [-24., 0.2, 0.2]) )
[cost, grad] = costFunction( test_theta, X,y)
In [98]:
print("\nCost at test theta: %f\n" % cost)
print("Expected cost (approx): 0.218\n")
print("Gradient at test theta: \n")
print( grad)
print("Expected gradients (approx): \n 0.043\n 2.586\n 2.647\n")
In [99]:
## ===================== Part 3: optimizing using fminunc
#
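ex2.m calls Octave's fminunc here; a minimal sketch of the same step with scipy.optimize.minimize (my own wrapper around the costFunction defined above, which expects theta as a (d+1,1) column and returns [J, grad]):
In [ ]:
from scipy import optimize

def cost_flat(theta_flat, X, y):
    # reshape the flat parameter vector that scipy passes in to the column costFunction expects
    return costFunction( np.vstack(theta_flat), X, y)[0]

def grad_flat(theta_flat, X, y):
    return costFunction( np.vstack(theta_flat), X, y)[1]

res_opt = optimize.minimize( cost_flat, initial_theta.flatten(), args=(X,y),
                             jac=grad_flat, method='BFGS', options={'maxiter': 400})
print( res_opt.fun )  # final cost
print( res_opt.x )    # fitted theta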
cf. 2.3 Cost function and gradient
Recall the regularized cost function for logistic regression:
$$
J(\theta) = \frac{1}{m} \sum_{i=1}^m \left[-y^{(i)} \log{ (h_{\theta}(x^{(i)} )) } - (1-y^{(i)} )\log{ (1-h_{\theta}(x^{(i)} )) } \right] + \frac{ \lambda}{ 2m } \sum_{j=1}^n \theta_j^2
$$
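The corresponding gradient, which costFunctionReg below computes (note that $\theta_0$ is not regularized):
$$ \frac{\partial J(\theta)}{\partial \theta_0} = \frac{1}{m} \sum_{i=1}^m (h_{\theta}(x^{(i)}) - y^{(i)}) x_0^{(i)} $$
$$ \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^m (h_{\theta}(x^{(i)}) - y^{(i)}) x_j^{(i)} + \frac{\lambda}{m} \theta_j \qquad \text{for } j \geq 1 $$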
In [107]:
def costFunctionReg(theta, X,y,lambda_val):
"""
COSTFUNCTIONREG Compute cost and gradient for logistic regression with regularization
J = COSTFUNCTION(theta, X, y) computes the cost of using theta as the
parameter for regularized logistic regression and the gradient of the cost
w.r.t. to the parameters
RETURNS
=======
[J, grad]
"""
# Initialize some useful values
m = len(y) # number of training examples
# You need to return the following variables correctly
J = 0
grad = np.zeros( len(theta))
# Compute the cost of a particular choice of theta.
# You should set J to the cost.
# Compute the partial derivatives and set grad to the partial
# derivatives of the cost w.r.t. each parameter in theta
## assume preprocessing is needed (in the case when using pandas for DataFrame, only column of 1 is needed)
input_X_linreg = np.hstack( (np.ones((m,1)), X))
predicted_vals = np.dot( input_X_linreg, theta ) # h_{\theta}
predicted_vals = sigmoid( predicted_vals )
interpolation = -y * np.log( predicted_vals ) - (1. - y) * np.log( 1. - predicted_vals )
J = interpolation.mean()
# regularized term
theta1 = theta[1:] # "Note that you should not regularize the parameter $\theta_0$"
reg_term = np.inner( theta1.flatten(), theta1.flatten() ) # \sum_{j=1}^n \theta_j^2
reg_term = lambda_val * (1./(2. * m)) * reg_term
J += reg_term
res = predicted_vals - y # res for residual
dJ = res * input_X_linreg # res * x_j^{(i)}
grad = dJ.mean(axis=0)
# regularization term for gradient : grad_reg_term
grad_reg_term = np.zeros( len(theta))
grad_reg_term[1:] = theta[1:]
grad_reg_term *= lambda_val / float(m)
grad += grad_reg_term
return [J, grad]
In [102]:
initial_theta[1:].shape
Out[102]:
In [106]:
np.inner( initial_theta.flatten(), initial_theta.flatten())
Out[106]:
In [ ]:
cf. ex3.pdf
, Programming Exercise 3: Multi-class Classification and Neural Networks, Machine Learning,
1 Multi-class Classification
In [108]:
os.getcwd()
Out[108]:
In [109]:
os.listdir( './coursera_Ng/machine-learning-ex3/' )
Out[109]:
In [110]:
os.listdir( './coursera_Ng/machine-learning-ex3/ex3/' )
Out[110]:
In [14]:
# Load saved matrices from file
multiclscls_data = scipy.io.loadmat('./coursera_Ng/machine-learning-ex3/ex3/ex3data1.mat')
In [115]:
multiclscls_data.keys()
Out[115]:
In [119]:
print( type(multiclscls_data['y'] ))
print( multiclscls_data['y'].shape )
print(type(multiclscls_data['X']))
print( multiclscls_data['X'].shape )
In [9]:
def sigmoid(z):
"""
SIGMOID Compute sigmoid function
g = SIGMOID(z) computes the sigmoid of z.
"""
g = 1.0 / (1.0 + np.exp(-z))
return g
cf. 1.3 Vectorizing Logistic Regression, 1.3.1 Vectorizing the cost function pp. 4, ex3.pdf
Recall the (unregularized) logistic regression cost function:
$$ J(\theta) = \frac{1}{m} \sum_{i=1}^m \left[ -y^{ (i)} \log{ (h_{\theta}(x^{(i)} ) ) } - (1-y^{(i)} )\log{ (1-h_{\theta}(x^{(i)} ) ) } \right] $$
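In fully vectorized form (what lrCostFunction below aims for), with $X$ the $m \times (d+1)$ input matrix including the column of 1's:
$$ J(\theta) = \frac{1}{m} \left( -y^T \log{ (g(X\theta)) } - (1-y)^T \log{ (1 - g(X\theta)) } \right) , \qquad \nabla_{\theta} J = \frac{1}{m} X^T ( g(X\theta) - y ) $$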
In [36]:
def lrCostFunction(theta, X, y, lambda_val):
"""
LRCOSTFUNCTION Compute cost and gradient for logistic regression with
regularization
J = LRCOSTFUNCTION(theta, X, y,lambda_val) computes the cost of using theta as the
parameter for regularized logistic regression and the gradient of the cost
w.r.t. to the parameters
theta
@type : numpy array of matrix size d+1,1 i.e. theta \in \mathbb{R}^{d+1}
@param : "weights"
RETURNS
=======
[J, grad]
"""
# Initialize some useful values
m = len(y) # number of training examples
# You need to return the following variables correctly
J = 0
grad = np.zeros( len(theta))
# Compute the cost of a particular choice of theta.
# You should set J to the cost.
# Compute the partial derivatives and set grad to the partial
# derivatives of the cost w.r.t. each parameter in theta
## assume preprocessing is needed (in the case when using pandas for DataFrame, only column of 1 is needed)
input_X_linreg = np.hstack( (np.ones((m,1)), X))
predicted_vals = np.dot( input_X_linreg, theta ) # h_{\theta}
predicted_vals = sigmoid( predicted_vals )
interpolation = -y * np.log( predicted_vals ) - (1. - y) * np.log( 1. - predicted_vals )
J = interpolation.mean()
res = predicted_vals - y # res for residual
# dJ = res * input_X_linreg # res * x_j^{(i)}
# grad = dJ.mean(axis=0)
dJ = np.dot( input_X_linreg.T , res ) / float(m) # divide by m so this is the gradient of the mean cost
# regularized term
theta1 = theta[1:] # "Note that you should not regularize the parameter $\theta_0$"
reg_term = np.inner( theta1.flatten(), theta1.flatten() ) # \sum_{j=1}^n \theta_j^2
reg_term = lambda_val * (1./(2. * m)) * reg_term
J += reg_term
# res = predicted_vals - y # res for residual
# dJ = res * input_X_linreg # res * x_j^{(i)}
# grad = dJ.mean(axis=0)
# regularization term for gradient : grad_reg_term
grad_reg_term = np.vstack( np.zeros( len(theta)) )
grad_reg_term[1:] = theta[1:]
grad_reg_term *= lambda_val / float(m)
grad = dJ + grad_reg_term
return [J, grad]
In [123]:
y = multiclscls_data['y']
X = multiclscls_data['X']
In [7]:
theta_t = np.vstack( np.array( [-2, -1, 1, 2]) )
X_t = np.array( [i/10. for i in range(1,16)]).reshape((3,5)).T
#X_t = np.hstack( ( np.ones((5,1)), X_t) ) # no need to preprocess the input data X with column of 1's
y_t = np.vstack( np.array( [1,0,1,0,1]))
In [152]:
[J_t, grad_t] = lrCostFunction( theta_t, X_t, y_t, 3);
print( J_t) # Expected cost: 2.534819
Let's try to "vectorize" this more.
In [6]:
def lrCostFunction(theta, X, y, lambda_val):
"""
LRCOSTFUNCTION Compute cost and gradient for logistic regression with
regularization
J = LRCOSTFUNCTION(theta, X, y,lambda_val) computes the cost of using theta as the
parameter for regularized logistic regression and the gradient of the cost
w.r.t. to the parameters
theta
@type : numpy array of matrix size d+1,1 i.e. theta \in \mathbb{R}^{d+1}
@param : "weights"
RETURNS
=======
[J, grad]
"""
# Initialize some useful values
m = len(y) # number of training examples
# You need to return the following variables correctly
J = 0
grad = np.zeros( len(theta))
# Compute the cost of a particular choice of theta.
# You should set J to the cost.
# Compute the partial derivatives and set grad to the partial
# derivatives of the cost w.r.t. each parameter in theta
## assume preprocessing is needed (in the case when using pandas for DataFrame, only column of 1 is needed)
input_X = np.hstack( (np.ones((m,1)), X))
z = np.dot( input_X, theta )
predicted_vals = sigmoid( z ) # h_{\theta}
interpolation = - np.dot( y.T, np.log( predicted_vals) ) - np.dot( (1. -y).T, np.log( 1. - predicted_vals ))
J = interpolation[0][0] /float(m)
res = predicted_vals - y # res for residual
dJ = np.dot( input_X.T , res ) / float(m) # divide by m so this is the gradient of the mean cost
# regularized term
theta1 = theta[1:] # "Note that you should not regularize the parameter $\theta_0$"
reg_term = np.inner( theta1.flatten(), theta1.flatten() ) # \sum_{j=1}^n \theta_j^2
reg_term = lambda_val * (1./(2. * m)) * reg_term
J += reg_term
# regularization term for gradient : grad_reg_term
grad_reg_term = np.vstack( np.zeros( len(theta)) )
grad_reg_term[1:] = theta[1:]
grad_reg_term *= lambda_val / float(m)
grad = dJ + grad_reg_term
return [J, grad]
In [10]:
[J_t, grad_t] = lrCostFunction( theta_t, X_t, y_t, 3);
print( J_t) # Expected cost: 2.534819
In [176]:
X_t.shape
Out[176]:
In [160]:
input_X_t = np.hstack( (np.ones((5,1)), X_t))
z = np.dot( input_X_t, theta_t )
predicted_vals = sigmoid( z )
In [163]:
print( y_t.shape )
print( predicted_vals.shape )
In [167]:
.1 - y_t
Out[167]:
In [165]:
y_t
Out[165]:
In [168]:
interpolation = - np.dot( y_t.T, np.log( predicted_vals) ) - np.dot( (1. -y_t).T, np.log( 1. - predicted_vals ))
In [173]:
interpolation[0][0]
Out[173]:
In [11]:
grad_t
Out[11]:
In [15]:
y = multiclscls_data['y']
X = multiclscls_data['X']
In [17]:
num_labels= 10
lambda_val = 0.1
In [20]:
from scipy import optimize
In [42]:
print( X.shape )
#print( np.zeros( (5, 8) ) )
#initial_theta = np.vstack( np.zeros( ( X.shape[1]+1, 1)) )
In [48]:
theta_t = np.array( [-2, -1, 1, 2])
X_t = np.array( [i/10. for i in range(1,16)]).reshape((3,5)).T
#X_t = np.hstack( ( np.ones((5,1)), X_t) ) # no need to preprocess the input data X with column of 1's
y_t = np.vstack( np.array( [1,0,1,0,1]))
input_X_t = np.hstack( (np.ones(( len(y_t), 1)), X_t))
In [62]:
print( np.dot( input_X_t, theta_t).shape )
print( np.dot( y_t.T, np.log( sigmoid( np.dot( input_X_t, theta_t) ) ))[0] )
print( np.dot( input_X_t.T, y_t.flatten() ) )
np.inner( theta_t[1:].flatten(), theta_t[1:].flatten() )
theta_t[1:]
Out[62]:
In [74]:
def lrCostFunction(theta, X, y, lambda_val):
"""
LRCOSTFUNCTION Compute cost and gradient for logistic regression with
regularization
J = LRCOSTFUNCTION(theta, X, y,lambda_val) computes the cost of using theta as the
parameter for regularized logistic regression and the gradient of the cost
w.r.t. to the parameters
theta
@type : numpy array of matrix size (d+1,) i.e. theta \in \mathbb{R}^{d+1}
@param : "weights"
X
@type : numpy array of matrix size (m,d), i.e. X \in \text{Mat}_{\mathbb{R}}( d,m)
@param : input data i.e. input training data i.e. training examples
y
@type : numpy array of matrix size (m,1), i.e. X \in \mathbb{R}^m
@param : test data i.e. test values i.e. test examples
RETURNS
=======
J
"""
# Initialize some useful values
m = len(y) # number of training examples
# You need to return the following variables correctly
J = 0
grad = np.zeros( len(theta))
# Compute the cost of a particular choice of theta.
# You should set J to the cost.
# Compute the partial derivatives and set grad to the partial
# derivatives of the cost w.r.t. each parameter in theta
## assume preprocessing is needed (in the case when using pandas for DataFrame, only column of 1 is needed)
input_X = np.hstack( (np.ones((m,1)), X))
z = np.dot( input_X, theta ) # matrix size or dim. of (m,) dim(z) = m x 0
predicted_vals = sigmoid( z ) # h_{\theta}
interpolation = - np.dot( y.T, np.log( predicted_vals) ) - np.dot( (1. -y).T, np.log( 1. - predicted_vals )) # matrix size or dim. of (1,), i.e. dim(interpolation) = 1x0
J = interpolation[0] /float(m) # scalar value
res = predicted_vals - y.flatten() # res for residual, matrix size or dim. of (m,) dim(res) = mx0
dJ = np.dot( input_X.T , res ) / float(m) # gradient of the mean cost; matrix size or dim. of (d+1,), i.e. dim(dJ) = (d+1)x0
# regularized term
theta1 = theta[1:] # "Note that you should not regularize the parameter $\theta_0$"
reg_term = np.inner( theta1.flatten(), theta1.flatten() ) # \sum_{j=1}^n \theta_j^2
reg_term = lambda_val * (1./(2. * m)) * reg_term
J += reg_term
# regularization term for gradient : grad_reg_term
grad_reg_term = np.zeros( theta.shape ) # matrix size or dim. of (d+1,) dim(grad_reg_term) = (d+1)x0
grad_reg_term[1:] = theta[1:]
grad_reg_term *= lambda_val / float(m)
grad = dJ + grad_reg_term # matrix size or dim. of (d+1,), i.e. dim(grad) = (d+1)x0
return J
In [75]:
def oneVsAll(X, y, num_labels, lambda_value):
"""
ONEVSALL trains multiple logistic regression classifiers and returns all
the classifiers in a matrix all_theta, where the i-th row of all_theta
corresponds to the classifier for label i
[all_theta] = ONEVSALL(X, y, num_labels, lambda_value) trains num_labels
logistic regression classifiers and returns each of these classifiers
in a matrix all_theta, where the i-th row of all_theta corresponds
to the classifier for label i
"""
# Some useful variables
m = X.shape[0]
n = X.shape[1]
# You need to return the following variables correctly
# all_theta = np.zeros( (num_labels, n+1) )
all_theta = []
# Set Initial theta
initial_theta = np.zeros( (n+1,1))
for c in range(1,num_labels+1):
# Use y == c to obtain a vector of 1's and 0's that tell you whether the ground truth is true/false for this class.
y_c = (y == c).astype('float32')
# scipy.optimize.fmin_cg - scipy.optimize.fmin_cg(f, x0, fprime=None,args=()), fprime a function that returns the gradient of f
theta_c = optimize.fmin_cg(lrCostFunction, initial_theta, args=(X,y_c,lambda_value) )
all_theta.append( theta_c)
return all_theta
In [79]:
all_theta_digits = oneVsAll( X,y,10, 0.1)
In [89]:
print(type(all_theta_digits))
print(len(all_theta_digits))
for i in range( len(all_theta_digits)):
print( all_theta_digits[i].shape )
print( all_theta_digits[i][:3], all_theta_digits[i][-3:] )
In [95]:
print( np.array( all_theta_digits).shape )
print( X.shape )
input_X = np.hstack( (np.ones((X.shape[0], 1)), X))
z = np.dot( input_X , np.array( all_theta_digits).T)
In [97]:
h_theta = sigmoid(z)
h_theta.shape
Out[97]:
In [100]:
predicted_cls = np.argmax( h_theta, axis=1 )
In [101]:
predicted_cls.shape
Out[101]:
In [93]:
print( y.shape )
In [ ]:
def predictOneVsAll( all_theta, X):
"""
PREDICT Predict the label for a trained one-vs-all classifier. The labels
are in the range 1..K, where K = all_theta.shape[0], i.e. y \in \lbrace 1 \dots K \rbrace
p = PREDICTONEVSALL(all_theta, X) will return a vector of predictions
for each example in the matrix X. Note that X contains the examples in
rows. all_theta is a matrix where the i-th row is a trained logistic
regression theta vector for the i-th class.
RETURNS:
@type list of 2: numpy array of dims. (m,)
"""
m = X.shape[0] # total number of training examples
num_labels = all_theta.shape[0] # total number of "classes", i.e. y \in \lbrace 1 \dots K \rbrace and num_labels=K
# Add ones to the X data matrix, to include an "intercept"
input_X = np.hstack( (np.ones( (m,1) ), X))
z = np.dot( input_X, np.array( all_theta).T) # transpose so dims. (m,n+1) x (n+1,K) -> (m,K)
h_theta = sigmoid(z)
predicted_cls = np.argmax( h_theta, axis=1)
predicted_cls_p1 = predicted_cls + 1 # just because of how Python/numpy numbers from 0, as opposed to from 1
return predicted_cls_p1
In [107]:
sum( [predicted_cls[j] == y[j] for j in range(len(y))] )
print( pd.DataFrame( predicted_cls).describe() )
print( pd.DataFrame( y).describe() )
predicted_cls_p1 = predicted_cls + 1
In [114]:
print( predicted_cls[200:240] )
print( predicted_cls_p1[200:240] )
print( y[200:240])
sum( [predicted_cls_p1[j]==y[j] for j in range(len(y))] )[0]/ float( len(y) )
Out[114]:
In [80]:
%timeit all_theta_digits = oneVsAll( X,y,10, 0.1)
In [78]:
print(y )
print( (y == 3).astype('float32') )
optimize.fmin_cg(lrCostFunction, np.zeros( X.shape[1]+1) ,args=(X, (y==3).astype('float32'), .1))
Out[78]:
In [46]:
np.asarray((0,0)).shape
Out[46]:
In [81]:
ex3weightsdata = scipy.io.loadmat('./coursera_Ng/machine-learning-ex3/ex3/ex3weights.mat')
cf. 2.2 Feedforward Propagation and Prediction, ex3.pdf
In [115]:
Theta1 = ex3weightsdata['Theta1']
Theta2 = ex3weightsdata['Theta2']
print( Theta1.shape )
print( Theta2.shape )
In [ ]:
a = []
a.append( )
In [125]:
def predict(Theta1, Theta2, X):
"""
PREDICT predict the label of an input given a trained neural network
p = PREDICT(Theta1, Theta2, X) outputs the predicted label of X given the
trained weights of a neural network (Theta1, Theta2)
PARAMETERS
==========
Theta1
@type numpy array of size dims. s_2 x (d+1), where s_2 is the number of "units" of "hidden layer", layer 2, with d = number of features
@param Theta1
Theta2
@type numpy array of size dims. M x (s_2 + 1), M=number of output classes
@param Theta2
"""
# Useful values
m = X.shape[0] # total number of training examples, m
num_labels = Theta2.shape[0] # number of classes, i.e. output y \in \lbrace 1, \dots, num_labels \rbrace
## assume preprocessing is needed (in the case when using pandas for DataFrame, only column of 1 is needed)
input_X = np.hstack( (np.ones((m,1)), X)) # size dims. (m,d+1) i.e. m x (d+1)
a_lst = [] # "activation"
a = input_X # a_0^{(1)} already added in input_X step as a_0^{(1)}=1, # a of size dims. (m,d+1)i.e. m x (d+1)
a_lst.append(a)
z = np.dot( Theta1, a.T ) # size dims. s_2 x m
a = sigmoid(z)
a = np.vstack( ( np.ones((1,m)), a)) # size dims. (s_2 +1) x m i.e. ((s_2+1),m)
a_lst.append(a)
z = np.dot( Theta2, a)
a = sigmoid(z)
predicted_vals = np.argmax( a, axis =0)
predicted_vals_p1 = predicted_vals + 1 # add 1 to "scale" outputs to predict y, since Python/numpy counts from 0
return predicted_vals_p1
In [128]:
predict_nn = predict(Theta1,Theta2, X)
print( predict_nn.shape)
pd.DataFrame(predict_nn).describe()
Out[128]:
In [129]:
sum( [predict_nn[j]==y[j] for j in range(len(y))] )[0]/ float( len(y) )
Out[129]:
In [122]:
m = X.shape[0]; print(m)
num_labels = Theta2.shape[0]; print( num_labels )
input_X = np.hstack( (np.ones( (m,1)), X))
a_lst = []
a = input_X
a_lst.append(a)
z=np.dot( Theta1, a.T)
a=sigmoid(z)
a=np.vstack(( np.ones((1,m)),a))
a_lst.append(a)
z = np.dot( Theta2, a)
a=sigmoid(z) ; print( a.shape )
In [124]:
print( np.argmax( a,axis=0).shape )
pd.DataFrame( np.argmax(a,axis=0) ).describe()
Out[124]:
cf. nnCostFunction.m
In [137]:
## ==================== Part 1: Loading and Visualizing Data ==============================
# We start the exercise by first loading and visualizing the dataset.
# You will be working with a dataset that contains handwritten digits.
#
# Load Training Data
print("Loading and Visualizing Data ... \n")
ex4data1 = scipy.io.loadmat('./coursera_Ng/machine-learning-ex4/ex4/ex4data1.mat')
In [142]:
ex4data1['X'] == X
ex4data1['y'] == y
Out[142]:
In [152]:
ex4data1['y'].shape
Out[152]:
ex4.m
In [136]:
# Load the weights into variables Theta1 and Theta2
In [130]:
ex4weightsdata = scipy.io.loadmat('./coursera_Ng/machine-learning-ex4/ex4/ex4weights.mat')
In [135]:
Theta1 = ex4weightsdata['Theta1']
Theta2 = ex4weightsdata['Theta2']
In [143]:
print( Theta1.shape ) # size dims. s_2 x ( d+1)
print( Theta2.shape ) # size dims. M x (s_2 + 1)
In [144]:
## ==================== Part 3: Compute Cost (Feedforward) ===================================
# To the neural network, you should first start by implementing the
# feedforward part of the neural network that returns the cost only. You
# should complete the code in nnCostFunction.m to return cost. After
# implementing the feedforward to compute the cost, you can verify that
# your implementation is correct by verifying that you get the same cost
# as us for the fixed debugging parameters.
print("\n Feedforward Using Neural Network ... \n")
# Weight regularization parameter (we set this to 0 here).
lambda_val = 0
input_layer_size = 400 # 20x20 Input Images of digits
hidden_layer_size = 25; # 25 hidden units
num_labels = 10
cf. 1.3. Feedforward and cost function ex4.pdf
In [211]:
def nnCostFunction( Theta1, Theta2, input_layer_size, hidden_layer_size, num_labels, X,y, lambda_val):
"""
NNCOSTFUNCTION Implements the neural network cost function for a two layer
neural network which performs classification
[J grad] = NNCOSTFUNCTION(Theta1, Theta2, input_layer_size, hidden_layer_size, num_labels,X,y, lambda_val)
computes the cost and gradient of the neural network. The parameters for the neural network are "unrolled"
into the vector nn_params and need to be converted back into the weight matrices.
"""
# Useful values
m = X.shape[0] # total number of training examples, m
#num_labels = Theta2.shape[0] # number of classes, i.e. output y \in \lbrace 1, \dots, num_labels \rbrace
d = input_layer_size
K = num_labels
## assume preprocessing is needed (in the case when using pandas for DataFrame, only column of 1 is needed)
input_X = np.hstack( (np.ones((m,1)), X)) # size dims. (m,d+1) i.e. m x (d+1)
a_lst = [] # "activation"
a = input_X # a_0^{(1)} already added in input_X step as a_0^{(1)}=1, # a of size dims. (m,d+1)i.e. m x (d+1)
a_lst.append(a)
z = np.dot( Theta1, a.T ) # size dims. s_2 x m
a = sigmoid(z)
a = np.vstack( ( np.ones((1,m)), a)) # size dims. (s_2 +1) x m i.e. ((s_2+1),m)
a_lst.append(a)
z = np.dot( Theta2, a)
a = sigmoid(z) # size dims. Kxm
# recall that whereas the original labels (in the variable y) were 1, 2, ..., 10, for the purpose of training a
# neural network, we need to recode the labels as vectors containing only values 0 or 1
y_prob = [np.zeros(K) for row in y] # list of 5000 numpy arrays of size dims. (10,)
for i in range( m):
y_prob[i][ y[i]-1] = 1
y_prob = np.array(y_prob) # size dims. (m,K)
# cost function for the neural network (without regularization)
J_theta = (-np.dot( np.log( a) ,y_prob).trace() - np.dot( np.log(1-a),(1-y_prob)).trace() )/m
# cost function with regularization
reg_term = lambda_val / (2. * m) * ( np.sum(Theta1[:,1:] * Theta1[:,1:]) + np.sum(Theta2[:,1:] * Theta2[:,1:]) )
J_theta += reg_term
return a, J_theta
cf. 1.4 Regularized cost function
The cost function for neural networks with regularization is given by
$$ J(\theta) = \frac{1}{m} \sum_{i=1}^m \sum_{k=1}^K \left[ -y_k^{(i)} \log{ ( ( h_{\theta}(x^{(i)}) )_k ) } - (1-y^{(i)}_k ) \log{ (1- (h_{\theta}(x^{(i)}))_k ) } \right] + \frac{\lambda}{2m} \left[ \sum_{j=1}^{s_2} \sum_{k=1}^{d} ( \Theta_{j,k}^{(1)} )^2 + \sum_{j=1}^{K} \sum_{k=1}^{s_2} ( \Theta_{j,k}^{(2)} )^2 \right] $$
In [214]:
a_test, J_theta_test = nnCostFunction(Theta1,Theta2,input_layer_size,hidden_layer_size,num_labels,X,y,1.)
print( a_test.shape)
J_theta_test
Out[214]:
In [ ]:
In [151]:
y.shape
Out[151]:
In [154]:
#pd.DataFrame( a_test ).describe()
pd.DataFrame( y).describe()
Out[154]:
In [164]:
# recall that whereas the original labels (in the variable y) were 1, 2, ..., 10, for the purpose of training a
# neural network, we need to recode the labels as vectors containing only values 0 or 1
y_prob = [ np.zeros( Theta2.shape[0] ) for row in y];
In [163]:
print( len( [0 for row in y] ) )
[0 for row in y][:10]
y_prob[2][ 3]
Out[163]:
In [165]:
for i in range( y.shape[0]):
y_prob[i][ y[i]-1 ] = 1
In [170]:
y_prob[800]
Out[170]:
In [173]:
print( type(y_prob) )
print( len(y_prob))
In [174]:
y_prob = np.array( y_prob)
print(y_prob.shape)
In [176]:
np.dot( a_test, y_prob).shape
Out[176]:
In [177]:
np.dot( a_test,y_prob).trace()
Out[177]:
In [187]:
# interpolation = - np.dot( y.T, np.log( predicted_vals) ) - np.dot( (1. -y).T, np.log( 1. - predicted_vals )) # matrix size or dim. of (1,), i.e. dim(interpolation) = 1x0
J_theta_test
Out[187]:
In [189]:
y_prob_test = [np.zeros(10) for row in y]
y_prob_test = np.array(y_prob_test)
y_prob_test.shape
Out[189]:
In [195]:
print( Theta1.shape )
Theta1[:,-1]
Out[195]:
In [197]:
np.sum(Theta1[:,1:] * Theta1[:,1:])
Out[197]:
In [215]:
def sigmoidGradient(z):
"""
SIGMOIDGRADIENT returns the gradient of the sigmoid function
evaluated at z
g = SIGMOIDGRADIENT(z) computes the gradient of the sigmoid function
evaluated at z. This should work regardless if z is a matrix or a vector.
In particular, if z is a vector or matrix, you should return
the gradient for each element.
"""
g = sigmoid(z) * (1 - sigmoid(z))
return g
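A quick sanity check (my own, not from the exercise script): the sigmoid gradient should peak at 0.25 at $z = 0$ and fall off symmetrically.
In [ ]:
print( sigmoidGradient(0.) )                        # 0.25
print( sigmoidGradient( np.array([-1., 0., 1.]) ) ) # [ 0.19661193  0.25  0.19661193]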
In [216]:
# Randomly initialize the weights to small values
epsilon_init = 0.12
L_out = 400
L_in = 25
W = np.random.uniform( low = -epsilon_init, high= epsilon_init, size=( L_in, L_out) )
In [217]:
W
Out[217]:
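Wrapped up as a helper in the spirit of ex4's randInitializeWeights (a sketch; the +1 accounts for the bias column, and the layer sizes below are the ex4 architecture):
In [ ]:
def randInitializeWeights(L_in, L_out, epsilon_init=0.12):
    """
    Randomly initialize the weights of a layer with L_in incoming connections and
    L_out outgoing connections, breaking symmetry while keeping the values small.
    Returns a numpy array of size dims. L_out x (1 + L_in); the extra column is for the bias.
    """
    return np.random.uniform( low=-epsilon_init, high=epsilon_init, size=(L_out, 1 + L_in) )

initial_Theta1 = randInitializeWeights(400, 25)  # 25 x 401
initial_Theta2 = randInitializeWeights(25, 10)   # 10 x 26
print( initial_Theta1.shape, initial_Theta2.shape )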
In [247]:
print( X[3].shape )
print( np.vstack(X[3]).shape )
a = np.vstack( (np.ones( ( 1, np.vstack(X[3]).shape[1]) ) , np.vstack(X[3]) ) )
print( a.shape)
z = np.dot( Theta1,a) # size dims. s_2 x 1
a = sigmoid(z)
a = np.vstack( ( np.ones((1,np.vstack(X[3]).shape[1])), a)) # size dims. (s_2 +1) x m i.e. ((s_2+1),m) with m =1
print(a.shape)
z = np.dot( Theta2, a) # size dim Kx1
a = sigmoid(z) # size dim Kx1
print(a.shape)
"""
a_lst = [] # "activation"
a = input_X # a_0^{(1)} already added in input_X step as a_0^{(1)}=1, # a of size dims. (m,d+1)i.e. m x (d+1)
a_lst.append(a)
z = np.dot( Theta1, a.T ) # size dims. s_2 x m
a = sigmoid(z)
a = np.vstack( ( np.ones((1,m)), a)) # size dims. (s_2 +1) x m i.e. ((s_2+1),m)
a_lst.append(a)
z = np.dot( Theta2, a)
a = sigmoid(z) # size dims. Kxm
"""
Out[247]:
In [234]:
X[3].reshape( len(X[3]), 1).shape
Out[234]:
In [225]:
np.ones((5,2)) # the shape must be passed as a tuple
In [ ]:
In [251]:
a = np.arange(60.).reshape(3,4,5)
b = np.arange(24.).reshape(4,3,2)
c = np.tensordot(a,b,axes=([1,0],[0,1]))
d = np.tensordot(a,b,axes=([0,1],[1,0]))
print(c.shape)
print(c)
print(d.shape)
print(d)
c==d
Out[251]:
In [315]:
# Feed forward
m = X.shape[0]
d = X.shape[1]
s_2 = Theta1.shape[0]
K = Theta2.shape[0]
print( X.shape) # m x d
print(Theta1.shape) # s_2 x (d+1)
print(Theta2.shape) # K x (s_2+1)
a = np.vstack( ( np.ones( (1,m)), X.T)) # (d+1)xm
z_lst = []
z = np.dot( Theta1, a) # s_2 x m
z_lst.append(z)
a_lst = []
a_lst.append(a)
a_l = np.vstack( (np.ones( (1,m)), sigmoid(z)) ) # (s_2+1)xm
#a_f = np.stack( (a,a_l),axis=0) # ValueError: all input arrays must have the same shape
a_lst.append(a_l)
z = np.dot( Theta2, a_l) # K x m
z_lst.append(z)
a_L = sigmoid(z) # K x m
a_lst.append( a_L )
In [317]:
def feedforward(X, Thetas, L ):
"""
PARAMETERS (INPUTS)
===================
@type X : numpy array of size dims. m x d
@param X : input data
@type Thetas : list of numpy arrays
@param Thetas : list of numpy arrays that are Theta's or weights for each layer l;
note l=1,2,..L-1,but Python lists start counting from 0,1,...L-2
@type L : (positive) integer
@param L : number of layers, e.g. input layer, 1 hidden layer, and output layer is L = 3
"""
m, d = X.shape
# s_l = [Theta.shape[0] for Theta in Thetas ]
# s_l.insert(0,d)
a_lst = []
z_lst = []
a = np.vstack( (np.ones( (1,m)), X.T)) # (d+1)xm
a_lst.append(a)
for l in range(L-2):
z = np.dot(Thetas[l] , a) # s_2 x m
z_lst.append(z)
g = sigmoid(z)
a_l = np.vstack( (np.ones((1,m)), g)) # (s_{l+1}+1)x m
a_lst.append(a_l)
z = np.dot( Thetas[L-2], a_l) # K x m
z_lst.append(z)
a_L = sigmoid(z) # K x m
a_lst.append( a_L)
return z_lst, a_lst
In [318]:
z_lst_test, a_lst_test = feedforward(X, [Theta1,Theta2], 3)
In [322]:
print( len(z_lst_test) , len(a_lst_test) )
print ( [z.shape for z in z_lst], [a.shape for a in a_lst ])
In [330]:
print( a_lst[-1].shape )
print( range(3-2,-1,-1) )
a_lst[-3]
Out[330]:
In [302]:
# Backpropagation
delta_lst = []
y_prob = [np.zeros(K) for row in y] # list of 5000 numpy arrays of size dims. (10,)
for i in range( m):
y_prob[i][ y[i]-1] = 1
y_prob = np.array(y_prob).T # size dims. (K,m)
print( y_prob.shape )
print( a_L.shape )
delta_L = a_L - y_prob # size dims. (K,m)
delta_lst = []
delta_lst.insert(0, delta_L)
dg = a_lst[-2] * ( 1. - a_lst[-2]) # size dims. ((s_2+1)xm)
print(dg.shape)
delta_l = np.dot( Theta2.T, delta_L) * dg
print( delta_l.shape )
delta_l = delta_l[1:,:]
delta_lst.insert(0, delta_l )
dg = a_lst[-3] * (1. - a_lst[-3]) # (s_1+1) x m
print( dg.shape) # (s_1+1) x m
delta_l = np.dot( Theta1.T, delta_l) * dg
delta_l = delta_l[1:,:]
delta_lst.insert(0, delta_l)
In [339]:
def backprop(y,z_lst, a_lst, Thetas, L):
"""
Backpropagation
@type L : (positive) integer
@param L : number of layers, e.g. input layer, 1 hidden layer, and output layer is L = 3
"""
m = y.shape[0] # number of training examples
K = a_lst[-1].shape[0] # number of classes
delta_lst = []
y_prob = [np.zeros(K) for row in y] # list of m numpy arrays of size dims. (K,)
for i in range(m):
y_prob[i][y[i]-1] = 1
y_prob = np.array(y_prob).T # size dims. (K,m)
delta_l = a_lst[-1] - y_prob # size dims (K,m); a_lst[-1] is the output-layer activation
delta_lst = []
delta_lst.insert(0, delta_l)
for l in range(L-2,-1,-1): # L-2, L-3,...1,0, corresponding to l=L-1,L-2,...2,1 (just how Python indexes from 0)
dg = a_lst[l] * ( 1- a_lst[l])
delta_l = np.dot( Thetas[l].T, delta_l) * dg
delta_l = delta_l[1:,:]
delta_lst.insert(0,delta_l)
D_lst = []
for l in range(len(Thetas)): # 0,1,..L-2, corresponding to l = 1,2,...L-1
D_lst.append( np.tensordot( a_lst[l], delta_lst[l+1].T, axes=([1],[0])).T/m )
# regularization terms for grad(J)
for l in range(len(Thetas)): # 0,1,...L-2, corresponding to l = 1,2,...L-1
Theta_reg = np.zeros( Thetas[l].shape )
Theta_reg[:,1:] = Thetas[l][:,1:]
Theta_reg = lambda_val * Theta_reg / m # lambda_val is taken from the enclosing scope (set above)
D_lst[l] += Theta_reg
return delta_lst, D_lst
In [340]:
delta_lst_test, D_lst_test = backprop(y,z_lst_test,a_lst_test, [Theta1,Theta2], 3)
In [341]:
print( len(delta_lst_test) )
for delta in delta_lst_test: print( delta.shape )
for Dl in D_lst_test: print(Dl.shape)
In [304]:
print( delta_lst[0].shape)
len(a_lst)
print( delta_l.shape)
print( delta_l[1:,:].shape )
for l in delta_lst: print( l.shape )
for a in a_lst: print(a.shape)
In [313]:
print( a_lst[0].shape )
print( delta_lst[1].shape )
print( np.tensordot( a_lst[0], delta_lst[1].T, axes=([1],[0])).shape )
print( np.tensordot( a_lst[1], delta_lst[2].T, axes=([1],[0])).shape )
D1 = np.tensordot( a_lst[0], delta_lst[1].T, axes=([1],[0])).T/m
D2 = np.tensordot( a_lst[1], delta_lst[2].T, axes=([1],[0])).T/m
In [257]:
np.empty(0) == None # np.empty requires a shape; comparing to None gives an (empty) boolean array
In [316]:
range(2)
Out[316]:
In [345]:
Theta1.shape
np.zeros( Theta1.shape)[:,1:].shape
Out[345]:
In [8]:
ex4data1 = scipy.io.loadmat('./coursera_Ng/machine-learning-ex4/ex4/ex4data1.mat')
Theta_testvals = scipy.io.loadmat('./coursera_Ng/machine-learning-ex4/ex4/ex4weights.mat')
In [9]:
Theta1_testval = Theta_testvals['Theta1'][:,1:]
b1_testval = Theta_testvals['Theta1'][:,0:1]
Theta2_testval = Theta_testvals['Theta2'][:,1:]
b2_testval = Theta_testvals['Theta2'][:,0:1]
In [12]:
print( np.dot( Theta1_testval, ex4data1['X'].T).shape )
np.tile( b1_testval, (1,5000)).shape
Out[12]:
In [18]:
z2 = np.dot( Theta1_testval, ex4data1['X'].T) + np.tile( b1_testval, (1,5000))
In [14]:
z2.shape
Out[14]:
In [20]:
z3 = np.dot( Theta2_testval, sigmoid(z2)) + np.tile( b2_testval, (1,5000))
In [21]:
print(z3.shape)
a3 = sigmoid(z3)
In [22]:
ht = a3
yt = ex4data1['y']
In [23]:
yt.shape
Out[23]:
In [26]:
m = ex4data1['y'].shape[0]
y_prob = [np.zeros(10) for row in ex4data1['y']] # list of 5000 numpy arrays of size dims. (10,)
for i in range( m):
y_prob[i][ ex4data1['y'][i]-1] = 1
y_prob = np.array(y_prob).T # size dims. (K,m)
print(y_prob.shape)
In [30]:
J = - y_prob * np.log( a3) - (1-y_prob) * np.log( 1- a3)
print( J.shape )
In [31]:
np.sum(J,axis=0).shape
Out[31]:
In [32]:
np.mean( np.sum(J,axis=0))
Out[32]:
In [33]:
z2
Out[33]:
In [36]:
sigmoid(z2)
Out[36]:
In [ ]: