In [1]:
# Numpy handles matrix multiplication, see http://www.numpy.org/
import numpy as np
# PyPlot is a matlab like plotting framework, see https://matplotlib.org/api/pyplot_api.html
import matplotlib.pyplot as plt
# This line makes it easier to plot PyPlot graphs in Jupyter Notebooks
%matplotlib inline
In [2]:
import sklearn
import sklearn.datasets
import matplotlib
# Slightly larger plot rendering
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)
Let's jump straight into the code. In this chapter, we will create a Python class for our logistic regressor. If you are unfamiliar with classes in Python, check out Jeff Knupp's blog post for a nice overview. Read the code below carefully; we will deconstruct the different functions afterwards.
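As a tiny refresher on the pattern we will use (this example is purely illustrative and not part of the regressor): a class bundles data together with the functions that operate on that data. The __init__ method stores values on self, and other methods can read and update them later.
# Purely illustrative example of a Python class, not part of the regressor
class Counter:
    # __init__ runs when an instance is created and stores the start value on self
    def __init__(self, start=0):
        self.count = start
    # Methods can read and modify the attributes stored on self
    def increment(self):
        self.count += 1
        return self.count

# Usage: create an instance and call its method
counter = Counter(start=10)
print(counter.increment())  # prints 11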
In [3]:
class LogisticRegressor:
# Here we are just setting up some placeholder variables
# This is the dimensionality of our input, that is how many features our input has
input_dim = 0
# This is the learning rate alpha
learning_rate = 0.1
# We will store the parameters of our model in a dictionary
model = {}
# The values calculated in the forward propagation will be stored in this dictionary
cache = {}
    # The gradients that we calculate during back propagation will be stored in this dictionary
    grads = {}
# Init function of the class
def __init__(self,input_dim, learning_rate):
'''
Assigns the given hyper parameters and initializes the initial parameters.
'''
# Assign input dimensionality
self.input_dim = input_dim
# Assign learning rate
self.learning_rate = learning_rate
# Trigger parameter setup
self.init_parameters()
# Parameter setup function
def init_parameters(self):
'''
Initializes weights with random number between -1 and 1
Initializes bias with 0
Assigns weights and parameters to model
'''
# Randomly init weights
W1 = 2*np.random.random((self.input_dim,1)) - 1
# Set bias to 0
b1 = 0
# Assign to model
self.model = {'W1':W1,'b1':b1}
return
# Sigmoid function
def sigmoid(self,x):
'''
Calculates the sigmoid activation of a given input x
See: https://en.wikipedia.org/wiki/Sigmoid_function
'''
return 1/(1+np.exp(-x))
#Log Loss function
def log_loss(self,y,y_hat):
'''
Calculates the logistic loss between a prediction y_hat and the labels y
See: http://wiki.fast.ai/index.php/Log_Loss
        We need to clip predictions that get too close to zero, because taking the log of 0 is undefined (it goes to negative infinity).
        Very small numbers can also underflow, meaning the computer replaces them with 0.
        Therefore, we clip the values to a small minimum value.
'''
minval = 0.000000000001
m = y.shape[0]
l = -1/m * np.sum(y * np.log(y_hat.clip(min=minval)) + (1-y) * np.log((1-y_hat).clip(min=minval)))
return l
# Derivative of log loss function
def log_loss_derivative(self,y,y_hat):
'''
Calculates the gradient (derivative) of the log loss between point y and y_hat
See: https://stats.stackexchange.com/questions/219241/gradient-for-logistic-loss-function
'''
return (y_hat-y)
# Forward prop (forward pass) function
def forward_propagation(self,A0):
'''
Forward propagates through the model, stores results in cache.
See: https://stats.stackexchange.com/questions/147954/neural-network-forward-propagation
A0 is the activation at layer zero, it is the same as X
'''
# Load parameters from model
W1, b1 = self.model['W1'],self.model['b1']
# Do the linear step
z1 = A0.dot(W1) + b1
#Pass the linear step through the activation function
A1 = self.sigmoid(z1)
# Store results in cache
        self.cache = {'A0':A0,'z1':z1,'A1':A1}
return
# Backprop function
def backward_propagation(self,y):
'''
Backward propagates through the model to calculate gradients.
Stores gradients in grads dictionary.
See: https://en.wikipedia.org/wiki/Backpropagation
'''
# Load results from forward pass
A0, z1, A1 = self.cache['A0'],self.cache['z1'], self.cache['A1']
# Load model parameters
W1, b1 = self.model['W1'], self.model['b1']
# Read m, the number of examples
m = A0.shape[0]
# Calculate the gradient of the loss function
dz1 = self.log_loss_derivative(y=y,y_hat=A1)
# Calculate the derivative of the loss with respect to the weights W1
dW1 = 1/m*(A0.T).dot(dz1)
# Calculate the derivative of the loss with respect to the bias b1
db1 = 1/m*np.sum(dz1, axis=0, keepdims=True)
#Make sure the weight derivative has the same shape as the weights
assert(dW1.shape == W1.shape)
# Store gradients in gradient dictionary
self.grads = {'dW1':dW1,'db1':db1}
return
# Parameter update
def update_parameters(self):
'''
        Updates parameters according to the gradient descent algorithm
See: https://en.wikipedia.org/wiki/Gradient_descent
'''
# Load model parameters
W1, b1 = self.model['W1'],self.model['b1']
# Load gradients
dW1, db1 = self.grads['dW1'], self.grads['db1']
# Update weights
W1 -= self.learning_rate * dW1
# Update bias
b1 -= self.learning_rate * db1
# Store new parameters in model dictionary
self.model = {'W1':W1,'b1':b1}
return
# Prediction function
def predict(self,X):
'''
Predicts y_hat as 1 or 0 for a given input X
'''
# Do forward pass
self.forward_propagation(X)
# Get output of regressor
regressor_output = self.cache['A1']
# Turn values to either 1 or 0
regressor_output[regressor_output > 0.5] = 1
        regressor_output[regressor_output <= 0.5] = 0
# Return output
return regressor_output
# Train function
def train(self,X,y, epochs):
'''
Trains the regressor on a given training set X, y for the specified number of epochs.
'''
# Set up array to store losses
losses = []
# Loop through epochs
for i in range(epochs):
# Forward pass
self.forward_propagation(X)
# Calculate loss
loss = self.log_loss(y,self.cache['A1'])
# Store loss
losses.append(loss)
# Print loss every 10th iteration
if (i%10 == 0):
print('Epoch:',i,' Loss:', loss)
# Do the backward propagation
self.backward_propagation(y)
# Update parameters
self.update_parameters()
# Return losses for analysis
return losses
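One detail worth highlighting before we use the class is the clipping inside log_loss. If a prediction underflows to exactly 0 (or 1 minus the prediction does), np.log would return negative infinity and the loss would blow up. The values below are just illustrative:
# Illustration of why log_loss clips its inputs; the numbers are made up
y_hat = np.array([[0.0],[0.9]])
# Without clipping, NumPy warns about a divide by zero and returns -inf for the first entry
print(np.log(y_hat))
# With clipping to a small minimum value, the result stays finite
print(np.log(y_hat.clip(min=0.000000000001)))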
In [4]:
#Seed the random function to ensure that we always get the same result
np.random.seed(1)
#Variable definition
#define X
X = np.array([[0,1,0],
[1,0,0],
[1,1,1],
[0,1,1]])
#define y
y = np.array([[0,1,1,0]]).T
# Define instance of class
regressor = LogisticRegressor(input_dim=3,learning_rate=1)
In [5]:
# Train classifier
losses = regressor.train(X,y,epochs=100)
In [6]:
# Plot the losses for analyis
plt.plot(losses)
Out[6]:
As you can see, our classifier still works! We have improved modularity and created a classifier that is easier to debug. Let's have a look at its overall structure. Notice that we make use of three dictionaries: model, which holds the parameters (the weights W1 and the bias b1), cache, which holds the intermediate values of the forward pass, and grads, which holds the gradients computed during backpropagation.
These dictionaries store all the information required to run the training process: forward propagation, loss calculation, backward propagation, and the parameter update.
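To make this cycle concrete, here is what one pass through it looks like when we call the regressor's methods by hand. This is a minimal sketch; the tiny dataset and the hyperparameters are just illustrative.
# One training cycle spelled out step by step; X_demo and y_demo are illustrative
X_demo = np.array([[0,1,0],
                   [1,0,0],
                   [1,1,1]])
y_demo = np.array([[0,1,1]]).T
demo = LogisticRegressor(input_dim=3, learning_rate=0.1)
# 1. Forward propagation: compute activations and store them in the cache
demo.forward_propagation(X_demo)
# 2. Loss calculation: compare the prediction A1 with the labels
loss = demo.log_loss(y_demo, demo.cache['A1'])
# 3. Backward propagation: compute gradients and store them in grads
demo.backward_propagation(y_demo)
# 4. Parameter update: apply gradient descent to the model parameters
demo.update_parameters()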
We run this process many times over. One full cycle through the full training set is called an epoch. How often we have to go through this process can vary, depending on the complexity of the problem we want to solve and the learning rate $\alpha$. You can already see $\alpha$ being used in the code above, so let's give it a closer look.
The learning rate is a lot like the throttle setting of our learning algorithm. It is the multiplier applied to the update that a parameter experiences:
$$a := a - \alpha \cdot \frac{dL}{da}$$
A high learning rate means that the parameters get updated by larger amounts. This can lead to faster training, but it can also mean that we might jump over a minimum.
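To see this effect in isolation, here is a tiny, hypothetical example: plain gradient descent on the one-dimensional function $f(a) = a^2$, whose gradient is $2a$ and whose minimum sits at $a = 0$. The learning rates below are just illustrative.
# Hypothetical illustration: gradient descent on f(a) = a**2 with different learning rates
def gradient_descent_1d(alpha, a=2.0, steps=10):
    '''Runs gradient descent on f(a) = a**2 and returns the trajectory of a.'''
    trajectory = [a]
    for _ in range(steps):
        # Update rule: a := a - alpha * df/da, with df/da = 2*a
        a = a - alpha * 2 * a
        trajectory.append(a)
    return trajectory

print(gradient_descent_1d(alpha=0.1))   # small steps: steady but slow approach to 0
print(gradient_descent_1d(alpha=0.4))   # bigger steps: reaches the minimum much faster
print(gradient_descent_1d(alpha=1.05))  # too big: overshoots 0 and the values keep growing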
As you can see, with a bigger learning rate we approach the minimum much faster. But if the steps are too big, we skip right over the minimum, and this can even lead to our loss going up over time.
Choosing the right learning rate is therefore crucial. Too small, and our learning algorithm might be too slow. Too high, and it might fail to converge at a minimum. So in the next step, we will have a look at how to tune this hyperparameter.
So far we have worked with a really simple dataset in which one input feature is perfectly correlated with the labels $y$. Now we will look at a slightly harder problem.
We generate a dataset of two point clouds and we want to train our regressor on separating them. The data generation is done with sklearn's dataset generator.
In [7]:
# Generate a dataset and plot it
np.random.seed(0)
X, y = sklearn.datasets.make_blobs(n_samples=200,centers=2)
y = y.reshape(200,1)
plt.scatter(X[:,0], X[:,1], s=40, c=y.flatten(), cmap=plt.cm.Spectral)
Out[7]:
Looking at the data, we see that it is possible to separate the two clouds quite well, but there is a lot of noise, so we cannot hope to achieve zero loss. We can, however, get close to it. Let's set up a regressor. Here we will use a learning rate of 10, which is quite high.
In [8]:
# Define instance of class
# Learning rate = 10, which is quite high
regressor = LogisticRegressor(input_dim=2,learning_rate=10)
In [9]:
# Train classifier
losses = regressor.train(X,y,epochs=100)
You will probably even get a warning mentioning an overflow, and it does not look like the regressor converged smoothly. This was a bumpy ride.
In [10]:
plt.plot(losses)
Out[10]:
As you can see, the loss first went up quite significantly before coming down, and at multiple points it moves up again. This is a clear sign that the learning rate is too large, so let's try a lower one.
In [11]:
# Define instance of class
# Learning rate = 0.05
regressor = LogisticRegressor(input_dim=2,learning_rate=0.05)
In [12]:
# Train classifier
losses = regressor.train(X,y,epochs=100)
In [13]:
plt.plot(losses)
Out[13]:
This looks a bit smoother already, and you can see that the error is nearly ten times lower in the end. Let's try an even lower learning rate to see where we can take this.
In [14]:
# Define instance of class
# Learning rate = 0.0005
regressor = LogisticRegressor(input_dim=2,learning_rate=0.0005)
In [15]:
# Train classifier
losses = regressor.train(X,y,epochs=100)
In [16]:
plt.plot(losses)
Out[16]:
This is a very smooth gradient descent, but also a very slow one. In the end, the loss is more than twice as high as before. If we let this run for many more epochs, we could probably achieve a very good model, but at a large computational expense.
A good learning rate converges fast and leads to a low loss. But there is no silver-bullet learning rate that always works; it usually depends on your project. Tuning the learning rate is as much art as it is science, and only repeated experimentation can lead you to a good result. Experience shows, however, that a good learning rate is often around 0.1, even though it can well be different for other projects. To practice tuning the learning rate, play around with the example below and see whether you can find one that converges fast and at a low loss.
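One way to structure this experimentation is a simple sweep that trains a fresh regressor once per candidate learning rate and compares the loss curves. This is just a sketch; the candidate values are an arbitrary starting grid.
# Sketch of a learning rate sweep on the blob dataset defined above.
# The candidate values are illustrative; adjust them to explore further.
for candidate_rate in [0.001, 0.01, 0.1, 1]:
    # Reset the seed so every run starts from the same initial weights
    np.random.seed(0)
    sweep_regressor = LogisticRegressor(input_dim=2, learning_rate=candidate_rate)
    sweep_losses = sweep_regressor.train(X, y, epochs=100)
    plt.plot(sweep_losses, label='learning rate ' + str(candidate_rate))
plt.legend()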
In [17]:
# Define instance of class
# Tweak learning rate here
regressor = LogisticRegressor(input_dim=2,learning_rate=1)
In [18]:
# Train classifier
losses = regressor.train(X,y,epochs=100)
In [19]:
plt.plot(losses)
Out[19]:
In [20]:
# Helper function to plot a decision boundary.
# If you don't fully understand this function, don't worry; it just generates the boundary plot.
def plot_decision_boundary(pred_func):
# Set min and max values and give it some padding
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = 0.01
# Generate a grid of points with distance h between them
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Plot the contour and training examples
plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap=plt.cm.Spectral)
To plot the boundary, we train a new regressor first.
In [21]:
# Define instance of class
# Learning rate = 0.05
regressor = LogisticRegressor(input_dim=2,learning_rate=0.05)
# Train classifier
losses = regressor.train(X,y,epochs=100)
And then we plot the boundary. Again, do not worry if you do not understand exactly what is going on here, as it is not part of the class.
In [22]:
# Plot the decision boundary
plot_decision_boundary(lambda x: regressor.predict(x))
plt.title("Decision Boundary for logistic regressor")
Out[22]:
As you can see, our logistic regressor separates the two clouds with a simple line. This is appropriate for this case, but it might fail when the boundary is a more complex function. Let's try out a more complex example.
In [23]:
# Generate a dataset and plot it
np.random.seed(0)
X, y = sklearn.datasets.make_moons(200, noise=0.1)
y = y.reshape(200,1)
plt.scatter(X[:,0], X[:,1], s=40, c=y.flatten(), cmap=plt.cm.Spectral)
Out[23]:
In [24]:
# Define instance of class
# Learning rate = 0.05
y = y.reshape(200,1)
regressor = LogisticRegressor(input_dim=2,learning_rate=0.05)
# Train classifier
losses = regressor.train(X,y,epochs=100)
In [25]:
# Plot the decision boundary
plot_decision_boundary(lambda x: regressor.predict(x))
plt.title("Decision Boundary for hidden layer size 3")
Out[25]:
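Beyond eyeballing the boundary, we can check the share of correctly classified training points with the predict method defined above. This is just a quick sketch; the exact number will depend on the run.
# Fraction of training points the regressor classifies correctly
predictions = regressor.predict(X)
print('Training accuracy:', (predictions == y).mean())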
Our regressor fails to approximate the more complex function required for this example. In the next chapter we will create a deeper network for this purpose.
In this chapter, you have seen a refactored version of our logistic regressor from last time. You have seen how data flows through a classifier and how the training cycle works. You have also learned about the learning rate $\alpha$ and how it can affect training. Finally, you saw what a logistic regressor does and how it is limited.
In the week 1 folder, you can find an Excel sheet called Excel Regressor NN. It is an implementation of a regressor just like the one we have used in this chapter. Open it and observe the following: