In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patches, cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
from IPython.display import display, Latex, Markdown
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
In order to better understand gradient descent, let's implement it to solve a familiar problem: least-squares linear regression. While we can find the solution to ordinary least-squares regression analytically (recall the closed-form solution $\theta = (X^TX)^{-1}X^Ty$), we can also find it using gradient descent.
First, let's consider the gradient function for ordinary least squares regression. Recall the OLS loss function,
$$Loss(\theta) = \frac{1}{n} \sum_{i=1}^n \left(y_i - f_\theta(x_i)\right)^2$$
and the model $f_\theta(x_i)$, for input data with $p$ dimensions,
$$f_\theta(x_i) = \sum_{j=1}^p \theta_j x_{i,j}$$
Given these functions, what is the gradient of the loss for OLS regression? First, state it in terms of a single component of $\theta$, $\theta_j$, using a sum over each data point $i$ in $X$.
In [ ]:
q1_answer = r"""
Put your answer here, replacing this text.
$$\frac{\partial}{\partial \theta_j} Loss(\theta) = \frac{1}{n} \sum_{i=1}^n \dots$$
"""
display(Markdown(q1_answer))
In [ ]:
q1_answer = r"""
**SOLUTION:**
$$\frac{\partial}{\partial \theta_j} Loss(\theta) = \frac{2}{n} \sum_{i=1}^n -x_{i,j} \left(y_i - f_\theta(x_i)\right)$$
"""
display(Markdown(q1_answer))
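For reference, the component-wise gradient follows from a one-line application of the chain rule: since $\frac{\partial}{\partial \theta_j} f_\theta(x_i) = x_{i,j}$, each squared-error term contributes
$$\frac{\partial}{\partial \theta_j}\left(y_i - f_\theta(x_i)\right)^2 = -2\,x_{i,j}\left(y_i - f_\theta(x_i)\right),$$
and averaging over the $n$ data points gives the sum above.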
Next, state the gradient in fully vectorized form: a single expression in terms of the design matrix $X$, the response vector $y$, and the parameter vector $\theta$.
In [ ]:
q2_answer = r"""
Put your answer here, replacing this text.
$$\frac{\partial}{\partial \theta} Loss(\theta) = \dots$$
"""
display(Markdown(q2_answer))
In [ ]:
q2_answer = r"""
**SOLUTION:**
$$\frac{\partial}{\partial \theta} Loss(\theta) = -\frac{2}{n} X^T (y - X \theta)$$
"""
display(Markdown(q2_answer))
In [ ]:
def linear_regression_grad(X, y, theta):
    # Vectorized OLS gradient: -(2/n) * X^T (y - X theta)
    grad = -2/X.shape[0] * X.T @ (y - X @ theta) #SOLUTION
return grad
theta = [1, 4]
simple_X = np.vstack([np.ones(10), np.arange(10)]).T
simple_y = np.arange(10) * 3 + 2
linear_regression_grad(simple_X, simple_y, theta)
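As an optional sanity check (not part of the original exercise), we can compare this analytic gradient against a central finite-difference approximation of the loss. The helper names ols_loss and numerical_grad below are purely illustrative:
In [ ]:
def ols_loss(X, y, theta):
    # Mean squared error, matching the Loss(theta) definition above
    return np.mean((y - X @ theta) ** 2)

def numerical_grad(loss_fn, X, y, theta, h=1e-6):
    # Central finite differences, one coordinate of theta at a time
    theta = np.asarray(theta, dtype=float)
    grad = np.zeros_like(theta)
    for j in range(len(theta)):
        e = np.zeros_like(theta)
        e[j] = h
        grad[j] = (loss_fn(X, y, theta + e) - loss_fn(X, y, theta - e)) / (2 * h)
    return grad

# The analytic and numerical gradients should agree to several decimal places
print(linear_regression_grad(simple_X, simple_y, [1, 4]))
print(numerical_grad(ols_loss, simple_X, simple_y, [1, 4]))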
In [ ]:
def plot_surface_3d(X, Y, Z, angle):
    fig = plt.figure()
    # add_subplot(projection='3d') replaces the deprecated fig.gca(projection='3d')
    ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(X, Y, Z,
cmap=cm.coolwarm,
linewidth=0,
antialiased=False,
rstride=5, cstride=5)
ax.zaxis.set_major_locator(LinearLocator(5))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.1f'))
ax.view_init(45, angle)
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.title("Regression Loss Function")
plt.xlabel("Theta_0")
plt.ylabel("Theta_1")
plt.show()
We create some toy data in two dimensions to perform our regressions on:
In [ ]:
np.random.seed(100)
X_1 = np.arange(50)/5 + 5
X = np.vstack([np.ones(50), X_1]).T
y = (X_1 * 2 + 3) + np.random.normal(0, 2.5, size=50)
plt.plot(X_1, y, ".")
And plot our loss:
In [ ]:
angle_slider = widgets.FloatSlider(min=0, max=360, step=15, value=45)
def plot_regression_loss(angle):
t0_vals = np.linspace(-10,10,100)
t1_vals = np.linspace(-2,5,100)
theta_0,theta_1 = np.meshgrid(t0_vals, t1_vals)
thetas = np.vstack((theta_0.flatten(), theta_1.flatten()))
    # Mean squared error for each candidate (theta_0, theta_1) pair, matching Loss(theta) above
    loss_vals = np.mean((y - (X @ thetas).T) ** 2, axis=1)
    loss_vals = loss_vals.reshape(100, 100)
plot_surface_3d(theta_0, theta_1, loss_vals, angle)
interact(plot_regression_loss, angle=angle_slider);
Now, let's implement a general function to perform batch gradient descent. Given data X and y, initial weights $\theta_0$, a learning rate $\rho$, and a function gradient_function that has the same function signature as linear_regression_grad, implement a general gradient descent algorithm for finding optimal $\theta$.
In [ ]:
def gradient_descent(X, y, theta0, gradient_function, learning_rate = 0.001, max_iter=1000000, epsilon=0.001):
theta_hat = theta0 # Initial guess
for t in range(1, max_iter):
grad = gradient_function(X, y, theta_hat)
# Now for the update step
theta_hat = theta_hat - learning_rate * grad #SOLUTION
# When our gradient is small enough, we have converged
if np.linalg.norm(grad) < epsilon:
print("converged after {} steps".format(t))
return theta_hat
# If we hit max_iter iterations
print("Warning - Failed to converge")
return theta_hat
In [ ]:
theta_0 = [10, -1]
gradient_descent(X, y, theta_0, linear_regression_grad)
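As a quick cross-check (an addition to the lab, not part of the original exercise), the gradient descent estimate should land very close to the closed-form OLS solution $\hat{\theta} = (X^TX)^{-1}X^Ty$:
In [ ]:
# Closed-form OLS solution; np.linalg.solve is more numerically stable than an explicit inverse
theta_closed_form = np.linalg.solve(X.T @ X, X.T @ y)
theta_closed_form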
Now let's visualize how our regression estimates change as we perform gradient descent:
In [ ]:
theta_0s = []
theta_1s = []
plot_idx = [1, 5, 20, 100, 500, 2000, 10000]
def plot_gradient_wrapper(X, y, theta):
grad = linear_regression_grad(X, y, theta)
theta_0s.append(theta[0])
theta_1s.append(theta[1])
t = len(theta_0s)
if t in plot_idx:
plt.subplot(121)
plt.xlim([4, 12])
plt.ylim([-2, 3])
plt.plot(theta_0s, theta_1s)
plt.plot(theta[0], theta[1], ".", color="b")
plt.title('theta(s) over time, t={}'.format(t))
plt.subplot(122)
plt.xlim([0, 20])
plt.ylim([-10, 40])
plt.plot(np.arange(50)/2.5, y, ".")
plt.plot(np.arange(50)/2.5, X @ theta)
plt.title('Regression line vs. data, t={}'.format(t))
plt.show()
return grad
gradient_descent(X, y, theta_0, plot_gradient_wrapper)
In Prof. Gonzalez's lecture, instead of using a constant learning rate, he used a learning rate that decreased over time according to the function
$$\rho(t) = \frac{r}{t}$$
where $r$ is some initial learning rate. This shrinks the step size over time, so the updates become smaller and smaller as the estimate settles near the optimal solution.
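Here is a minimal sketch of how the gradient_descent function above could be adapted to use this decaying schedule (the variant and its name, gradient_descent_decay, are illustrative additions, not part of the original lab):
In [ ]:
def gradient_descent_decay(X, y, theta0, gradient_function, r=0.01, max_iter=1000000, epsilon=0.001):
    theta_hat = theta0  # Initial guess
    for t in range(1, max_iter):
        grad = gradient_function(X, y, theta_hat)
        # Decaying learning rate rho(t) = r / t shrinks the step size over time
        theta_hat = theta_hat - (r / t) * grad
        # When our gradient is small enough, we have converged
        if np.linalg.norm(grad) < epsilon:
            print("converged after {} steps".format(t))
            return theta_hat
    print("Warning - Failed to converge")
    return theta_hat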
As discussed in lecture, while ordinary least squares has a simple analytical solution, logistic regression has no closed form and must be fit iteratively, for example with gradient descent. Using the tools we've constructed, we can do just that. First, create a new function, logistic_regression_grad, which works like its counterpart linear_regression_grad. In the case of logistic regression, this should be the gradient of the negative log-likelihood (the quantity we minimize) - you may wish to refer to the lecture slides to find this gradient equation.
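For reference, writing $\sigma$ for the sigmoid function (defined in the next cell), the gradient of the un-averaged negative log-likelihood (the convention the solution below follows) is
$$\nabla_\theta \left(-\ell(\theta)\right) = X^T\left(\sigma(X\theta) - y\right)$$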
First, we define the sigmoid function:
In [ ]:
def sigmoid(t):
    return 1 / (1 + np.exp(-t))
And then complete the gradient function. You should get a gradient of roughly $[3.73, 0.25]$ for the given value of $\theta$ on this example dataset.
In [ ]:
def logistic_regression_grad(X, y, theta):
    # Gradient of the un-averaged negative log-likelihood: X^T (sigmoid(X theta) - y)
    grad = (sigmoid(X @ theta) - y) @ X #SOLUTION
return grad
theta = [0, 1]
simple_X_1 = np.hstack([np.arange(10)/10, np.arange(10)/10 + 0.75])
simple_X = np.vstack([np.ones(20), simple_X_1]).T
simple_y = np.hstack([np.zeros(10), np.ones(10)])
logistic_regression_grad(simple_X, simple_y, theta)
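If you ran the optional finite-difference check above, the same technique verifies this gradient as well; logistic_nll below is an illustrative helper matching the un-averaged negative log-likelihood convention:
In [ ]:
def logistic_nll(X, y, theta):
    # Un-averaged negative log-likelihood for logistic regression
    p = sigmoid(X @ theta)
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

# Reuses the numerical_grad helper sketched in the optional check above
print(logistic_regression_grad(simple_X, simple_y, [0, 1]))
print(numerical_grad(logistic_nll, simple_X, simple_y, [0, 1]))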
Now let's see how we can use our gradient descent tools to fit a regression on some real data! First, let's load the breast cancer dataset from lecture, and plot breast mass radius versus category - malignant or benign. As in lecture, we jitter the response variable to avoid overplotting.
In [ ]:
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
data['malignant'] = (data_dict['target'] == 0)
data['malignant'] = data['malignant'] + 0.1*np.random.rand(len(data['malignant'])) - 0.05
X_log_1 = data['mean radius']
X_log = np.vstack([np.ones(len(X_log_1)), X_log_1.values]).T
y_log = data['malignant'].values
plt.plot(X_log_1, y_log, ".")
Now, using our earlier defined gradient_descent function, find optimal parameters $(\theta_0, \theta_1)$ to fit the breast cancer data. You will have to tune the learning rate, since the function's default will not work well here, and think about what a good initial guess for $\theta$ would be in both dimensions.
In [ ]:
theta_log = gradient_descent(X_log, y_log, [0, 1], logistic_regression_grad, learning_rate=0.0001) #SOLUTION
theta_log
With optimal $\theta$ chosen, we can now plot our logistic curve and our decision boundary, and look at how our model categorizes our data:
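The $P=0.5$ decision boundary plotted below sits where the sigmoid's argument is zero:
$$\sigma(\theta_0 + \theta_1 x) = 0.5 \iff \theta_0 + \theta_1 x = 0 \iff x = -\frac{\theta_0}{\theta_1},$$
which is exactly the d_boundary value computed in the next cell.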
In [ ]:
y_lowX = X_log_1[sigmoid(X_log @ theta_log) < 0.5]
y_lowy = y_log[sigmoid(X_log @ theta_log) < 0.5]
y_highX = X_log_1[sigmoid(X_log @ theta_log) > 0.5]
y_highy = y_log[sigmoid(X_log @ theta_log) > 0.5]
sigrange = np.arange(5, 30, 0.05)
sigrange_X = np.vstack([np.ones(len(sigrange)), sigrange]).T
d_boundary = -theta_log[0]/theta_log[1]
plt.plot(sigrange, sigmoid(sigrange_X @ theta_log), ".", color="g")
plt.hlines(0.5, 5, 30, "g")
plt.vlines(d_boundary, -0.2, 1.2, "g")
plt.plot(y_lowX, y_lowy, ".", color="b")
plt.plot(y_highX, y_highy, ".", color="r")
plt.title("Classification (blue=benign, red=malignant), assuming a P=0.5 decision boundary")
Finally, we can calculate our classification accuracy.
In [ ]:
n_errors = sum(y_lowy > 0.5) + sum(y_highy < 0.5)
accuracy = round((len(y_log) - n_errors) / len(y_log) * 100, 1)
print("Classification Accuracy - {}%".format(accuracy))
In [ ]: