In [1]:
# same data as in linreg
import numpy as np

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)
X_b = np.c_[np.ones((100, 1)), X]  # add x0 = 1 to each instance; np.c_ concatenates arrays column-wise

# ----

# (p. 115) gradient descent
eta = 0.1 # learning rate
n_iterations = 1000
m = 100  # number of training instances

theta = np.random.randn(2, 1) # random initialization

for iteration in range(n_iterations):
    gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradients

theta


Out[1]:
array([[ 4.39761351],
       [ 2.72366607]])

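As a quick check (a sketch, not in the original cells), the batch-GD estimate can be compared against the closed-form Normal Equation, theta_best = inv(X_b.T · X_b) · X_b.T · y, which gradient descent should converge toward; the cell below reuses X_b and y defined above.

In [ ]:
# sketch: closed-form Normal Equation solution, for comparison with batch GD
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
theta_best
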
In [2]:
# (p. 118) Stochastic gradient descent with a learning schedule
# (gradually reducing eta, a process akin to simulated annealing)
n_epochs = 50
t0, t1 = 5, 50 # learning schedule hyperparameters

def learning_schedule(t):
    return t0 / (t + t1)

theta = np.random.randn(2, 1)  # random initialization

for epoch in range(n_epochs):
    for i in range(m):
        random_index = np.random.randint(m)
        xi = X_b[random_index:random_index + 1]
        yi = y[random_index:random_index + 1]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)  # gradient on a single random instance
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients

theta


Out[2]:
array([[ 4.35514283],
       [ 2.74706623]])

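As a variation on the cell above (a sketch, not part of the original notebook), the same update can be computed on small random batches instead of single instances; this mini-batch version reuses X_b, y, m, and learning_schedule defined earlier, with an assumed batch size of 20.

In [ ]:
# sketch: mini-batch gradient descent (assumed minibatch_size = 20)
n_epochs = 50
minibatch_size = 20

theta = np.random.randn(2, 1)  # random initialization

t = 0
for epoch in range(n_epochs):
    shuffled_indices = np.random.permutation(m)
    X_b_shuffled = X_b[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(0, m, minibatch_size):
        t += 1
        xi = X_b_shuffled[i:i + minibatch_size]
        yi = y_shuffled[i:i + minibatch_size]
        gradients = 2 / minibatch_size * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(t)
        theta = theta - eta * gradients

theta
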
In [4]:
# (p. 119) same, but using sklearn
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.01)  # older scikit-learn versions called this parameter n_iter
sgd_reg.fit(X, y.ravel())

sgd_reg.intercept_, sgd_reg.coef_


Out[4]:
(array([ 4.08501501]), array([ 2.99421965]))

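For reference (a sketch, assuming scikit-learn's LinearRegression is available), the SGD estimates can be compared with the exact least-squares fit, which should land close to the intercept 4 and slope 3 used to generate the data.

In [ ]:
# sketch: exact least-squares fit via sklearn, for comparison with SGDRegressor
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X, y.ravel())
lin_reg.intercept_, lin_reg.coef_
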
In [ ]: