Intro

Exploratory notebook covering the theory and concepts behind linear regression, including the implementation and visualization of toy examples.

Regression

Regression is a supervised learning task concerned with the prediction of continuous numerical values. It contrasts with classification, which instead deals with the prediction of classes (categorical values).
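
The toy examples below use the simplest such model, a straight line in a single input variable (implemented by the Line class in the Data section):

$$y = \text{slope} \cdot x + \text{intercept}$$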


In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt, animation, rc

%matplotlib notebook
#%matplotlib inline

Data


In [22]:
# class for generic line defined by a slope and intercept
class Line:
    def __init__(self, slope, intercept):
        self.slope = slope
        self.intercept = intercept
        self.line_fun = lambda x : x * self.slope + self.intercept
        
    # get a linearly spaced sample in the specified interval
    def get_sample(self, n, start=0.0, stop=1.0, noise=None):
        x = np.linspace(start, stop, n)
        y = self.get_y(x, noise=noise)
        return (x, y)
    
    # get random sample in the specified interval
    def get_rand_sample(self, n, start=0.0, stop=1.0, noise=None):
        x = (stop-start) * np.random.random_sample(n) + start
        y = self.get_y(x, noise=noise)
        return (x, y)
    
    def get_y(self, x, noise=None):
        y = self.line_fun(x)
        if noise:
            y += ((np.random.normal(scale=0.5, size=len(y)))*noise)
        return y

In [23]:
# plot
line = Line(slope=1.5, intercept=5)
(x, y) = line.get_sample(100, noise=1)
# plot (scatter) noisy sample
sns.regplot(x, y, fit_reg=False, label='Noisy Sample')
# plot line
plt.plot(*line.get_sample(100), label='True Line')
plt.legend()
plt.show()


Residuals and Cost

Residuals are the differences between the true and predicted values along the prediction (target) axis. The mean squared error (MSE) is a common way to measure how well a set of predictions matches the true values.
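
For n samples, with y_i the true values and ŷ_i the predictions, it is defined as

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2$$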


In [ ]:
# mean squared error between true and predicted values
def compute_mse(y_true, y_pred):
    residuals = y_pred - y_true
    return np.mean(residuals ** 2)

In [ ]:
# Test with different slopes
x, y_true = line.get_sample(100)
plt.plot(x, y_true, label='true_sample')
#sns.regplot(x, y, fit_reg=False, label="True")
for s in [-1, 0, 1, 1.5, 1.7, 2]:
    l = Line(slope=s, intercept=5)
    y_pred = l.get_y(x)
    mse = compute_mse(y_true, y_pred)
    plt.plot(x, y_pred, label="s={},c={:.3f}".format(s, mse))
plt.legend(loc='upper left')
plt.show()

In [ ]:
# Cost, testing with different slopes
costs = []
for s in np.linspace(0, 3, 20):
    l = Line(slope=s, intercept=5)
    y_pred = l.get_y(x)
    mse = compute_mse(y, y_pred)
    costs.append((s, mse))
plt.scatter(x=[c[0] for c in costs], y=[c[1] for c in costs])
plt.ylabel("cost")
plt.xlabel("slope")
plt.show()
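
Since the intercept is held fixed at its true value here, the cost is a quadratic function of the slope s,

$$J(s) = \frac{1}{n}\sum_{i=1}^{n}\bigl(y_i - (s\,x_i + 5)\bigr)^2$$

so the scatter plot should trace a parabola with its minimum near the true slope of 1.5.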

In [ ]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter


fig = plt.figure()
ax = fig.gca(projection='3d')

# Make data: a grid of candidate slopes and intercepts
slope = np.linspace(-2, 6, 20)
intercept = np.linspace(-100, 200, 20)
slope_s, intercept_s = np.meshgrid(slope, intercept)
# predictions of every candidate (slope, intercept) pair over the whole x sample
slope_m, intercept_m, x_m = np.meshgrid(slope, intercept, x)
Y_pred = x_m * slope_m + intercept_m
# cost surface: MSE of each candidate line against the noisy sample
cost = np.array([compute_mse(y, Y_pred[i][j]) for i in range(20) for j in range(20)]).reshape(20, 20)
#print(Y_pred.shape)
#print(slope.shape)
#print(intercept.shape)

# Plot the surface.
surf = ax.plot_surface(slope_s, intercept_s, cost, cmap=cm.coolwarm, linewidth=0, antialiased=False)

# Customize the z axis.
#ax.set_zlim(-1.01, 1.01)
#ax.zaxis.set_major_locator(LinearLocator(10))
#ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# Add a color bar which maps values to colors.
#fig.colorbar(surf, shrink=0.5, aspect=5)

plt.show()

Regression (Scipy)


In [27]:
# using scipy
from scipy import stats

line = Line(slope=2, intercept=6)
(x, y) = line.get_rand_sample(100, 0, 10, noise=1)
slope, intercept, r, p, _ = stats.linregress(x, y)
print('Slope = {:.3f} (r = {:.3f}, p = {:.5f})'.format(slope, r, p))


Slope = 2.016 (r = 0.996, p = 0.00000)

Polynomial Regression

Using polynomial features can help to better model our data. Polynomial features are obtained by raising the original base features to a power or by combining them. Notice that the resulting polynomial function, even though non-linear in the input features, is still a linear function of the coefficients we want to learn, so we are still dealing with a linear model.
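
For a single input x and a polynomial of order M, writing w_j for the coefficients to be learned, the model is

$$y(x, \mathbf{w}) = w_0 + w_1 x + w_2 x^2 + \dots + w_M x^M = \sum_{j=0}^{M} w_j x^j$$

which is linear in the w_j even though it is non-linear in x.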


In [55]:
# plot a noisy sample from a sinusoid (reusing the Line helper, with its function overridden)
line = Line(slope=1.5, intercept=5)
line.line_fun = lambda x : np.sin(2*np.pi*x)
(x, y) = line.get_sample(20, noise=1)
# plot (scatter) noisy sample
sns.regplot(x, y, fit_reg=False, label='Noisy Sample')
# plot line
plt.plot(*line.get_sample(100), label='True Line')
plt.legend()
plt.show()


Solve (SKlearn)


In [59]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# create a polynomial features generator
M = 9 # order of the polynomial
poly = PolynomialFeatures(M)
# generate polynomial features from original training data
poly_x = poly.fit_transform(x.reshape(-1, 1))
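
As a quick sanity check (assuming the default include_bias=True), poly_x should have one column per power of x, from x^0 up to x^M:

In [ ]:
# with a single input feature and M = 9, expect 10 columns: 1, x, x^2, ..., x^9
print(poly_x.shape)  # should be (20, 10) for the 20-point sample above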

In [60]:
# fit a linear regression model
lr = LinearRegression()
lr.fit(poly_x, y)


Out[60]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [61]:
# plot results
# plot training data
sns.regplot(x, y, fit_reg=False, label='Noisy Sample')
# plot true line
plt.plot(*line.get_sample(100), label='True Line')
# plot predicted line
x_pred = np.linspace(0, 1, 40)
y_pred = lr.predict(poly.fit_transform(x_pred.reshape(-1, 1)))
plt.plot(x_pred, y_pred, label='Predicted Line (M={})'.format(M))
plt.legend()
plt.show()


Gradient Descent

Fit a regression line using gradient descent.

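With X the design matrix (a bias column of ones plus the inputs) and theta = (intercept, slope), the cells below minimize the squared-error cost

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(y^{(i)} - x^{(i)}\theta\right)^2$$

by repeatedly applying the update

$$\theta \leftarrow \theta - \frac{\alpha}{m}\, X^\top (X\theta - y)$$

where m is the number of training samples and alpha the learning rate.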


In [3]:
# Squared error cost function
def compute_cost(X, y, theta):
    m = len(y)
    # compute predictions
    pred = X.dot(theta).flatten()
    # compute cost; the 1/(2m) factor is a common convention that simplifies the gradient
    cost = ((y - pred) ** 2).sum() / (2 * m)
    return cost

In [4]:
# single gradient descent step
def gradient_descent_step(X, y, theta, alpha):
    # compute predictions
    pred = X.dot(theta).flatten()

    # gradient of the cost w.r.t. theta: X^T (pred - y)
    errors = -np.sum((y-pred)*X.T, axis=1).reshape(-1, 1)

    # With regularization (notice bias should not be regularized)
    #theta -= alpha * ((errors/len(y)) + (lambda_*theta)/len(y))
    theta -= alpha * (errors/len(y))
    return theta
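
The commented-out line above hints at an L2 (ridge) regularized variant of this step. A minimal sketch, assuming a hypothetical lambda_ regularization strength and leaving the bias term (theta[0]) unregularized:

In [ ]:
# sketch: gradient descent step with L2 (ridge) regularization
# lambda_ is a hypothetical regularization strength; the bias is not regularized
def gradient_descent_step_ridge(X, y, theta, alpha, lambda_):
    # compute predictions
    pred = X.dot(theta).flatten()
    # gradient of the unregularized cost: X^T (pred - y)
    grad = -np.sum((y - pred) * X.T, axis=1).reshape(-1, 1)
    # add the regularization term, skipping the bias (first parameter)
    reg = lambda_ * theta
    reg[0] = 0.0
    theta -= alpha * ((grad + reg) / len(y))
    return theta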

In [5]:
# run an entire training cycle
def train(X, y, alpha, iters):
    cost_history = np.zeros(shape=(iters, 1))
    theta_history = []
    
    # our parameters are the intercept (bias) and the slope
    theta = np.random.randn(2, 1)
    for i in range(iters):
        theta = gradient_descent_step(X, y, theta, alpha)
        
        cost_history[i, 0] = compute_cost(X, y, theta)
        theta_history.append(theta.copy())
    
    return theta_history, cost_history

In [8]:
# target line
true_line = Line(slope=1.5, intercept=5)

# training data
(x_data, y) = true_line.get_rand_sample(100, noise=1)

# train data including bias/intercept input (set to 1)
X = np.ones(shape=(len(y), 2))
X[:,1] = x_data

print(X.shape)
print(y.shape)


(100, 2)
(100,)

In [ ]:
alpha = 0.01
epochs = 1000
theta_history, cost_history = train(X, y, alpha, epochs)

In [ ]:
# Plot history
fig, axes = plt.subplots(2, 1)
# plot cost
axes[0].set_title('Cost History')
axes[0].plot(cost_history.reshape(-1))
axes[0].set_ylabel("cost")
# plot theta
axes[1].set_title('Theta History')
axes[1].plot([t[0] for t in theta_history], label='intercept')
axes[1].plot([t[1] for t in theta_history], label='slope')
axes[1].set_xlabel("epoch")
plt.legend()
plt.show()

Training Animation


In [22]:
alpha = 0.01
epochs = 1000

# Plot SGD animation
fig, ax = plt.subplots()
#ax.set_xlim(0, 2)
#ax.set_ylim(-10, 10)
# plot true line
ax.plot(*true_line.get_sample(2), label='target line')
ax.scatter(x_data, y, label='train data')
# initial fitted line
theta = np.random.randn(2, 1)
fit_line = Line(slope=theta[1], intercept=theta[0])
# plot initial fitted line
p_line, = ax.plot(*fit_line.get_sample(2), 'k-', label='regression line')
epoch_text = ax.text(0, 0, "Epoch 0")

def animate(i):
    global X, y, theta, alpha
    theta = gradient_descent_step(X, y, theta, alpha)
    fit_line.intercept = theta[0]
    fit_line.slope = theta[1]
    l_x, l_y = fit_line.get_sample(2)
    p_line.set_data(list(l_x), list(l_y))
    cost = compute_cost(X, y, theta)
    epoch_text.set_text("Epoch {}, Cost {:.4f}".format(i, cost))
    return epoch_text, p_line

ani = animation.FuncAnimation(fig, animate, epochs, interval=10, repeat=False)
plt.legend(loc='lower right')
plt.show()


