In [1]:
%matplotlib inline
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from linreg import *
np.random.seed(2016)
def make_data(N):
    # N evenly spaced x values; y is Gaussian noise around a quadratic mean
    X = np.linspace(0, 20, N)
    Y = stats.norm.rvs(size=N, loc=-1.5*X + X*X/9, scale=2)
    return X, Y
X, Y = make_data(21)
print(np.column_stack((X,Y)))
plot_xy(X, Y)
plt.show()
The data are not really linear, but let's do what the exercise asks. Thus, our model is
\begin{equation} y_i \sim \mathcal{N}(w_0 + w_1x_i, \sigma^2), \end{equation}or written in vector notation,
\begin{equation} \mathbf{y} \sim \mathcal{N}\left(w_0\mathbf{1} + w_1\mathbf{x}, \sigma^2I\right). \end{equation}Thus, we have that
\begin{align} p(\mathbf{y} \mid w_0,w_1,\sigma^2,\mathbf{x}) &= \prod_{i=1}^N\frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(-\frac{1}{2\sigma^2}\left(y_i - w_0 - w_1x_i\right)^2\right) \\ l(w_0,w_1,\sigma^2) = \log p(\mathbf{y} \mid w_0,w_1,\sigma^2,\mathbf{x}) &= -\frac{N}{2}\log(2\pi) - \frac{N}{2}\log(\sigma^2) - \frac{1}{2\sigma^2}\sum_{i=1}^N\left(y_i - w_0 - w_1x_i\right)^2. \end{align}Let us try to maximize the log-likelihood. We first solve for $w_0$.
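As a quick aside (not part of the exercise), this closed-form expression for $l(w_0,w_1,\sigma^2)$ can be checked numerically: for any trial parameter values it should equal the sum of per-point Gaussian log-densities from scipy.stats. The trial values below are arbitrary and only serve the comparison.
# Aside: verify the closed-form log-likelihood against summed Gaussian log-densities.
w0_trial, w1_trial, sigma2_trial = -1.0, 0.5, 4.0  # arbitrary trial parameters
resid = Y - (w0_trial + w1_trial * X)
ll_closed = -0.5 * len(Y) * np.log(2 * np.pi * sigma2_trial) - 0.5 * np.sum(resid**2) / sigma2_trial
ll_scipy = np.sum(stats.norm.logpdf(Y, loc=w0_trial + w1_trial * X, scale=np.sqrt(sigma2_trial)))
print(ll_closed, ll_scipy)  # should agree up to floating-point error
Returning to the maximization, we differentiate with respect to $w_0$: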
\begin{align} \frac{\partial{l}}{\partial w_0} = \frac{1}{\sigma^2}\sum_{i=1}^N \left(y_i - w_0 - w_1x_i\right) = \frac{1}{\sigma^2}\left(-Nw_0 + \sum_{i=1}^N y_i - w_1 \sum_{i=1}^Nx_i\right). \end{align}Setting $\frac{\partial{l}}{\partial w_0} = 0$ and solving for $w_0$, we find that
\begin{equation} \hat{w}_0 = \frac{\sum_{i=1}^N y_i}{N} - \hat{w}_1\frac{\sum_{i=1}^N x_i}{N} = \bar{y} - \hat{w}_1\bar{x}. \end{equation}Next, we solve for $w_1$. Taking the derivative, we have
\begin{align} \frac{\partial{l}}{\partial w_1} = \frac{1}{\sigma^2}\sum_{i=1}^N x_i\left(y_i - w_0 - w_1x_i\right) = \frac{1}{\sigma^2}\sum_{i=1}^N\left(x_iy_i - w_0x_i - w_1x_i^2\right). \end{align}Setting $\frac{\partial{l}}{\partial w_1} = 0$ and substituting $\hat{w}_0$ for $w_0$, we have that
\begin{align} 0 &= \frac{1}{\sigma^2}\sum_{i=1}^N\left(x_iy_i - (\bar{y} - \hat{w}_1\bar{x})x_i - \hat{w}_1x_i^2\right) \\ &= \sum_{i=1}^N\left(x_iy_i - x_i\bar{y}\right) -\hat{w}_1\sum_{i=1}^N\left(x_i^2 - x_i\bar{x}\right). \end{align}Since $\sum_{i=1}^N x_i = N\bar{x}$, we have that
\begin{align} 0 &= \sum_{i=1}^N\left(x_iy_i - \bar{x}\bar{y}\right) -\hat{w}_1\sum_{i=1}^N\left(x_i^2 - \bar{x}^2\right) \\ \hat{w}_1 &= \frac{\sum_{i=1}^N\left(x_iy_i - \bar{x}\bar{y}\right)}{\sum_{i=1}^N\left(x_i^2 - \bar{x}^2\right)} = \frac{\sum_{i=1}^N\left(x_iy_i - x_i\bar{y} -\bar{x}y_i + \bar{x}\bar{y}\right)}{\sum_{i=1}^N\left(x_i^2 - 2x_i\bar{x} + \bar{x}^2\right)} \\ &= \frac{\sum_{i=1}^N(x_i - \bar{x})(y_i-\bar{y})}{\sum_{i=1}^N(x_i - \bar{x})^2} = \frac{\frac{1}{N}\sum_{i=1}^N(x_i - \bar{x})(y_i-\bar{y})}{\frac{1}{N}\sum_{i=1}^N(x_i - \bar{x})^2}, \end{align}which is just the MLE of the covariance of $X$ and $Y$ divided by the MLE of the variance of $X$. This can also be written as
\begin{equation} \hat{w}_1 = \frac{\frac{1}{N}\sum_{i=1}^N(x_i - \bar{x})(y_i-\bar{y})}{\frac{1}{N}\sum_{i=1}^N(x_i - \bar{x})^2} = \frac{\sum_{i=1}^N x_iy_i - \frac{1}{N}\left(\sum_{i=1}^Nx_i\right)\left(\sum_{i=1}^Ny_i\right)}{\sum_{i=1}^N x_i^2 - \frac{1}{N}\left(\sum_{i=1}^Nx_i\right)^2}. \end{equation}Finally, solving for $\sigma^2$, we have that
\begin{align} \frac{\partial{l}}{\partial \sigma^2} = -\frac{N}{2\sigma^2} + \frac{1}{2(\sigma^2)^2}\sum_{i=1}^N\left(y_i - \left(w_0 +w_1x_i\right)\right)^2. \end{align}Setting this equal to $0$ and substituting $\hat{w}_0$ and $\hat{w}_1$ for $w_0$ and $w_1$, we have that
\begin{align} \hat{\sigma}^2 &= \frac{1}{N}\sum_{i=1}^N\left(y_i - \left(\hat{w}_0 +\hat{w}_1x_i\right)\right)^2 = \frac{1}{N}\sum_{i=1}^N\left(y_i^2 - 2y_i\left(\hat{w}_0 +\hat{w}_1x_i\right) + \left(\hat{w}_0 +\hat{w}_1x_i\right)^2\right) \\ &= \hat{w}_0^2 + \frac{1}{N}\left(\sum_{i=1}^Ny_i^2 - 2\hat{w}_0\sum_{i=1}^Ny_i - 2\hat{w}_1\sum_{i=1}^N x_iy_i + 2\hat{w}_0\hat{w}_1\sum_{i=1}^N x_i + \hat{w}_1^2\sum_{i=1}^N x_i^2\right). \end{align}Thus, our sufficient statistics are
\begin{equation} \left(N, \sum_{i=1}^N x_i, \sum_{i=1}^N y_i,\sum_{i=1}^N x_i^2, \sum_{i=1}^N y_i^2, \sum_{i=1}^N x_iy_i\right). \end{equation}
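These six running sums are all a single-pass estimator needs to maintain. As a minimal sketch of the idea (illustrative only, not the SimpleOnlineLinearRegressor implementation from linreg), the snippet below accumulates the sufficient statistics in one pass and then recovers $\hat{w}_0$, $\hat{w}_1$, and $\hat{\sigma}^2$ from the closed forms above, with scipy.stats.linregress as an independent check on the slope and intercept.
# Illustrative sketch only -- not the linreg implementation.
# One pass over the data to accumulate the sufficient statistics...
n = s_x = s_y = s_xx = s_yy = s_xy = 0.0
for x, y in zip(X, Y):
    n += 1
    s_x += x
    s_y += y
    s_xx += x * x
    s_yy += y * y
    s_xy += x * y
# ...then the MLEs in closed form, exactly as derived above.
w1_hat = (s_xy - s_x * s_y / n) / (s_xx - s_x**2 / n)
w0_hat = s_y / n - w1_hat * s_x / n
sigma2_hat = w0_hat**2 + (s_yy - 2*w0_hat*s_y - 2*w1_hat*s_xy
                          + 2*w0_hat*w1_hat*s_x + w1_hat**2*s_xx) / n
print(w0_hat, w1_hat, sigma2_hat)
res = stats.linregress(X, Y)
print(res.slope, res.intercept)  # independent check of the slope and intercept
Since only these six numbers ever need to be stored, the estimates can be updated incrementally as each $(x_i, y_i)$ arrives, which is presumably what the online estimator used below does.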
In [2]:
linReg = SimpleOnlineLinearRegressor()
linReg.fit(X, Y)
## visualize model
plot_xy(X, Y)
plot_abline(linReg.get_params()['slope'], linReg.get_params()['intercept'],
            np.min(X) - 1, np.max(X) + 1,
            ax=plt.gca())
plt.title("Training data with best fit line")
plt.show()
print(linReg.get_params())
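As an optional cross-check (not in the original notebook), an ordinary least-squares fit from numpy should reproduce the same slope and intercept:
# Optional cross-check: np.polyfit returns coefficients from highest degree down,
# so a degree-1 fit gives (slope, intercept).
slope_ref, intercept_ref = np.polyfit(X, Y, 1)
print(slope_ref, intercept_ref)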
Now, let's verify that the online version arrives at the same estimates.
In [3]:
onlineLinReg = SimpleOnlineLinearRegressor()
w_estimates = pd.DataFrame(index=np.arange(2,22), columns=['w0_est', 'w1_est', 'sigma2'], dtype=np.float64)
for i in range(len(Y)):
    onlineLinReg.partial_fit(X[i], Y[i])
    if i >= 1:  # record estimates only once at least two points have been seen
        w_estimates.loc[i + 1] = {'w0_est': onlineLinReg.get_params()['intercept'],
                                  'w1_est': onlineLinReg.get_params()['slope'],
                                  'sigma2': onlineLinReg.get_params()['variance']}
print(w_estimates)
print(onlineLinReg.get_params())
plt.figure(figsize=(12,8))
plt.plot(w_estimates.index, w_estimates['w0_est'], 'o',
         markeredgecolor='black', markerfacecolor='none', markeredgewidth=1,
         label='Intercept estimate')
plt.plot(w_estimates.index, w_estimates['w1_est'], 'o',
         markeredgecolor='red', markerfacecolor='none', markeredgewidth=1,
         label='Slope estimate')
plt.grid()
plt.ylabel('Estimate')
plt.xlabel('# of data points')
plt.title('Online Linear Regression Estimates')
plt.hlines(onlineLinReg.get_params()['intercept'], xmin=np.min(X), xmax=np.max(X) + 5, linestyle='--',
           label='Final intercept estimate')
plt.hlines(onlineLinReg.get_params()['slope'], xmin=np.min(X), xmax=np.max(X) + 5, linestyle='--', color='red',
           label='Final slope estimate')
plt.legend(loc='center left', bbox_to_anchor=(1,0.5))
plt.show()