Regression


Timothy Helton


In [ ]:
from collections import OrderedDict
import itertools
import os
import os.path as osp

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as smrp
import statsmodels.sandbox.regression as smsr

from k2datascience.utils import save_fig, size

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline


Framework


In [ ]:
def correlation_grid(data, bins=30, title=None, save=False):
    """
    Plot the correlation grid.

    :param pd.DataFrame data: original data
    :param int bins: number of bins for diagonal histogram plots
    :param bool save: if True the figure will be saved
    :param str title: dataset title
    """
    plot_title = 'Dataset Correlation'
    if title:
        title = f'{title} {plot_title}'
    else:
        title = plot_title
    
    grid = sns.pairplot(data,
                        diag_kws={'alpha': 0.5, 'bins': bins,
                                  'edgecolor': 'black'},
                        plot_kws={'alpha': 0.7})

    grid.fig.suptitle(title,
                      fontsize=size['super_title'], y=1.03)

    cols = data.columns
    for n, col in enumerate(cols):
        grid.axes[cols.size - 1, n].set_xlabel(cols[n],
                                               fontsize=size['label'])
        grid.axes[n, 0].set_ylabel(cols[n], fontsize=size['label'])

    save_fig(title, save)

In [ ]:
def correlation_heatmap(data, title=None, save=False):
    """
    Plot the correlation values as a heatmap.

    :param pd.DataFrame data: data object
    :param str title: dataset title
    :param bool save: if True the figure will be saved
    """
    plot_title = 'Dataset Correlation'
    if title:
        title = f'{title} {plot_title}'
    else:
        title = plot_title

    fig = plt.figure('Correlation Heatmap', figsize=(10, 8),
                     facecolor='white', edgecolor='black')
    rows, cols = (1, 1)
    ax0 = plt.subplot2grid((rows, cols), (0, 0))

    sns.heatmap(data.corr(),
                annot=True, cbar_kws={'orientation': 'vertical'},
                fmt='.2f', linewidths=5, vmin=-1, vmax=1, ax=ax0)

    ax0.set_title(title, fontsize=size['title'])
    ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(),
                        fontsize=size['label'], rotation=80)
    ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(),
                        fontsize=size['label'], rotation=0)

    save_fig(title, save)

In [ ]:
def plot_regression(data, lin_reg, features, target,
                    exact_reg=None, save=False, title=None):
    """
    Plot the original data with linear regression line.

    :param pd.DataFrame data: data object
    :param lin_reg: linear regression model
    :type: statsmodels.regression.linear_model.RegressionResults
    :param list features: names of feature columns
    :param str target: name of target column
    :param pd.DataFrame exact_reg: x and y points defining the
        exact linear regression
    :param bool save: if True the figure will be saved
    :param str title: data set title
    """
    plot_title = 'Linear Regression'
    if title:
        title = f'{title} {plot_title}'
    else:
        title = plot_title

    fig = plt.figure(title, figsize=(8, 6),
                     facecolor='white', edgecolor='black')
    rows, cols = (1, 1)
    ax = plt.subplot2grid((rows, cols), (0, 0))

    x = data.loc[:, features]
    y = data[target].values

    for n, feature in enumerate(x.columns):
        x = np.squeeze(data.loc[:, feature].values)
        ax.scatter(x, y, alpha=0.5, color=f'C{n}', label=feature)

        # Regression Line
        sort_idx = np.argsort(x)
        ax.plot(x[sort_idx], predict(x, lin_reg.params)[sort_idx],
                color='black', label='Linear Regression', linestyle='--')
        
        # Upper and Lower Confidence Intervals
        # (wls_prediction_std returns std, lower, upper)
        std, lower, upper = smsr.predstd.wls_prediction_std(lin_reg)
        ax.plot(x[sort_idx], upper[sort_idx], alpha=0.5, color='green',
                label='Upper Confidence Interval', linestyle='-.')
        ax.plot(x[sort_idx], lower[sort_idx], alpha=0.5, color='red',
                label='Lower Confidence Interval', linestyle='-.')

        if exact_reg is not None and not exact_reg.empty:
            exact_reg.plot(x='x', y='y', color='magenta',
                           label='Exact Linear Regression',
                           linestyle=':', ax=ax)

        
    features_list = f'({", ".join(features)})'
    ax.set_title(f'{target} vs {features_list}',
                 fontsize=size['title'])
    ax.legend()
    ax.set_xlabel(features_list, fontsize=size['label'])
    ax.set_ylabel(target, fontsize=size['label'])

    save_fig(title, save)

In [ ]:
def plot_residuals(lin_reg, save=False, title=None):
    """
    Plot residual statistics.

    :param lin_reg: linear regression model
    :type: statsmodels.regression.linear_model.RegressionResults
    :param bool save: if True the figure will be saved
    :param str title: data set title
    """
    plot_title = 'Dataset Residuals'
    if title:
        title = f'{title} {plot_title}'
    else:
        title = plot_title

    fig = plt.figure(title, figsize=(14, 21),
                     facecolor='white', edgecolor='black')
    rows, cols = (3, 2)
    ax0 = plt.subplot2grid((rows, cols), (0, 0))
    ax1 = plt.subplot2grid((rows, cols), (0, 1))
    ax2 = plt.subplot2grid((rows, cols), (1, 0), rowspan=2)
    ax3 = plt.subplot2grid((rows, cols), (1, 1), rowspan=2)

    # Normalized Residuals Histogram
    ax0.hist(lin_reg.resid_pearson, alpha=0.5, edgecolor='black')

    ax0.set_title('Normalized Residuals Histogram',
                  fontsize=size['title'])
    ax0.set_xlabel('Normalized Residuals', fontsize=size['label'])
    ax0.set_ylabel('Counts', fontsize=size['label'])
    
    # Residuals vs Fitted Values
    ax1.scatter(lin_reg.fittedvalues, lin_reg.resid)
    
    ax1.set_title('Raw Residuals vs Fitted Values',
                  fontsize=size['title'])
    ax1.set_xlabel('Fitted Values', fontsize=size['label'])
    ax1.set_ylabel('Raw Residuals', fontsize=size['label'])
    
    # Leverage vs Normalized Residuals Squared
    leverage = smrp.plot_leverage_resid2(lin_reg, ax=ax2)

    ax2.set_title('Leverage vs Normalized $Residuals^2$',
                  fontsize=size['title'])
    ax2.set_xlabel('Normalized $Residuals^2$',
                   fontsize=size['label'])
    ax2.set_ylabel('Leverage', fontsize=size['label'])

    # Influence Plot
    influence = smrp.influence_plot(lin_reg, ax=ax3)

    ax3.set_title('Influence Plot',
                  fontsize=size['title'])
    ax3.set_xlabel('H Leverage',
                   fontsize=size['label'])
    ax3.set_ylabel('Studentized Residuals',
                   fontsize=size['label'])

    plt.tight_layout()
    plt.suptitle(title, fontsize=size['super_title'], y=0.92)

    save_fig(title, save)

In [ ]:
def predict(x, parameters):
    """
    Return predicted values provided regression parameters.
    
    .. note:: StatsModels provides regression coefficients in increasing
        order, while NumPy expects them in decreasing order.
        This function is designed to receive the StatsModels format,
        reverse the coefficient order, and then allow NumPy to perform the
        calculation.
    
    :param np.array x: array of input values
    :param pd.Series parameters: linear regression coefficients in
        order of increasing dimension
    :return: predicted target values
    :rtype: np.array
    """ 
    p = np.poly1d(parameters.values[::-1])
    return p(x)
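
A quick self-contained check of the coefficient ordering, using hypothetical parameters (intercept 1.0, slope 2.0), so predict should return 1 + 2x:

In [ ]:
# Expected output: array([1., 3., 7.])
example_params = pd.Series({'Intercept': 1.0, 'x': 2.0})
predict(np.array([0, 1, 3]), example_params)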

In [ ]:
data_dir = osp.realpath(osp.join(os.getcwd(), '..', 'data',
                                 'linear_regression'))

Auto Dataset

Table of Contents


In [ ]:
auto = pd.read_csv(osp.join(data_dir, 'auto.csv'))
data = auto
data.info()
data.head()
data.describe()

Boston Dataset

Table of Contents


In [ ]:
boston = pd.read_csv(osp.join(data_dir, 'boston.csv'), index_col=0)
data = boston
data.info()
data.head()
data.describe()

Carseats Dataset

Table of Contents


In [ ]:
carseats = pd.read_csv(osp.join(data_dir, 'carseats.csv'))
data = carseats
data.info()
data.head()
data.describe()

Exercise 1 - Use simple linear regression on the Auto data set.

  1. Use statsmodels or scikit-learn to perform a simple linear regression with mpg as the response and horsepower as the predictor. Print the results. Comment on the output. For example:

    • Is there a relationship between the predictor and the response?
    • How strong is the relationship between the predictor and the response?
    • Is the relationship between the predictor and the response positive or negative?
    • What is the predicted mpg associated with a horsepower of 98? Optional: What are the associated 95% confidence and prediction intervals?
  2. Plot the response and the predictor. Plot the least squares regression line.

  3. Produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.

Table of Contents

1) Use statsmodels or scikit-learn to perform a simple linear regression with mpg as the response and horsepower as the predictor. Print the results. Comment on the output.


In [ ]:
lr = smf.ols(formula='mpg ~ horsepower', data=auto).fit()
lr.summary()

In [ ]:
print(f'Horsepower: 98\tMPG:{predict(98, lr.params):.2f}')
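
Optional: a sketch of the 95% confidence and prediction intervals for horsepower = 98 using statsmodels' get_prediction (the mean_ci_* columns give the confidence interval, the obs_ci_* columns the prediction interval):

In [ ]:
new_obs = pd.DataFrame({'horsepower': [98]})
lr.get_prediction(new_obs).summary_frame(alpha=0.05)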
Findings
  • The F-statistic is large and its P-value is essentially zero, so a relationship exists between MPG and Horsepower.
  • The relationship is negative: the Horsepower coefficient is below zero, so MPG decreases as Horsepower increases.
  • The model has an $R^2$ value of 0.6, which implies poor accuracy.
    • The plot in the next cell suggests the relationship is closer to the form $$y = 1/x$$
  • The t-statistics give a mixed picture for this model.
    • The intercept appears to be estimated well, but the Horsepower coefficient is too imprecise to determine its effect on the response.

2) Plot the response and the predictor. Plot the least squares regression line.


In [ ]:
plot_regression(data=auto, lin_reg=lr, features=['horsepower'],
                target='mpg')

3) Produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.


In [ ]:
plot_residuals(lr)

Exercise 2 - Use multiple linear regression on the Auto data set.

  1. Produce a scatterplot matrix which includes all of the variables in the data set.

  2. Compute the matrix of correlations between the variables using the function corr(). Plot a matrix correlation heatmap as well.

  3. Perform a multiple linear regression with mpg as the response and all other variables except name as the predictors. Print the results. Comment on the output. For instance:

    • Is there a relationship between the predictors and the response?
    • Which predictors appear to have a statistically significant relationship to the response?
    • What does the coefficient for the year variable suggest?
  4. Produce diagnostic plots of the linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers? Does the leverage plot identify any observations with unusually high leverage?

  5. Use the - and + symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?

  6. Try a few different transformations of the variables, such as $log(X)$, $\sqrt{X}$, $X^2$. Comment on your findings.

Table of Contents

1) Produce a scatterplot matrix which includes all of the variables in the data set.


In [ ]:
numeric = auto.select_dtypes(include=[np.number])
correlation_grid(numeric, bins=10)

2) Compute the matrix of correlations between the variables using the function corr(). Plot a matrix correlation heatmap as well.


In [ ]:
auto.corr()

In [ ]:
correlation_heatmap(auto)

3) Perform a multiple linear regression with mpg as the response and all other variables except name as the predictors. Print the results. Comment on the output.


In [ ]:
features = numeric.drop('mpg', axis=1)
features = ' + '.join(features.columns)

In [ ]:
lr = smf.ols(formula=f'mpg ~ {features}', data=auto).fit()
lr.summary()
Findings
  • The F-statistic is not equal to zero, so a relationship exists between MPG and at least one of the features.
    • This is confirmed by the P-values. Features with a relationship to MPG are the following:
      • Displacement
      • Weight
      • Year
      • Origin
  • The model has an $R^2$ value of 0.821
    • This will have to be improved for the model to be used in a predictive manner.
  • The Year coefficient indicates MPG increases by roughly 0.75 per model year: $$\frac{0.75\ MPG}{1\ Year}$$

4) Produce diagnostic plots of the linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers? Does the leverage plot identify any observations with unusually high leverage?


In [ ]:
plot_residuals(lr)

In [ ]:
auto.loc[10:15, :]

In [ ]:
numeric.loc[13, :] / numeric.max()
Findings
  • The residuals are fair for this model, but not great.
  • Some outliers exist in this dataset.
  • Observation 13 has a large influence on the data; relative to the column maxima its values are:
    • Displacement: 100%
    • Horsepower: 97.8%
    • Year: 85.3%

5) Use the - and + symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?


In [ ]:
features = numeric.drop('mpg', axis=1)
interactions = [f'{x[0]} * {x[1]}' for x
                in itertools.combinations(features.columns, 2)]
interactions = ' + '.join(interactions)
inter_formula = f'mpg ~ {" + ".join(features)} + {interactions}'

In [ ]:
new_lr = smf.ols(formula=inter_formula, data=auto).fit()
new_lr.summary()
Findings
  • The F-statistic is not equal to zero, so a relationship exists between MPG and at least one of the features.
    • This is confirmed by the P-values. Features with a relationship to MPG are the following:
      • Acceleration
      • Displacement
      • Origin
      • Acceleration × Origin
      • Acceleration × Year
      • Displacement × Year
  • Acceleration appears multiple times and also masks the weight feature.
  • The model has an $R^2$ value of 0.881
    • This will have to be improved for the model to be used in a predictive manner.

In [ ]:
plot_residuals(new_lr)

6) Try a few different transformations of the variables, such as $log(X)$, $\sqrt{X}$, $X^2$. Comment on your findings.


In [ ]:
transformations = OrderedDict()
for trans in ('', 'np.log', 'np.sqrt', 'np.square'):
    model = smf.ols(formula=f'mpg ~ {trans}(horsepower)',
                    data=auto).fit()
    transformations[trans] = model.rsquared
    fig = sm.qqplot(model.resid)
    
    if not trans:
        trans = 'Native'
    plt.title(trans, fontsize=size['title']);
    plt.show();
    
transformations
Findings
  • The log(horsepower) has the most linear fit of the models tested.

In [ ]:
trans_lr = smf.ols(formula=(
    'mpg ~ cylinders + np.log(displacement) + horsepower'
    '+ weight + acceleration - year + origin - 1'), data=auto).fit()
trans_lr.summary()
Findings
  • From the correlation scatter plot I would like to transform displacement, horsepower and weight into inverse relationships, but it appears StatsModels' patsy formulas do not accept a bare 1/x term.
  • Converting displacement to a logarithmic relationship was the only change with a positive effect on the residuals that I was able to identify.
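
If an inverse term is desired, wrapping the arithmetic in patsy's I() helper is one option worth trying; a minimal sketch, not tuned against the fit above:

In [ ]:
# Sketch: I() protects the arithmetic so 1 / x is treated as a transform,
# not as formula syntax.
inv_lr = smf.ols(formula=('mpg ~ I(1 / displacement) + I(1 / horsepower)'
                          ' + I(1 / weight) + acceleration + origin'),
                 data=auto).fit()
inv_lr.summary()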

In [ ]:
plot_residuals(trans_lr)

Exercise 3 - Use multiple regression using the Carseats data set.

  1. Fit a multiple regression model to predict Sales using Price, Urban, and US.

  2. Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!

  3. Write out the model in equation form, being careful to handle the qualitative variables properly.

  4. For which of the predictors can you reject the null hypothesis H: β = 0?

  5. On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.

  6. How well do the models in (1) and (5) fit the data?

  7. Using the model from (5), obtain 95% confidence intervals for the coefficient(s).

  8. Is there evidence of outliers or high leverage observations in the model from (5)?

Table of Contents

1) Fit a multiple regression model to predict Sales using Price, Urban, and US.


In [ ]:
features = ' + '.join(['Price', 'Urban', 'US'])

lr = smf.ols(formula=f'Sales ~ {features}', data=carseats).fit()
lr.summary()

2) Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!

  • Price: Sales decrease by roughly 0.0545 for each unit increase in Price.
  • Urban: an urban location is associated with roughly 0.0219 lower Sales than a rural location, although this coefficient is not statistically significant.
  • US: a US location is associated with roughly 1.2 higher Sales than a non-US location.

3) Write out the model in equation form, being careful to handle the qualitative variables properly.

$$Sales = 13.0435 - 0.0545\ Price - 0.0219\ Urban + 1.2006\ US$$

4) For which of the predictors can you reject the null hypothesis H: β = 0?

  • $H_0$: There is no relationship between X and Y.
    • Predictors with P-values below 0.05, for which $H_0$ may be rejected:
      • Price
      • US

5) On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.


In [ ]:
lr = smf.ols(formula='Sales ~ Price + US', data=carseats).fit()
lr.summary()

6) How well do the models in (1) and (5) fit the data?

Neither model is very good with $R^2$ values of 0.239.
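
A brief way to compare the nested fits directly (a sketch using statsmodels' anova_lm; the Price + US model is nested in the Price + Urban + US model):

In [ ]:
# F-test comparing the reduced model against the full model.
full = smf.ols(formula='Sales ~ Price + Urban + US', data=carseats).fit()
reduced = smf.ols(formula='Sales ~ Price + US', data=carseats).fit()
sm.stats.anova_lm(reduced, full)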

7) Using the model from (5), obtain 95% confidence intervals for the coefficient(s).


In [ ]:
(lr.conf_int(0.05)
 .rename(columns={0: 'Lower', 1: 'Upper'}))

8) Is there evidence of outliers or high leverage observations in the model from (5)?


In [ ]:
plot_residuals(lr)
Findings
  • There are Studentized Residuals outside the range [-2, 2], indicating the presence of outliers.
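
A quick check of this (a sketch using statsmodels' outlier_test, which reports studentized residuals) counts the observations beyond that range:

In [ ]:
# Count observations whose studentized residuals fall outside [-2, 2].
studentized = lr.outlier_test()['student_resid']
(studentized.abs() > 2).sum()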

Exercise 4 - Investigate the t-statistic for the null hypothesis.

In this problem we will investigate the t-statistic for the null hypothesis H: β = 0 in simple linear regression without an intercept. To begin, we generate a predictor x and a response y as follows.

import numpy as np
np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)

  1. Perform a simple linear regression of y onto x, without an intercept. Report the coefficient estimate β, the standard error of this coefficient estimate, and the t-statistic and p-value associated with the null hypothesis H: β = 0. Comment on these results. (You can perform regression without an intercept)

  2. Now perform a simple linear regression of x onto y without an intercept, and report the coefficient estimate, its standard error, and the corresponding t-statistic and p-values associated with the null hypothesis H: β = 0. Comment on these results.

  3. What is the relationship between the results obtained in (1) and (2)?

  4. For the regression of Y onto X without an intercept, the t-statistic for H0:β=0 takes the form β^/SE(β^), where β^ is given by (3.38), and where

$$SE(\hat{\beta}) = \sqrt{\frac{\sum_{i=1}^n(y_i - x_i\hat{\beta})^2}{(n - 1)\sum_{i=1}^nx_i^2}}$$

Confirm numerically in Python, that the t-statistic can be written as

$$\frac{\sqrt{n - 1}\sum_{i=1}^nx_iy_i}{\sqrt{(\sum_{i=1}^nx_i^2)(\sum_{i=1}^ny_i^2) - (\sum_{i=1}^nx_iy_i)^2}}$$

  5. Using the results from (4), argue that the t-statistic for the regression of y onto x is the same t-statistic for the regression of x onto y.

  6. In Python, show that when regression is performed with an intercept, the t-statistic for H0:β1=0 is the same for the regression of y onto x as it is the regression of x onto y.

Table of Contents


In [ ]:
np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])

data.info()
data.head()

In [ ]:
correlation_grid(data, title='Random')

In [ ]:
correlation_heatmap(data, title='Random')

1) Perform a simple linear regression of y onto x, without an intercept. Report the coefficient estimate β, the standard error of this coefficient estimate, and the t-statistic and p-value associated with the null hypothesis H0. Comment on these results.


In [ ]:
lr = smf.ols(formula='y ~ x - 1', data=data).fit()
lr.summary()

In [ ]:
lr = sm.OLS(y, x).fit()
lr.summary()
Findings
  • Coefficient estimate: 2.1067
  • Coefficient estimate standard error: 0.106
  • The F-statistic is not equal to zero, so a relationship exists between y and x.
    • This is confirmed by the P-value of zero.
  • The model has an $R^2$ value of 0.798, which is fair.
  • The T-statistic is large indicating the x coefficient estimate is both large and precise.

2) Now perform a simple linear regression of x onto y, without an intercept. Report the coefficient estimate β, the standard error of this coefficient estimate, and the t-statistic and p-value associated with the null hypothesis H0. Comment on these results.
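
A minimal cell for this fit, mirroring part (1) with the roles of x and y swapped:

In [ ]:
lr = smf.ols(formula='x ~ y - 1', data=data).fit()
lr.summary()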

Findings
  • Coefficient estimate: 0.3789
  • Coefficient estimate standard error: 0.019
  • The F-statistic is not equal to zero, so a relationship exists between x and y.
    • This is confirmed by the P-value of zero.
  • The model has an $R^2$ value of 0.798, which is fair.
  • The T-statistic is large indicating the y coefficient estimate is both large and precise.

3) What is the relationship between the results obtained in (1) and (2)?

Both regressions are just rearrangements of the same fit.

4) For the regression of Y onto X without an intercept, the t-statistic for H0:β=0 takes the form β^/SE(β^), where β^ is given by (3.38), and where

$$SE(\hat{\beta}) = \sqrt{\frac{\sum_{i=1}^n(y_i - x_i\hat{\beta})^2}{(n - 1)\sum_{i=1}^nx_i^2}}$$

Show algebraically, and confirm numerically in Python, that the t-statistic can be written as

$$\frac{\sqrt{n - 1}\sum_{i=1}^nx_iy_i}{\sqrt{(\sum_{i=1}^nx_i^2)(\sum_{i=1}^ny_i^2) - (\sum_{i=1}^nx_iy_i)^2}}$$

We have

$$t = \frac{\sum_ix_iy_i/\sum_jx_j^2}{\sqrt{\sum_i(y_i - x_i\hat{\beta})^2/(n - 1)\sum_jx_j^2}} = \frac{\sqrt{n - 1}\sum_ix_iy_i}{\sqrt{\sum_jx_j^2\sum_i(y_i - x_i\sum_jx_jy_j/\sum_jx_j^2)^2}} = \frac{\sqrt{n - 1}\sum_ix_iy_i}{\sqrt{(\sum_jx_j^2)(\sum_jy_j^2) - (\sum_jx_jy_j)^2}}$$

Now let’s verify this result numerically.


In [ ]:
n = x.size
((np.sqrt(n - 1) * np.sum(x * y))
 / np.sqrt(np.sum(x**2) * np.sum(y**2) - np.sum(x * y)**2))

This matches the t-statistic calculated above.

5) Using the results from (4), argue that the t-statistic for the regression of y onto x is the same t-statistic for the regression of x onto y.

  • The expression derived in (4) is symmetric in x and y, since multiplication is commutative.
    • Swapping the response and the feature therefore does not alter the equation's value.
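
A quick numeric check of this symmetry using the no-intercept fits (a brief sketch):

In [ ]:
# The t-statistic on the slope is identical in both directions.
t_yx = sm.OLS(y, x).fit().tvalues[0]
t_xy = sm.OLS(x, y).fit().tvalues[0]
t_yx, t_xy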

6) In Python, show that when regression is performed with an intercept, the t-statistic for H0:β1=0 is the same for the regression of y onto x as it is the regression of x onto y.


In [ ]:
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()

In [ ]:
lr = smf.ols(formula='x ~ y', data=data).fit()
lr.summary()
  • In both cases above the T-statistic is 19.783.

Exercise 5 - Explore linear regression without an intercept.

  1. Recall that the coefficient estimate β^ for the linear regression of Y onto X without an intercept is given by (3.38). Under what circumstance is the coefficient estimate for the regression of X onto Y the same as the coefficient estimate for the regression of Y onto X?

  2. Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is different from the coefficient estimate for the regression of Y onto X.

  3. Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is the same as the coefficient estimate for the regression of Y onto X.

Table of Contents

1) Recall that the coefficient estimate β^ for the linear regression of Y onto X without an intercept is given by (3.38). Under what circumstance is the coefficient estimate for the regression of X onto Y the same as the coefficient estimate for the regression of Y onto X?

The coefficient estimate for the regression of Y onto X is

$$\hat{\beta} = \frac{\sum_ix_iy_i}{\sum_jx_j^2}$$

The coefficient estimate for the regression of X onto Y is

$$\hat{\beta}' = \frac{\sum_ix_iy_i}{\sum_jy_j^2}$$

The coefficients are the same iff $\sum_jx_j^2 = \sum_jy_j^2$

2) Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is different from the coefficient estimate for the regression of Y onto X.


In [ ]:
np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)

assert np.sum(x**2) != np.sum(y**2)
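
Fitting both no-intercept regressions shows the two coefficient estimates differ (a brief sketch):

In [ ]:
# The slope estimates differ because sum(x**2) != sum(y**2).
sm.OLS(y, x).fit().params[0]
sm.OLS(x, y).fit().params[0]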

3) Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is the same as the coefficient estimate for the regression of Y onto X.


In [ ]:
np.random.seed(1)
x = np.arange(100)
y = x.copy()
np.random.shuffle(x)
np.random.shuffle(y)
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])

x[:10]
y[:10]
if np.sum(np.square(x)) == np.sum(np.square(y)):
    print('The sums of squares are equal.')

In [ ]:
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()

In [ ]:
lr = smf.ols(formula='x ~ y', data=data).fit()
lr.summary()

Exercise 6 - Explore linear regression with simulated data.

In this exercise you will create some simulated data and will fit simple linear regression models to it. Make sure to set the seed prior to starting part (1) to ensure consistent results.

  1. Create a vector, x, containing 100 observations drawn from a N(0, 1) distribution. This represents a feature, X.

  2. Create a vector, eps, containing 100 observations drawn from a N(0, 0.25) distribution i.e. a normal distribution with mean zero and variance 0.25.

  3. Using x and eps, generate a vector y according to the model

Y = −1 + 0.5X + e

What is the length of the vector y? What are the values of β0 and β1 in this linear model?

  4. Create a scatterplot displaying the relationship between x and y. Comment on what you observe.

  5. Fit a least squares linear model to predict y using x. Comment on the model obtained. How do β^0 and β^1 compare to β0 and β1?

  6. Display the least squares line on the scatterplot obtained in (4). Draw the population regression line on the plot, in a different color. Create an appropriate legend.

  7. Now fit a polynomial regression model that predicts y using x and x^2. Is there evidence that the quadratic term improves the model fit? Explain your answer.

  8. Repeat (1)–(6) after modifying the data generation process in such a way that there is less noise in the data. The model (3.39) should remain the same. You can do this by decreasing the variance of the normal distribution used to generate the error term e in (2). Describe your results.

  9. Repeat (1)–(6) after modifying the data generation process in such a way that there is more noise in the data. The model (3.39) should remain the same. You can do this by increasing the variance of the normal distribution used to generate the error term e in (2). Describe your results.

  10. What are the confidence intervals for β0 and β1 based on the original data set, the noisier data set, and the less noisy data set? Comment on your results.

Table of Contents

1) Create a vector, x, containing 100 observations drawn from a N(0, 1) distribution. This represents a feature, X.


In [ ]:
np.random.seed(1)
x = 1 * np.random.randn(100) + 0

2) Create a vector, eps, containing 100 observations drawn from a N(0, 0.25) distribution i.e. a normal distribution with mean zero and variance 0.25.


In [ ]:
eps = 0.25 * np.random.randn(100) + 0

3) Using x and eps, generate a vector y according to the model

Y = −1 + 0.5X + eps

What is the length of the vector y? What are the values of β0 and β1 in this linear model?


In [ ]:
y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])

In [ ]:
f'Length of Vector y: {y.shape[0]}'
  • β0: y-intercept (-1)
  • β1: slope of regression line (0.5)

4) Create a scatterplot displaying the relationship between x and y. Comment on what you observe.


In [ ]:
ax = data.plot(kind='scatter', x='x', y='y')

ax.set_title('y vs x', fontsize=size['title'])
ax.set_xlabel('x', fontsize=size['label'])
ax.set_ylabel('y', fontsize=size['label'])

plt.show();
Findings
  • The relationship between x and y appears linear.

5) Fit a least squares linear model to predict y using x. Comment on the model obtained. How do β^0 and β^1 compare to β0 and β1?


In [ ]:
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()
Findings
  • predicted intercept is -0.9969 (actual = -1)
  • predicted x-coefficient is 0.4897 (actual = 0.5)
  • The F-statistic is not equal to zero, so a relationship exists between x and y.
    • This is confirmed by the P-value of zero.
  • The model has an $R^2$ value of 0.749, which is fair.
  • The T-statistic is large indicating the x coefficient estimate is both large and precise.

6) Display the least squares line on the scatterplot obtained in (4). Draw the population regression line on the plot, in a different color. Use the legend() function to create an appropriate legend.


In [ ]:
exact_x = np.linspace(-2, 2, 100)
exact_y = -1 + 0.5 * exact_x
exact_data = pd.DataFrame(np.c_[exact_x, exact_y],
                          columns=['x', 'y'])

plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)

In [ ]:
original_confidence = (lr.conf_int(0.05)
                       .rename(columns={0: 'Lower', 1: 'Upper'}))

7) Now fit a polynomial regression model that predicts y using x and $x^2$. Is there evidence that the quadratic term improves the model fit? Explain your answer.


In [ ]:
lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()
Findings
  • The model's $R^2$ value showed only negligible improvement.
  • Moving to a quadratic model is not justified here.
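
One way to quantify this is an F-test on the nested models (a sketch using statsmodels' anova_lm):

In [ ]:
# Compare the linear fit against the fit that adds the quadratic term.
linear = smf.ols(formula='y ~ x', data=data).fit()
quadratic = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
sm.stats.anova_lm(linear, quadratic)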

8) Repeat (1)-(6) after modifying the data generation process in such a way that there is less noise in the data. The initial model should remain the same. Describe your results.


In [ ]:
np.random.seed(1)
x = 1 * np.random.randn(100) + 0
eps = 0.05 * np.random.randn(100) + 0
y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()

In [ ]:
plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)

In [ ]:
low_error_confidence = (lr.conf_int(0.05)
                        .rename(columns={0: 'Lower', 1: 'Upper'}))
Findings
  • predicted intercept is -0.9926 (actual = -1)
  • predicted x-coefficient is 0.5048 (actual = 0.5)
  • The F-statistic is not equal to zero, so a relationship exists between x and y.
    • This is confirmed by the P-value of zero.
  • The model has an $R^2$ value of 0.989, which is very good.
    • Accurate predictions could be made with this model.
  • The T-statistic is large indicating the x coefficient estimate is both large and precise.

In [ ]:
lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()
Findings
  • The model's $R^2$ value did not improve.
  • The P-value for the $x^2$ coefficient is large, implying this coefficient is not a good estimate.
  • Moving to a quadratic model is not justified here.

9) Repeat (1)-(6) after modifying the data generation process in such a way that there is more noise in the data. The initial model should remain the same. Describe your results.


In [ ]:
np.random.seed(1)
x = 1 * np.random.randn(100) + 0
eps = 1 * np.random.randn(100) + 0
y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()
Findings
  • The F-statistic is not equal to zero, so a relationship exists between x and y.
    • This is confirmed by the P-value of zero.
  • The model has an $R^2$ value of 0.244, which is poor.
    • This model does not represent the data.
  • The t-statistics are not as large, indicating the x coefficient estimate is less precise.

In [ ]:
plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)

In [ ]:
high_error_confidence = (lr.conf_int(0.05)
                         .rename(columns={0: 'Lower', 1: 'Upper'}))

In [ ]:
lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()
Findings
  • The model's $R^2$ value did not improve.
    • This model does not represent the data.
  • The P-value for the $x^2$ coefficient is large, implying this coefficient is not a good estimate.
  • Moving to a quadratic model is not justified here.

10) What are the confidence intervals for β0 and β1 based on the original data set, the noisier data set, and the less noisy data set? Comment on your results.


In [ ]:
print('Original Confidence Intervals')
original_confidence

print('Low Error Confidence Intervals')
low_error_confidence

print('High Error Confidence Intervals')
high_error_confidence
Findings
  • As expected, the width of the confidence intervals is directly proportional to the amount of noise present in the data.

Exercise 7 - Explore the problem of collinearity.

Perform the following commands:

np.random.seed(8)

x1 = np.random.rand(100)

x2 = .5 * x1 + np.random.rand(100) / 10

y = 2 + 2 * x1 + .3 * x2 + np.random.randn(100)

  1. The last line corresponds to creating a linear model in which y is a function of x1 and x2. Write out the form of the linear model. What are the regression coefficients?

  2. What is the correlation between x1 and x2? Create a scatterplot displaying the relationship between the variables.

  3. Using this data, fit a least squares regression to predict y using x1 and x2. Describe the results obtained. What are β^0, β^1, and β^2? How do these relate to the true β0, β1, and β2? Can you reject the null hypothesis Ho:β1 = 0? How about the null hypothesis Ho:β2 = 0?

  4. Now fit a least squares regression to predict y using only x1. Comment on your results. Can you reject the null hypothesis Ho: β1 = 0?

  5. Now fit a least squares regression to predict y using only x2. Comment on your results. Can you reject the null hypothesis Ho: β1 = 0?

  6. Do the results obtained in (3)–(5) contradict each other? Explain your answer.

  7. Now suppose we obtain one additional observation, which was unfortunately mismeasured.

x1=c(x1 , 0.1)

x2=c(x2 , 0.8)

y=c(y,6)

Re-fit the linear models from (3) to (5) using this new data. What effect does this new observation have on the each of the models? In each model, is this observation an outlier? A high-leverage point? Both? Explain your answers.

Table of Contents


In [ ]:
np.random.seed(8)
x1 = np.random.rand(100)
x2 = 0.5 * x1 + np.random.rand(100) / 10
y = 2 + 2 * x1 + 0.3 * x2 + np.random.randn(100)
data = pd.DataFrame(np.c_[x1, x2, y], columns=['x1', 'x2', 'y'])

1) The last line corresponds to creating a linear model in which y is a function of x1 and x2. Write out the form of the linear model. What are the regression coefficients?

y = β0 + β1 x1 + β2 x2 + error, where β0 = 2, β1 = 2, and β2 = 0.3

2) What is the correlation between x1 and x2? Create a scatterplot displaying the relationship between the variables.


In [ ]:
data.corr()
correlation_grid(data)
Findings
  • The variables x1 and x2 are highly correlated.

3) Using this data, fit a least squares regression to predict y using x1 and x2. Describe the results obtained. What are β^0, β^1, and β^2? How do these relate to the true β0, β1, and β2? Can you reject the null hypothesis Ho:β1 = 0? How about the null hypothesis Ho:β2 = 0?


In [ ]:
lr = smf.ols(formula='y ~ x1 + x2', data=data).fit()
lr.summary()
Findings
  • The model does not accurately represent the data.
  • Coefficients:
    • β0: 1.9615 (actual: 2)
    • β1: 5.9895 (actual: 2)
    • β2: -6.3538 (actual: 0.3)
  • The x1 variable has a P-value of 0 and is statistically significant.
    • The null hypothesis may be rejected.
  • The x2 variable has a P-value greater than 0.05, implying it is not statistically significant.
    • The null hypothesis may not be rejected.

4) Now fit a least squares regression to predict y using only x1. Comment on your results. Can you reject the null hypothesis H0:β1=0 ?


In [ ]:
lr = smf.ols(formula='y ~ x1', data=data).fit()
lr.summary()
Findings
  • The model improved, but is still poor.
  • Based on the F-statistic there is a relationship between x and y.
  • The null hypothesis may be rejected for x1.

5) Now fit a least squares regression to predict y using only x2. Comment on your results. Can you reject the null hypothesis Ho: β1 = 0?


In [ ]:
lr = smf.ols(formula='y ~ x2', data=data).fit()
lr.summary()
Findings
  • The model is better than the combined x1 + x2 model, but not as good as the model that uses x1 alone.
    • All three models are poor predictors of the data.
  • Based on the F-statistic there is a relationship between x and y.
  • The null hypothesis may be rejected for x2.

6) Do the results obtained in (3)–(5) contradict each other? Explain your answer.

The answers do not contradict each other; they reflect the collinearity between x1 and x2. Only one of these features should be used in the model, and x1 produced better results.
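
The collinearity can also be quantified with variance inflation factors (a sketch using statsmodels' variance_inflation_factor; values well above roughly 5-10 are commonly read as strong collinearity):

In [ ]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF for each column of the design matrix (including the constant).
exog = sm.add_constant(data.loc[:, ['x1', 'x2']])
{col: variance_inflation_factor(exog.values, idx)
 for idx, col in enumerate(exog.columns)}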

7) Now suppose we obtain one additional observation, which was unfortunately mismeasured.

x1=c(x1, 0.1)

x2=c(x2, 0.8)

y=c(y, 6)

Re-fit the linear models from (3) to (5) using this new data. What effect does this new observation have on the each of the models? In each model, is this observation an outlier? A high-leverage point? Both? Explain your answers.


In [ ]:
data.loc[100] = [0.1, 0.8, 6]
lr = smf.ols(formula='y ~ x1 + x2', data=data).fit()
lr.summary()
plot_residuals(lr)
Findings
  • The $R^2$ value decreased.
  • The F-statistic decreased.
  • x1 may no longer reject the null hypothesis.
  • Observation 100 is an outlier.
  • Observation 100 has high leverage.

In [ ]:
lr = smf.ols(formula='y ~ x1', data=data).fit()
lr.summary()
plot_residuals(lr)
Findings
  • The $R^2$ value increased.
  • The F-statistic decreased.
  • x1 may still reject the null hypothesis.
  • Observation 100 is an outlier.
  • Observation 100 has low leverage.

In [ ]:
lr = smf.ols(formula='y ~ x2', data=data).fit()
lr.summary()
plot_residuals(lr)
Findings
  • The $R^2$ value increased.
  • The F-statistic increased.
  • x2 may still reject the null hypothesis.
  • Observation 100 is not an outlier.
  • Observation 100 has high leverage.
  • With the erroneous data point added, the feature x2 is now preferred over x1.

Exercise 8 - Predict per capita crime rate.

This problem involves the Boston data set. We will now try to predict per capita crime rate using the other variables in this data set. In other words, per capita crime rate is the response, and the other variables are the predictors.

  1. For each predictor, fit a simple linear regression model to predict the response. Describe your results. In which of the models is there a statistically significant association between the predictor and the response? Create some plots to back up your assertions.

  2. Fit a multiple regression model to predict the response using all of the predictors. Describe your results. For which predictors can we reject the null hypothesis H: β = 0?

  3. How do your results from (1) compare to your results from (2)? Create a plot displaying the univariate regression coefficients from (1) on the x-axis, and the multiple regression coefficients from (2) on the y-axis. That is, each predictor is displayed as a single point in the plot. Its coefficient in a simple linear regression model is shown on the x-axis, and its coefficient estimate in the multiple linear regression model is shown on the y-axis.

  4. Is there evidence of non-linear association between any of the predictors and the response? To answer this question, for each predictor X, fit a model of the form Y = β0 + β1X + β2X^2 + β3X^3 + E.

Table of Contents

1) For each predictor, fit a simple linear regression model to predict the response. Describe your results. In which of the models is there a statistically significant association between the predictor and the response? Create some plots to back up your assertions.


In [ ]:
correlation_heatmap(boston, 'Boston')

In [ ]:
correlation_grid(boston, title='Boston')

In [ ]:
single_params = pd.Series(dtype=float)
features = [x for x in boston.columns
            if x != 'crim']
for feature in features:
    print('{}{}\n{}'.format('\n' * 2, '*' * 80, feature))
    lr = smf.ols(formula=f'crim ~ {feature}', data=boston).fit()
    lr.summary()
    single_params.loc[feature] = lr.params.loc[feature]
    boston.plot(kind='scatter', x='crim', y=feature)
Findings
  • All features except for chas have P-values below 0.05 and have a relationship to crim.
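
A compact way to back this up is to collect the slope P-value from each univariate fit (a brief sketch):

In [ ]:
# Slope P-values for each single-predictor model of crim.
pvalues = pd.Series(dtype=float)
for feature in features:
    fit = smf.ols(formula=f'crim ~ {feature}', data=boston).fit()
    pvalues.loc[feature] = fit.pvalues.loc[feature]
pvalues.sort_values()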

2) Fit a multiple regression model to predict the response using all of the predictors. Describe your results. For which predictors can we reject the null hypothesis H: β = 0?


In [ ]:
features = ' + '.join([x for x in boston.columns
                       if x != 'crim'])
lr = smf.ols(formula=f'crim ~ {features}', data=boston).fit()
lr.summary()
Findings
  • The following features were able to reject the null hypothesis and have a relationship to crim.
    • zn
    • nox
    • dis
    • rad
    • black
    • medv
  • The $R^2$ value of 0.454 implies the model is a poor predictor of the data.

3) How do your results from (1) compare to your results from (2)? Create a plot displaying the univariate regression coefficients from (1) on the x-axis, and the multiple regression coefficients from (2) on the y-axis. That is, each predictor is displayed as a single point in the plot. Its coefficient in a simple linear regression model is shown on the x-axis, and its coefficient estimate in the multiple linear regression model is shown on the y-axis.


In [ ]:
mult_params = lr.params.iloc[1:]
models = pd.DataFrame({'multiple': mult_params, 'single': single_params})
models

In [ ]:
ax = models.plot(kind='scatter', x='single', y='multiple')

ax.set_title('Multiple Regression vs Single Regression',
             fontsize=size['title'])
ax.set_xlabel('Single (Univariate) Coefficients', fontsize=size['label'])
ax.set_ylabel('Multiple Regression Coefficients', fontsize=size['label'])

plt.show();

4) Is there evidence of non-linear association between any of the predictors and the response? To answer this question, for each predictor X, fit a model of the form $Y = β0 + β1X + β2X^2 + β3X^3 + E$.


In [ ]:
features = [x for x in boston.columns
            if x != 'crim']
for feature in features:
    print('{}{}\n{}'.format('\n' * 2, '*' * 80, feature))
    model = f'{feature} + np.square({feature}) + np.power({feature}, 3)'
    lr = smf.ols(formula=f'crim ~ {model}', data=boston).fit()
    lr.summary()
Findings
  • The following features show a cubic relationship to crim:
    • indus
    • nox
    • age
    • dis
    • ptratio
    • medv