In [ ]:
from collections import OrderedDict
import itertools
import os
import os.path as osp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as smrp
import statsmodels.sandbox.regression.predstd as smsr
from k2datascience.utils import save_fig, size
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
In [ ]:
def correlation_grid(data, bins=30, title=None, save=False):
"""
Plot the correlation grid.
:param pd.DataFrame data: original data
:param int bins: number of bins for diagonal histogram plots
:param bool save: if True the figure will be saved
:param str title: dataset title
"""
plot_title = 'Dataset Correlation'
if title:
title = f'{title} {plot_title}'
else:
title = plot_title
grid = sns.pairplot(data,
diag_kws={'alpha': 0.5, 'bins': bins,
'edgecolor': 'black'},
plot_kws={'alpha': 0.7})
grid.fig.suptitle(title,
fontsize=size['super_title'], y=1.03)
cols = data.columns
for n, col in enumerate(cols):
grid.axes[cols.size - 1, n].set_xlabel(cols[n],
fontsize=size['label'])
grid.axes[n, 0].set_ylabel(cols[n], fontsize=size['label'])
save_fig(title, save)
In [ ]:
def correlation_heatmap(data, title=None, save=False):
"""
Plot the correlation values as a heatmap.
:param pd.DataFrame data: data object
:param str title: dataset title
:param bool save: if True the figure will be saved
"""
plot_title = 'Dataset Correlation'
if title:
title = f'{title} {plot_title}'
else:
title = plot_title
fig = plt.figure('Correlation Heatmap', figsize=(10, 8),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
sns.heatmap(data.corr(),
annot=True, cbar_kws={'orientation': 'vertical'},
fmt='.2f', linewidths=5, vmin=-1, vmax=1, ax=ax0)
ax0.set_title(title, fontsize=size['title'])
ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(),
fontsize=size['label'], rotation=80)
ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(),
fontsize=size['label'], rotation=0)
save_fig(title, save)
In [ ]:
def plot_regression(data, lin_reg, features, target,
exact_reg=None, save=False, title=None):
"""
Plot the original data with linear regression line.
:param pd.DataFrame data: data object
:param lin_reg: linear regression model
:type: statsmodels.regression.linear_model.RegressionResults
:param list features: names of feature columns
:param str target: name of target column
:param pd.DataFrame exact_reg: x and y points defining the
exact linear regression
:param bool save: if True the figure will be saved
:param str title: data set title
"""
plot_title = 'Linear Regression'
if title:
title = f'{title} {plot_title}'
else:
title = plot_title
fig = plt.figure(title, figsize=(8, 6),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax = plt.subplot2grid((rows, cols), (0, 0))
x = data.loc[:, features]
y = data[target].values
for n, feature in enumerate(x.columns):
x = np.squeeze(data.loc[:, feature].values)
ax.scatter(x, y, alpha=0.5, color=f'C{n}', label=feature)
# Regression Line
sort_idx = np.argsort(x)
ax.plot(x[sort_idx], predict(x, lin_reg.params)[sort_idx],
color='black', label='Linear Regression', linestyle='--')
# Upper and Lower Confidence Intervals
std, upper, lower = smsr.wls_prediction_std(lin_reg)
ax.plot(x[sort_idx], lower[sort_idx], alpha=0.5, color='green',
label='Lower Confidence Interval', linestyle='-.')
ax.plot(x[sort_idx], upper[sort_idx], alpha=0.5, color='red',
label='Upper Confidence Interval', linestyle='-.')
if exact_reg is not None and not exact_reg.empty:
exact_reg.plot(x='x', y='y', color='magenta',
label='Exact Linear Regression',
linestyle=':', ax=ax)
features_list = f'({", ".join(features)})'
ax.set_title(f'{target} vs {features_list}',
fontsize=size['title'])
ax.legend()
ax.set_xlabel(features_list, fontsize=size['label'])
ax.set_ylabel(target, fontsize=size['label'])
save_fig(title, save)
In [ ]:
def plot_residuals(lin_reg, save=False, title=None):
"""
Plot residual statistics.
:param lin_reg: linear regression model
:type: statsmodels.regression.linear_model.RegressionResults
:param bool save: if True the figure will be saved
:param str title: data set title
"""
plot_title = 'Dataset Residuals'
if title:
title = f'{title} {plot_title}'
else:
title = plot_title
fig = plt.figure(title, figsize=(14, 21),
facecolor='white', edgecolor='black')
rows, cols = (3, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1))
ax2 = plt.subplot2grid((rows, cols), (1, 0), rowspan=2)
ax3 = plt.subplot2grid((rows, cols), (1, 1), rowspan=2)
# Normalized Residuals Histogram
ax0.hist(lin_reg.resid_pearson, alpha=0.5, edgecolor='black')
ax0.set_title('Normalized Residuals Histogram',
fontsize=size['title'])
ax0.set_xlabel('Normalized Residuals', fontsize=size['label'])
ax0.set_ylabel('Counts', fontsize=size['label'])
# Residuals vs Fitted Values
ax1.scatter(lin_reg.fittedvalues, lin_reg.resid)
ax1.set_title('Raw Residuals vs Fitted Values',
fontsize=size['title'])
ax1.set_xlabel('Fitted Values', fontsize=size['label'])
ax1.set_ylabel('Raw Residuals', fontsize=size['label'])
# Leverage vs Normalized Residuals Squared
leverage = smrp.plot_leverage_resid2(lin_reg, ax=ax2)
ax2.set_title('Leverage vs Normalized $Residuals^2$',
fontsize=size['title'])
ax2.set_xlabel('Normalized $Residuals^2$',
fontsize=size['label'])
ax2.set_ylabel('Leverage', fontsize=size['label'])
# Influence Plot
influence = smrp.influence_plot(lin_reg, ax=ax3)
ax3.set_title('Influence Plot',
fontsize=size['title'])
ax3.set_xlabel('H Leverage',
fontsize=size['label'])
ax3.set_ylabel('Studentized Residuals',
fontsize=size['label'])
plt.tight_layout()
plt.suptitle(title, fontsize=size['super_title'], y=0.92)
save_fig(title, save)
In [ ]:
def predict(x, parameters):
"""
Return predicted values provided regression parameters.
.. note:: StatsModels provides regression coefficients in increasing
order, while NumPy expects them in decreasing order.
This function is designed to receive the StatsModels format,
reverse the dimensionality, and then allow NumPy to perform the
calculation.
:param np.array x: array of input values
:param pd.Series parameters: linear regression coefficients in
order of increasing dimension
:return: predicted target values
:rtype: np.array
"""
p = np.poly1d(parameters.values[::-1])
return p(x)
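A quick sanity check of this helper, using a hypothetical parameter Series ordered the way statsmodels returns it (intercept first, then slope):
In [ ]:
# hypothetical coefficients: intercept 1.0, slope 2.0
params = pd.Series([1.0, 2.0], index=['Intercept', 'x'])
predict(np.array([0.0, 1.0, 2.0]), params)  # 2x + 1 -> array([1., 3., 5.])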
In [ ]:
data_dir = osp.realpath(osp.join(os.getcwd(), '..', 'data',
'linear_regression'))
In [ ]:
auto = pd.read_csv(osp.join(data_dir, 'auto.csv'))
data = auto
data.info()
data.head()
data.describe()
In [ ]:
boston = pd.read_csv(osp.join(data_dir, 'boston.csv'), index_col=0)
data = boston
data.info()
data.head()
data.describe()
In [ ]:
carseats = pd.read_csv(osp.join(data_dir, 'carseats.csv'))
data = carseats
data.info()
data.head()
data.describe()
This problem involves simple linear regression on the Auto data set.
Use statsmodels or scikit-learn to perform a simple linear regression with mpg as the response and horsepower as the predictor. Print the results. Comment on the output. For example: what is the predicted mpg associated with a horsepower of 98? Optional: What are the associated 95% confidence and prediction intervals?
Plot the response and the predictor. Plot the least squares regression line.
Produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.
1) Use statsmodels or scikit-learn to perform a simple linear regression with mpg as the response and horsepower as the predictor. Print the results. Comment on the output.
In [ ]:
lr = smf.ols(formula='mpg ~ horsepower', data=auto).fit()
lr.summary()
In [ ]:
print(f'Horsepower: 98\tMPG: {predict(98, lr.params):.2f}')
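Optional part: a sketch of the 95% confidence and prediction intervals at horsepower = 98 using statsmodels' get_prediction; the mean_ci_* columns of the summary frame should correspond to the confidence interval for the mean response and the obs_ci_* columns to the prediction interval.
In [ ]:
# predicted mpg at horsepower = 98 with 95% intervals
pred = lr.get_prediction(pd.DataFrame({'horsepower': [98]}))
pred.summary_frame(alpha=0.05)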
2) Plot the response and the predictor. Plot the least squares regression line.
In [ ]:
plot_regression(data=auto, lin_reg=lr, features=['horsepower'],
target='mpg')
3) Produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.
In [ ]:
plot_residuals(lr)
This problem involves multiple linear regression on the Auto data set.
Produce a scatterplot matrix which includes all of the variables in the data set.
Compute the matrix of correlations between the variables using the function corr(). Plot a matrix correlation heatmap as well.
Perform a multiple linear regression with mpg as the response and all other variables except name as the predictors. Print the results. Comment on the output.
Produce diagnostic plots of the linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers? Does the leverage plot identify any observations with unusually high leverage?
Use the - and + symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?
Try a few different transformations of the variables, such as $log(X)$, $\sqrt{X}$, $X^2$. Comment on your findings.
1) Produce a scatterplot matrix which includes all of the variables in the data set.
In [ ]:
numeric = auto.select_dtypes(include=[np.number])
correlation_grid(numeric, bins=10)
2) Compute the matrix of correlations between the variables using the function corr(). Plot a matrix correlation heatmap as well.
In [ ]:
auto.corr()
In [ ]:
correlation_heatmap(auto)
3) Perform a multiple linear regression with mpg as the response and all other variables except name as the predictors. Print the results. Comment on the output.
In [ ]:
features = numeric.drop('mpg', axis=1)
features = ' + '.join(features.columns)
In [ ]:
lr = smf.ols(formula=f'mpg ~ {features}', data=auto).fit()
lr.summary()
4) Produce diagnostic plots of the linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers? Does the leverage plot identify any observations with unusually high leverage?
In [ ]:
plot_residuals(lr)
In [ ]:
auto.loc[10:15, :]
In [ ]:
numeric.loc[13, :] / numeric.max()
5) Use the - and + symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?
In [ ]:
features = numeric.drop('mpg', axis=1)
interactions = [f'{x[0]} * {x[1]}' for x
in itertools.combinations(features.columns, 2)]
interactions = ' + '.join(interactions)
inter_formula = f'mpg ~ {" + ".join(features)} + {interactions}'
In [ ]:
new_lr = smf.ols(formula=inter_formula, data=auto).fit()
new_lr.summary()
In [ ]:
plot_residuals(new_lr)
6) Try a few different transformations of the variables, such as $log(X)$, $\sqrt{X}$, $X^2$. Comment on your findings.
In [ ]:
transformations = OrderedDict()
for trans in ('', 'np.log', 'np.sqrt', 'np.square'):
model = smf.ols(formula=f'mpg ~ {trans}(horsepower)',
data=auto).fit()
transformations[trans] = model.rsquared
fig = sm.qqplot(model.resid)
if not trans:
trans = 'Native'
plt.title(trans, fontsize=size['title']);
plt.show();
transformations
In [ ]:
trans_lr = smf.ols(formula=(
'mpg ~ cylinders + np.log(displacement) + horsepower'
'+ weight + acceleration - year + origin - 1'), data=auto).fit()
trans_lr.summary()
In [ ]:
plot_residuals(trans_lr)
This problem involves the Carseats data set.
Fit a multiple regression model to predict Sales using Price, Urban, and US.
Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!
Write out the model in equation form, being careful to handle the qualitative variables properly.
For which of the predictors can you reject the null hypothesis H: β = 0?
On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.
How well do the models in (1) and (5) fit the data?
Using the model from (5), obtain 95% confidence intervals for the coefficient(s).
Is there evidence of outliers or high leverage observations in the model from (5)?
1) Fit a multiple regression model to predict Sales using Price, Urban, and US.
In [ ]:
features = ' + '.join(['Price', 'Urban', 'US'])
lr = smf.ols(formula=f'Sales ~ {features}', data=carseats).fit()
lr.summary()
2) Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!
3) Write out the model in equation form, being careful to handle the qualitative variables properly.
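A sketch of the fitted form, assuming patsy's default treatment coding in which Urban and US enter as 0/1 indicators for the level Yes (the coefficient values are read from the summary above):
$$\widehat{Sales} = \hat{\beta}_0 + \hat{\beta}_1 \cdot Price + \hat{\beta}_2 \cdot 1[\text{Urban} = \text{Yes}] + \hat{\beta}_3 \cdot 1[\text{US} = \text{Yes}]$$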
4) For which of the predictors can you reject the null hypothesis H: β = 0?
5) On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.
In [ ]:
lr = smf.ols(formula='Sales ~ Price + US', data=carseats).fit()
lr.summary()
6) How well do the models in (1) and (5) fit the data?
Neither model fits the data particularly well; both have $R^2$ values of 0.239.
7) Using the model from (5), obtain 95% confidence intervals for the coefficient(s).
In [ ]:
(lr.conf_int(0.05)
.rename(columns={0: 'Lower', 1: 'Upper'}))
8) Is there evidence of outliers or high leverage observations in the model from (5)?
In [ ]:
plot_residuals(lr)
In this problem we will investigate the t-statistic for the null hypothesis H: β = 0 in simple linear regression without an intercept. To begin, we generate a predictor x and a response y as follows.
import numpy as np
np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)
Perform a simple linear regression of y onto x, without an intercept. Report the coefficient estimate β^, the standard error of this coefficient estimate, and the t-statistic and p-value associated with the null hypothesis H0: β = 0. Comment on these results. (You can perform regression without an intercept by dropping the intercept term from the formula, e.g. y ~ x - 1.)
Now perform a simple linear regression of x onto y without an intercept, and report the coefficient estimate, its standard error, and the corresponding t-statistic and p-value associated with the null hypothesis H0: β = 0. Comment on these results.
What is the relationship between the results obtained in (1) and (2)?
For the regression of Y onto X without an intercept, the t-statistic for H0: β = 0 takes the form β^/SE(β^), where β^ is given by (3.38), and where
$$SE(\hat{\beta}) = \sqrt{\frac{\sum_{i=1}^n(y_i - x_i\hat{\beta})^2}{(n - 1)\sum_{i=1}^nx_i^2}}$$
Show algebraically, and confirm numerically in Python, that the t-statistic can be written as
$$\frac{\sqrt{n - 1}\sum_{i=1}^nx_iy_i}{\sqrt{(\sum_{i=1}^nx_i^2)(\sum_{i=1}^ny_i^2) - (\sum_{i=1}^nx_iy_i)^2}}$$
Using the results from (4), argue that the t-statistic for the regression of y onto x is the same as the t-statistic for the regression of x onto y.
In Python, show that when regression is performed with an intercept, the t-statistic for H0: β1 = 0 is the same for the regression of y onto x as it is for the regression of x onto y.
In [ ]:
np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
data.info()
data.head()
In [ ]:
correlation_grid(data, title='Random')
In [ ]:
correlation_heatmap(data, title='Random')
1) Perform a simple linear regression of y onto x, without an intercept. Report the coefficient estimate β, the standard error of this coefficient estimate, and the t-statistic and p-value associated with the null hypothesis H0. Comment on these results.
In [ ]:
lr = smf.ols(formula='y ~ x - 1', data=data).fit()
lr.summary()
In [ ]:
lr = sm.OLS(y, x).fit()
lr.summary()
2) Now perform a simple linear regression of x onto y, without an intercept. Report the coefficient estimate β, the standard error of this coefficient estimate, and the t-statistic and p-value associated with the null hypothesis H0. Comment on these results.
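A minimal fit for this part, mirroring the cell above with the roles of x and y swapped and the intercept removed:
In [ ]:
lr = smf.ols(formula='x ~ y - 1', data=data).fit()
lr.summary()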
3) What is the relationship between the results obtained in (1) and (2)?
Both regressions are simply rearrangements of the same fit.
4) For the regression of Y onto X without an intercept, the t-statistic for H0: β = 0 takes the form β^/SE(β^), where β^ is given by (3.38), and where
$$SE(\hat{\beta}) = \sqrt{\frac{\sum_{i=1}^n(y_i - x_i\hat{\beta})^2}{(n - 1)\sum_{i=1}^nx_i^2}}$$
Show algebraically, and confirm numerically in Python, that the t-statistic can be written as
$$\frac{\sqrt{n - 1}\sum_{i=1}^nx_iy_i}{\sqrt{(\sum_{i=1}^nx_i^2)(\sum_{i=1}^ny_i^2) - (\sum_{i=1}^nx_iy_i)^2}}$$
We have
$$t = \frac{\hat{\beta}}{SE(\hat{\beta})} = \frac{\sum_ix_iy_i/\sum_jx_j^2}{\sqrt{\sum_i(y_i - x_i\hat{\beta})^2/\big((n - 1)\sum_jx_j^2\big)}} = \frac{\sqrt{n - 1}\sum_ix_iy_i}{\sqrt{\sum_jx_j^2\sum_i\big(y_i - x_i\sum_jx_jy_j/\sum_jx_j^2\big)^2}} = \frac{\sqrt{n - 1}\sum_ix_iy_i}{\sqrt{(\sum_jx_j^2)(\sum_jy_j^2) - (\sum_jx_jy_j)^2}}$$
Now let's verify this result numerically.
In [ ]:
n = x.size
((np.sqrt(n - 1) * np.sum(x * y))
/ np.sqrt(np.sum(x**2) * np.sum(y**2) - np.sum(x * y)**2))
This matches the t-statistic calculated above.
5) Using the results from (4), argue that the t-statistic for the regression of y onto x is the same t-statistic for the regression of x onto y.
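The expression in (4) is symmetric in x and y, so swapping the two variables leaves the t-statistic unchanged. A quick numerical check using the tvalues attribute of the fitted no-intercept models:
In [ ]:
# t-statistics for both no-intercept regressions should match
t_yx = smf.ols(formula='y ~ x - 1', data=data).fit().tvalues['x']
t_xy = smf.ols(formula='x ~ y - 1', data=data).fit().tvalues['y']
t_yx, t_xy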
6) In Python, show that when regression is performed with an intercept, the t-statistic for H0:β1=0 is the same for the regression of y onto x as it is the regression of x onto y.
In [ ]:
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()
In [ ]:
lr = smf.ols(formula='x ~ y', data=data).fit()
lr.summary()
Recall that the coefficient estimate β^ for the linear regression of Y onto X without an intercept is given by (3.38). Under what circumstance is the coefficient estimate for the regression of X onto Y the same as the coefficient estimate for the regression of Y onto X?
Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is different from the coefficient estimate for the regression of Y onto X.
Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is the same as the coefficient estimate for the regression of Y onto X.
1) Recall that the coefficient estimate β^ for the linear regression of Y onto X without an intercept is given by (3.38). Under what circumstance is the coefficient estimate for the regression of X onto Y the same as the coefficient estimate for the regression of Y onto X?
The coefficient estimate for the regression of Y onto X is
$$\hat{\beta} = \frac{\sum_ix_iy_i}{\sum_jx_j^2}$$
The coefficient estimate for the regression of X onto Y is
$$\hat{\beta}' = \frac{\sum_ix_iy_i}{\sum_jy_j^2}$$
The two estimates are the same if and only if $\sum_jx_j^2 = \sum_jy_j^2$.
2) Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is different from the coefficient estimate for the regression of Y onto X.
In [ ]:
np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)
assert np.sum(x**2) != np.sum(y**2)
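To make the difference explicit, a short sketch fitting both no-intercept regressions (the form assumed by (3.38)) and comparing the slope estimates:
In [ ]:
# slope estimates differ because the sums of squares differ
df = pd.DataFrame({'x': x, 'y': y})
smf.ols(formula='y ~ x - 1', data=df).fit().params
smf.ols(formula='x ~ y - 1', data=df).fit().params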
3) Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is the same as the coefficient estimate for the regression of Y onto X.
In [ ]:
np.random.seed(1)
x = np.arange(100)
y = x.copy()
np.random.shuffle(x)
np.random.shuffle(y)
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
x[:10]
y[:10]
if np.sum(np.square(x)) == np.sum(np.square(y)):
print('The sums of squares are equal.')
In [ ]:
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()
In [ ]:
lr = smf.ols(formula='x ~ y', data=data).fit()
lr.summary()
In this exercise you will create some simulated data and will fit simple linear regression models to it. Make sure to set the seed prior to starting part (1) to ensure consistent results.
Create a vector, x, containing 100 observations drawn from a N(0, 1) distribution. This represents a feature, X.
Create a vector, eps, containing 100 observations drawn from a N(0, 0.25) distribution i.e. a normal distribution with mean zero and variance 0.25.
Using x and eps, generate a vector y according to the model
Y = −1 + 0.5X + e
What is the length of the vector y? What are the values of β0 and β1 in this linear model?
Create a scatterplot displaying the relationship between x and y. Comment on what you observe.
Fit a least squares linear model to predict y using x. Comment on the model obtained. How do β^0 and β^1 compare to β0 and β1?
Display the least squares line on the scatterplot obtained in (4). Draw the population regression line on the plot, in a different color. Create an appropriate legend.
Now fit a polynomial regression model that predicts y using x and x^2. Is there evidence that the quadratic term improves the model fit? Explain your answer.
Repeat (1)–(6) after modifying the data generation process in such a way that there is less noise in the data. The model (3.39) should remain the same. You can do this by decreasing the variance of the normal distribution used to generate the error term e in (2). Describe your results.
Repeat (1)–(6) after modifying the data generation process in such a way that there is more noise in the data. The model (3.39) should remain the same. You can do this by increasing the variance of the normal distribution used to generate the error term e in (2). Describe your results.
What are the confidence intervals for β0 and β1 based on the original data set, the noisier data set, and the less noisy data set? Comment on your results.
1) Create a vector, x, containing 100 observations drawn from a N(0, 1) distribution. This represents a feature, X.
In [ ]:
np.random.seed(1)
x = 1 * np.random.randn(100) + 0
2) Create a vector, eps, containing 100 observations drawn from a N(0, 0.25) distribution i.e. a normal distribution with mean zero and variance 0.25.
In [ ]:
eps = np.sqrt(0.25) * np.random.randn(100) + 0  # variance 0.25 -> sd 0.5
3) Using x and eps, generate a vector y according to the model
Y = −1 + 0.5X + eps
What is the length of the vector y? What are the values of β0 and β1 in this linear model?
In [ ]:
y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
In [ ]:
f'Length of Vector y: {y.shape[0]}'
4) Create a scatterplot displaying the relationship between x and y. Comment on what you observe.
In [ ]:
ax = data.plot(kind='scatter', x='x', y='y')
ax.set_title('y vs x', fontsize=size['title'])
ax.set_xlabel('x', fontsize=size['label'])
ax.set_ylabel('y', fontsize=size['label'])
plt.show();
5) Fit a least squares linear model to predict y
using x
. Comment on the model obtained. How do β^0 and β^1 compare to β0 and β1?
In [ ]:
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()
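A small comparison table, assuming the true values β0 = −1 and β1 = 0.5 from the data-generating model:
In [ ]:
# fitted coefficients next to the true coefficients
pd.DataFrame({'estimate': lr.params.values, 'true': [-1, 0.5]},
index=['Intercept', 'x'])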
6) Display the least squares line on the scatterplot obtained in (4). Draw the population regression line on the plot, in a different color. Use the legend() function to create an appropriate legend.
In [ ]:
exact_x = np.linspace(-2, 2, 100)
exact_y = -1 + 0.5 * exact_x
exact_data = pd.DataFrame(np.c_[exact_x, exact_y],
columns=['x', 'y'])
plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)
In [ ]:
original_confidence = (lr.conf_int(0.05)
.rename(columns={0: 'Lower', 1: 'Upper'}))
7) Now fit a polynomial regression model that predicts y using x and $x^2$. Is there evidence that the quadratic term improves the model fit? Explain your answer.
In [ ]:
lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()
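One way to judge the quadratic term is a nested-model F-test; a sketch using statsmodels' anova_lm to compare the linear fit from (5) against the quadratic fit:
In [ ]:
# F-test for the added quadratic term
lr_linear = smf.ols(formula='y ~ x', data=data).fit()
lr_quad = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
sm.stats.anova_lm(lr_linear, lr_quad)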
8) Repeat (1)-(6) after modifying the data generation process in such a way that there is less noise in the data. The initial model should remain the same. Describe your results.
In [ ]:
np.random.seed(1)
x = 1 * np.random.randn(100) + 0
eps = 0.05 * np.random.randn(100) + 0
y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()
In [ ]:
plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)
In [ ]:
low_error_confidence = (lr.conf_int(0.05)
.rename(columns={0: 'Lower', 1: 'Upper'}))
In [ ]:
lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()
9) Repeat (1)-(6) after modifying the data generation process in such a way that there is more noise in the data. The initial model should remain the same. Describe your results.
In [ ]:
np.random.seed(1)
x = 1 * np.random.randn(100) + 0
eps = 1 * np.random.randn(100) + 0
y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()
In [ ]:
plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)
In [ ]:
high_error_confidence = (lr.conf_int(0.05)
.rename(columns={0: 'Lower', 1: 'Upper'}))
In [ ]:
lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()
10) What are the confidence intervals for β0 and β1 based on the original data set, the noisier data set, and the less noisy data set? Comment on your results.
In [ ]:
print('Original Confidence Intervals')
original_confidence
print('Low Error Confidence Intervals')
low_error_confidence
print('High Error Confidence Intervals')
high_error_confidence
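A compact side-by-side view of the three interval tables defined above:
In [ ]:
# stack the three sets of 95% confidence intervals for comparison
pd.concat([original_confidence, low_error_confidence, high_error_confidence],
keys=['Original', 'Low Error', 'High Error'])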
Perform the following commands:
np.random.seed(8)
x1 = np.random.rand(100)
x2 = .5 * x1 + np.random.rand(100) / 10
y = 2 + 2 * x1 + .3 * x2 + np.random.randn(100)
The last line corresponds to creating a linear model in which y is a function of x1 and x2. Write out the form of the linear model. What are the regression coefficients?
What is the correlation between x1 and x2? Create a scatterplot displaying the relationship between the variables.
Using this data, fit a least squares regression to predict y using x1 and x2. Describe the results obtained. What are β^0, β^1, and β^2? How do these relate to the true β0, β1, and β2? Can you reject the null hypothesis H0: β1 = 0? How about the null hypothesis H0: β2 = 0?
Now fit a least squares regression to predict y using only x1. Comment on your results. Can you reject the null hypothesis Ho: β1 = 0?
Now fit a least squares regression to predict y using only x2. Comment on your results. Can you reject the null hypothesis Ho: β1 = 0?
Do the results obtained in (3)–(5) contradict each other? Explain your answer.
Now suppose we obtain one additional observation, which was unfortunately mismeasured.
x1 = np.append(x1, 0.1)
x2 = np.append(x2, 0.8)
y = np.append(y, 6)
Re-fit the linear models from (3) to (5) using this new data. What effect does this new observation have on each of the models? In each model, is this observation an outlier? A high-leverage point? Both? Explain your answers.
In [ ]:
np.random.seed(8)
x1 = np.random.rand(100)
x2 = 0.5 * x1 + np.random.rand(100) / 10
y = 2 + 2 * x1 + 0.3 * x2 + np.random.randn(100)
data = pd.DataFrame(np.c_[x1, x2, y], columns=['x1', 'x2', 'y'])
1) The last line corresponds to creating a linear model in which y is a function of x1 and x2. Write out the form of the linear model. What are the regression coefficients?
y = β0 + β1 x1 + β2 x2 + error, where β0 = 2, β1 = 2, and β2 = 0.3.
2) What is the correlation between x1 and x2? Create a scatterplot displaying the relationship between the variables.
In [ ]:
data.corr()
correlation_grid(data)
3) Using this data, fit a least squares regression to predict y using x1 and x2. Describe the results obtained. What are β^0, β^1, and β^2? How do these relate to the true β0, β1, and β2? Can you reject the null hypothesis H0: β1 = 0? How about the null hypothesis H0: β2 = 0?
In [ ]:
lr = smf.ols(formula='y ~ x1 + x2', data=data).fit()
lr.summary()
4) Now fit a least squares regression to predict y using only x1. Comment on your results. Can you reject the null hypothesis H0: β1 = 0?
In [ ]:
lr = smf.ols(formula='y ~ x1', data=data).fit()
lr.summary()
5) Now fit a least squares regression to predict y using only x2. Comment on your results. Can you reject the null hypothesis Ho: β1 = 0?
In [ ]:
lr = smf.ols(formula='y ~ x2', data=data).fit()
lr.summary()
6) Do the results obtained in (3)–(5) contradict each other? Explain your answer.
The results do not contradict each other; they reflect the strong collinearity between x1 and x2. Only one of these features should be used in the model, and x1 produced the better results.
7) Now suppose we obtain one additional observation, which was unfortunately mismeasured.
x1 = np.append(x1, 0.1)
x2 = np.append(x2, 0.8)
y = np.append(y, 6)
Re-fit the linear models from (3) to (5) using this new data. What effect does this new observation have on each of the models? In each model, is this observation an outlier? A high-leverage point? Both? Explain your answers.
In [ ]:
data.loc[100] = [0.1, 0.8, 6]
lr = smf.ols(formula='y ~ x1 + x2', data=data).fit()
lr.summary()
plot_residuals(lr)
In [ ]:
lr = smf.ols(formula='y ~ x1', data=data).fit()
lr.summary()
plot_residuals(lr)
In [ ]:
lr = smf.ols(formula='y ~ x2', data=data).fit()
lr.summary()
plot_residuals(lr)
This problem involves the Boston data set. We will now try to predict per capita crime rate using the other variables in this data set. In other words, per capita crime rate is the response, and the other variables are the predictors.
For each predictor, fit a simple linear regression model to predict the response. Describe your results. In which of the models is there a statistically significant association between the predictor and the response? Create some plots to back up your assertions.
Fit a multiple regression model to predict the response using all of the predictors. Describe your results. For which predictors can we reject the null hypothesis H: β = 0?
How do your results from (1) compare to your results from (2)? Create a plot displaying the univariate regression coefficients from (1) on the x-axis, and the multiple regression coefficients from (2) on the y-axis. That is, each predictor is displayed as a single point in the plot. Its coefficient in a simple linear regression model is shown on the x-axis, and its coefficient estimate in the multiple linear regression model is shown on the y-axis.
Is there evidence of non-linear association between any of the predictors and the response? To answer this question, for each predictor X, fit a model of the form Y = β0 + β1X + β2X^2 + β3X^3 + E.
1) For each predictor, fit a simple linear regression model to predict the response. Describe your results. In which of the models is there a statistically significant association between the predictor and the response? Create some plots to back up your assertions.
In [ ]:
correlation_heatmap(boston, 'Boston')
In [ ]:
correlation_grid(boston, title='Boston')
In [ ]:
single_params = pd.Series(dtype=float)
features = [x for x in boston.columns
if x != 'crim']
for feature in features:
print('{}{}\n{}'.format('\n' * 2, '*' * 80, feature))
lr = smf.ols(formula=f'crim ~ {feature}', data=boston).fit()
lr.summary()
single_params.loc[feature] = lr.params.loc[feature]
boston.plot(kind='scatter', x='crim', y=feature)
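To summarize which univariate models show a statistically significant association, a sketch collecting the slope p-values and filtering at the 5% level:
In [ ]:
# p-value of each univariate slope; keep only the significant ones
single_pvalues = pd.Series({f: smf.ols(formula=f'crim ~ {f}', data=boston).fit().pvalues[f]
for f in features})
single_pvalues[single_pvalues < 0.05]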
2) Fit a multiple regression model to predict the response using all of the predictors. Describe your results. For which predictors can we reject the null hypothesis H: β = 0?
In [ ]:
features = ' + '.join([x for x in boston.columns
if x != 'crim'])
lr = smf.ols(formula=f'crim ~ {features}', data=boston).fit()
lr.summary()
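A quick filter on the multiple-regression p-values to see which coefficients are significant at the 5% level:
In [ ]:
lr.pvalues[lr.pvalues < 0.05]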
3) How do your results from (1) compare to your results from (2)? Create a plot displaying the univariate regression coefficients from (1) on the x-axis, and the multiple regression coefficients from (2) on the y-axis. That is, each predictor is displayed as a single point in the plot. Its coefficient in a simple linear regression model is shown on the x-axis, and its coefficient estimate in the multiple linear regression model is shown on the y-axis.
In [ ]:
mult_params = lr.params.iloc[1:]
models = pd.DataFrame({'multiple': mult_params, 'single': single_params})
models
In [ ]:
ax = models.plot(kind='scatter', x='single', y='multiple')
ax.set_title('Multiple Regression vs Single Regression',
fontsize=size['title'])
ax.set_xlabel('Single', fontsize=size['label'])
ax.set_ylabel('Multiple', fontsize=size['label'])
plt.show();
4) Is there evidence of non-linear association between any of the predictors and the response? To answer this question, for each predictor X, fit a model of the form $Y = β0 + β1X + β2X^2 + β3X^3 + E$.
In [ ]:
features = [x for x in boston.columns
if x != 'crim']
for feature in features:
print('{}{}\n{}'.format('\n' * 2, '*' * 80, feature))
model = f'{feature} + np.square({feature}) + np.power({feature}, 3)'
lr = smf.ols(formula=f'crim ~ {model}', data=boston).fit()
lr.summary()