```
In [ ]:
```from collections import OrderedDict
import itertools
import os
import os.path as osp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as smrp
import statsmodels.sandbox.regression as smsr
from k2datascience.utils import save_fig, size
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

```
In [ ]:
```def correlation_grid(data, bins=30, title=None, save=False):
"""
Plot the correlation grid.
:param pd.DataFrame data: original data
:param int bins: number of bins for diagonal histogram plots
:param bool save: if True the figure will be saved
:param str title: dataset title
"""
plot_title = 'Dataset Correlation'
if title:
title = f'{title} {plot_title}'
else:
title = plot_title
grid = sns.pairplot(data,
diag_kws={'alpha': 0.5, 'bins': bins,
'edgecolor': 'black'},
plot_kws={'alpha': 0.7})
grid.fig.suptitle(title,
fontsize=size['super_title'], y=1.03)
cols = data.columns
for n, col in enumerate(cols):
grid.axes[cols.size - 1, n].set_xlabel(cols[n],
fontsize=size['label'])
grid.axes[n, 0].set_ylabel(cols[n], fontsize=size['label'])
save_fig(title, save)

```
In [ ]:
```def correlation_heatmap(data, title=None, save=False):
"""
Plot the correlation values as a heatmap.
:param pd.DataFrame data: data object
:param str title: dataset title
:param bool save: if True the figure will be saved
"""
plot_title = 'Dataset Correlation'
if title:
title = f'{title} {plot_title}'
else:
title = plot_title
fig = plt.figure('Correlation Heatmap', figsize=(10, 8),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
sns.heatmap(data.corr(),
annot=True, cbar_kws={'orientation': 'vertical'},
fmt='.2f', linewidths=5, vmin=-1, vmax=1, ax=ax0)
ax0.set_title(title, fontsize=size['title'])
ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(),
fontsize=size['label'], rotation=80)
ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(),
fontsize=size['label'], rotation=0)
save_fig(title, save)

```
In [ ]:
```def plot_regression(data, lin_reg, features, target,
exact_reg=None, save=False, title=None):
"""
Plot the original data with linear regression line.
:param pd.DataFrame data: data object
:param lin_reg: linear regression model
:type: statsmodels.regression.linear_model.RegressionRes
:param list features: names of feature columns
:param str target: name of target column
:param pd.DataFrame exact_reg: x and y points defining /
exact linear regression
:param bool save: if True the figure will be saved
:param str title: data set title
"""
plot_title = 'Linear Regression'
if title:
title = f'{title} {plot_title}'
else:
title = plot_title
fig = plt.figure(title, figsize=(8, 6),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax = plt.subplot2grid((rows, cols), (0, 0))
x = data.loc[:, features]
y = data[target].values
for n, feature in enumerate(x.columns):
x = np.squeeze(data.loc[:, feature].values)
ax.scatter(x, y, alpha=0.5, color=f'C{n}', label=feature)
# Regression Line
sort_idx = np.argsort(x)
ax.plot(x[sort_idx], predict(x, lin_reg.params)[sort_idx],
color='black', label='Linear Regression', linestyle='--')
# Upper and Lower Confidence Intervals
std, upper, lower = smsr.predstd.wls_prediction_std(lin_reg)
ax.plot(x[sort_idx], lower[sort_idx], alpha=0.5, color='green',
label='Upper Confidence Interval', linestyle='-.')
ax.plot(x[sort_idx], upper[sort_idx], alpha=0.5, color='red',
label='Lower Confidence Interval', linestyle='-.')
if not exact_reg.empty:
exact_reg.plot(x='x', y='y', color='magenta',
label='Exact Linear Regression',
linestyle=':', ax=ax)
features_list = f'({", ".join(features)})'
ax.set_title(f'{target} vs {features_list}',
fontsize=size['title'])
ax.legend()
ax.set_xlabel(features_list, fontsize=size['label'])
ax.set_ylabel(target, fontsize=size['label'])
save_fig(title, save)

```
In [ ]:
```def plot_residuals(lin_reg, save=False, title=None):
"""
Plot resdual statistics
:param lin_reg: linear regression model
:type: statsmodels.regression.linear_model.RegressionRes
:param bool save: if True the figure will be saved
:param str title: data set title
"""
plot_title = 'Dataset Residuals'
if title:
title = f'{title} {plot_title}'
else:
title = plot_title
fig = plt.figure(title, figsize=(14, 21),
facecolor='white', edgecolor='black')
rows, cols = (3, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1))
ax2 = plt.subplot2grid((rows, cols), (1, 0), rowspan=2)
ax3 = plt.subplot2grid((rows, cols), (1, 1), rowspan=2)
# Normalized Residuals Histogram
ax0.hist(lin_reg.resid_pearson, alpha=0.5, edgecolor='black')
ax0.set_title('Normalized Residuals Histogram',
fontsize=size['title'])
ax0.set_xlabel('Normalized Residuals', fontsize=size['label'])
ax0.set_ylabel('Counts', fontsize=size['label'])
# Residuals vs Fitted Values
ax1.scatter(lin_reg.fittedvalues, lin_reg.resid)
ax1.set_title('Raw Residuals vs Fitted Values',
fontsize=size['title'])
ax1.set_xlabel('Fitted Values', fontsize=size['label'])
ax1.set_ylabel('Raw Residuals', fontsize=size['label'])
# Leverage vs Normalized Residuals Squared
leverage = smrp.plot_leverage_resid2(lin_reg, ax=ax2)
ax2.set_title('Leverage vs Normalized $Residuals^2$',
fontsize=size['title'])
ax2.set_xlabel('Normalized $Residuals^2$',
fontsize=size['label'])
ax2.set_ylabel('Leverage', fontsize=size['label'])
# Influence Plot
influence = smrp.influence_plot(lin_reg, ax=ax3)
ax3.set_title('Influence Plot',
fontsize=size['title'])
ax3.set_xlabel('H Leverage',
fontsize=size['label'])
ax3.set_ylabel('Studentized Residuals',
fontsize=size['label'])
plt.tight_layout
plt.suptitle(title, fontsize=size['super_title'], y=0.92)
save_fig(title, save)

```
In [ ]:
```def predict(x, parameters):
"""
Return predicted values provided regression parameters.
.. note:: StatsModels provides regression coefficients in increasing
order, while NumPy would like to recive them in decreasing order.
This function is designed to recive the StatsModels format,
reverse the dimensionality, and then allow NumPy to perform the
calculation.
:param np.array x: array of input values
:param pd.Series parameters: linear regression coefficients in
order of increasing dimension
:return: predicted target values
:rtype: np.array
"""
p = np.poly1d(parameters.values[::-1])
return p(x)

```
In [ ]:
```data_dir = osp.realpath(osp.join(os.getcwd(), '..', 'data',
'linear_regression'))

```
In [ ]:
```auto = pd.read_csv(osp.join(data_dir, 'auto.csv'))
data = auto
data.info()
data.head()
data.describe()

```
In [ ]:
```boston = pd.read_csv(osp.join(data_dir, 'boston.csv'), index_col=0)
data = boston
data.info()
data.head()
data.describe()

```
In [ ]:
```carseats = pd.read_csv(osp.join(data_dir, 'carseats.csv'))
data = carseats
data.info()
data.head()
data.describe()

`Auto`

data set.Use

`statsmodels`

or`scikit-learn`

to perform a simple linear regression with`mpg`

as the response and`horsepower`

as the predictor. Print the results. Comment on the output. For example:- Is there a relationship between the predictor and the response?
- How strong is the relationship between the predictor and the response?
- Is the relationship between the predictor and the response positive or negative?
- What is the predicted
`mpg`

associated with a`horsepower`

of 98?*Optional: What are the associated 95% confidence and prediction intervals?*

Plot the response and the predictor. Plot the least squares regression line.

Produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.

```
In [ ]:
```lr = smf.ols(formula='mpg ~ horsepower', data=auto).fit()
lr.summary()

```
In [ ]:
```print(f'Horsepower: 98\tMPG:{predict(98, lr.params):.2f}')

- The F-statistic is not equal to zero, so a relationship exists between MPG and Horsepower.
- This is confirmed by the P-value of zero.

- The model has an $R^2$ value of 0.6, which implies poor accuracy.
- In the next cell the relationship model appears to have the following form. $$y = 1/x$$

- The T-statistic has a mixed response for this model.
- The intercept coefficient appears to be well suited, but the Horsepower coefficient is too imprecise to determine the effect on the response.

2) Plot the response and the predictor. Plot the least squares regression line.

```
In [ ]:
```plot_regression(data=auto, lin_reg=lr, features=['horsepower'],
target='mpg')

```
In [ ]:
```plot_residuals(lr)

`Auto`

data set.Produce a scatterplot matrix which includes all of the variables in the data set.

Compute the matrix of correlations between the variables using the function corr(). Plot a matrix correlation heatmap as well.

Perform a multiple linear regression with

`mpg`

as the response and all other variables except`name`

as the predictors. Print the results. Comment on the output. For instance:- Is there a relationship between the predictors and the response?
- Which predictors appear to have a statistically significant relationship to the response?
- What does the coefficient for the year variable suggest?

Produce diagnostic plots of the linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers? Does the leverage plot identify any observations with unusually high leverage?

Use the - and + symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?

Try a few different transformations of the variables, such as $log(X)$, $\sqrt{X}$, $X^2$. Comment on your findings.

1) Produce a scatterplot matrix which includes all of the variables in the data set.

```
In [ ]:
```numeric = auto.select_dtypes(include=[np.int, np.float])
correlation_grid(numeric, bins=10)

```
In [ ]:
```auto.corr()

```
In [ ]:
```correlation_heatmap(auto)

```
In [ ]:
```features = numeric.drop('mpg', axis=1)
features = ' + '.join(features.columns)

```
In [ ]:
```lr = smf.ols(formula=f'mpg ~ {features}', data=auto).fit()
lr.summary()

- The F-statistic is not equal to zero, so a relationship exists between MPG and at least one of the features.
- This is confirmed by the P-values. Features with a relationship to MPG are the following:
- Displacement
- Weight
- Year
- Origin

- This is confirmed by the P-values. Features with a relationship to MPG are the following:
- The model has an $R^2$ value of 0.821
- This will have to be improved for the model to be used in a predictive manner.

- The Year coefficient states the relationship to MPG is $$\frac{0.75\ MPG}{1 \ Year}$$

```
In [ ]:
```plot_residuals(lr)

```
In [ ]:
```auto.loc[10:15, :]

```
In [ ]:
```numeric.loc[13, :] / numeric.max()

```
In [ ]:
```features = numeric.drop('mpg', axis=1)
interactions = [f'{x[0]} * {x[1]}' for x
in itertools.combinations(features.columns, 2)]
interactions = ' + '.join(interactions)
inter_formula = f'mpg ~ {" + ".join(features)} + {interactions}'

```
In [ ]:
```new_lr = smf.ols(formula=inter_formula, data=auto).fit()
new_lr.summary()

- The F-statistic is not equal to zero, so a relationship exists between MPG and at least one of the features.
- This is confirmed by the P-values. Features with a relationship to MPG are the following:
- Acceleration
- Displacement
- Origin
- Acceleration + Origin
- Acceleration + Year
- Displacement + Year

- This is confirmed by the P-values. Features with a relationship to MPG are the following:
- Acceleration appears multiple times and also masks the weight feature.
- The model has an $R^2$ value of 0.881
- This will have to be improved for the model to be used in a predictive manner.

```
In [ ]:
```plot_residuals(new_lr)

```
In [ ]:
```transformations = OrderedDict()
for trans in ('', 'np.log', 'np.sqrt', 'np.square'):
model = smf.ols(formula=f'mpg ~ {trans}(horsepower)',
data=auto).fit()
transformations[trans] = model.rsquared
fig = sm.qqplot(model.resid)
if not trans:
trans = 'Native'
plt.title(trans, fontsize=size['title']);
plt.show();
transformations

```
In [ ]:
```trans_lr = smf.ols(formula=(
'mpg ~ cylinders + np.log(displacement) + horsepower'
'+ weight + acceleration - year + origin - 1'), data=auto).fit()
trans_lr.summary()

- From the correlation scatter plot I would like to transform displacement, horsepower and weight to be inverse relastionships, but it appears StatsModels patsy does not support this transformation.
- Converting displacement to be a logrithmic relationship was the only positive effect on the residules I was able to identify.

```
In [ ]:
```plot_residuals(trans_lr)

`Carseats`

data set.Fit a multiple regression model to predict

`Sales`

using`Price`

,`Urban`

, and`US`

.Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!

Write out the model in equation form, being careful to handle the qualitative variables properly.

For which of the predictors can you reject the null hypothesis H: β = 0?

On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.

How well do the models in (1) and (5) fit the data?

Using the model from (5), obtain 95% confidence intervals for the coefficient(s).

Is there evidence of outliers or high leverage observations in the model from (5)?

1) Fit a multiple regression model to predict Sales using Price, Urban, and US.

```
In [ ]:
```features = ' + '.join(['Price', 'Urban', 'US'])
lr = smf.ols(formula=f'Sales ~ {features}', data=carseats).fit()
lr.summary()

- Price $$\frac{1\ Price}{-5.45\ Sales}$$
- Urban $$\frac{2.2\ Rural}{1\ Urban}$$
- US $$\frac{1.2\ US}{1\ Non-US}$$

3) Write out the model in equation form, being careful to handle the qualitative variables properly.

$$Sales = 13.0435 - 0.0545\ Price - 0.0219\ Urban + 1.2006\ US$$

4) For which of the predictors can you reject the null hypothesis H: β = 0?

- $H_0$: There is no relationship between X and Y.
- Features with high P-values:
- Price
- US

- Features with high P-values:

```
In [ ]:
```lr = smf.ols(formula='Sales ~ Price + US', data=carseats).fit()
lr.summary()

6) How well do the models in (1) and (5) fit the data ?

Neither model is very good with $R^2$ values of 0.239.

7) Using the model from (5), obtain 95% confidence intervals for the coefficient(s).

```
In [ ]:
```(lr.conf_int(0.05)
.rename(columns={0: 'Lower', 1: 'Upper'}))

8) Is there evidence of outliers or high leverage observations in the model from (5) ?

```
In [ ]:
```plot_residuals(lr)

In this problem we will investigate the t-statistic for the null hypothesis H: β = 0 in simple linear regression without an intercept. To begin, we generate a predictor x and a response y as follows.

```
import numpy as np
np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)
```

Perform a simple linear regression of y onto x, without an intercept. Report the coefficient estimate β, the standard error of this coefficient estimate, and the t-statistic and p-value associated with the null hypothesis H: β = 0. Comment on these results. (You can perform regression without an intercept)

Now perform a simple linear regression of x onto y without an intercept, and report the coefficient estimate, its standard error, and the corresponding t-statistic and p-values associated with the null hypothesis H: β = 0. Comment on these results.

What is the relationship between the results obtained in (1) and (2)?

For the regrssion of Y onto X without an intercept, the t-statistic for H0:β=0 takes the form β^/SE(β^), where β^ is given by (3.38), and where

*Confirm numerically in Python, that the t-statistic can be written as*

5 . Using the results from (4), argue that the t-statistic for the regression of y onto x is the same t-statistic for the regression of x onto y.

6 . In Python, show that when regression is performed with an intercept, the t-statistic for H0:β1=0 is the same for the regression of y onto x as it is the regression of x onto y.

```
In [ ]:
```np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
data.info()
data.head()

```
In [ ]:
```correlation_grid(data, title='Random')

```
In [ ]:
```correlation_heatmap(data, title='Random')

```
In [ ]:
```lr = smf.ols(formula='y ~ x - 1', data=data).fit()
lr.summary()

```
In [ ]:
```lr = smf.OLS(y, x).fit()
lr.summary()

- Coefficient estimate: 2.1067
- Coefficient estimate standard error: 0.106
- The F-statistic is not equal to zero, so a relationship exists between y and x.
- This is confirmed by the P-value of zero.

- The model has an $R^2$ value of 0.798, which is fair.
- The T-statistic is large indicating the x coefficient estimate is both large and precise.

- Coefficient estimate: 0.3789
- Coefficient estimate standard error: 0.019
- The F-statistic is not equal to zero, so a relationship exists between x and y.
- This is confirmed by the P-value of zero.

- The model has an $R^2$ value of 0.798, which is fair.
- The T-statistic is large indicating the y coefficient estimate is both large and precise.

3) What is the relationship between the results obtained in (1) and (2)?

Both regresions are just rearragements of the same fit.

4) For the regrssion of Y onto X without an intercept, the t-statistic for H0:β=0 takes the form β^/SE(β^), where β^ is given by (3.38), and where

$$SE(\hat{\beta}) = \sqrt{\frac{\sum_{i=1}^n(y_i - x_i\hat{\beta})^2}{(n - 1)\sum_{i=1}^nx_i^2}}$$Show algebraically, and confirm numerically in Python, that the t-statistic can be written as

$$\frac{\sqrt{n - 1}\sum_{i=1}^nx_iy_i}{\sqrt{(\sum_{i=1}^nx_i^2)(\sum_{i=1}^ny_i^2) - (\sum_{i=1}^nx_iy_i)}}$$We have

$$t = \frac{\sum_ix_iy_y/\sum_jx_j^2}{\sqrt{\sum_i(y_i - x_i\hat{\beta})^2/(n - 1)\sum_jx_j^2}} = \frac{\sqrt{n - 1}\sum_ix_iy_i}{\sqrt{\sum_jx_j^2\sum_i(y_i - x_i\sum_jx_jy_j/\sum_jx_j^2)^2}} = \frac{\sqrt{n - 1}\sum_ix_iy_i}{\sqrt{(\sum_jx_j^2)(\sum_jy_j^2) - (\sum_jx_jy_j)^2}}$$Now let’s verify this result numerically.

```
In [ ]:
```n = x.size
((np.sqrt(n - 1) * np.sum(x * y))
/ np.sqrt(np.sum(x**2) * np.sum(y**2) - np.sum(x * y)**2))

This matches the calculated value above

- The commutative property of multiplication states that two numbers can be multiplied in either order.
- Inverting the response and feature will not alter the equations outcome.

```
In [ ]:
```lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()

```
In [ ]:
```lr = smf.ols(formula='x ~ y', data=data).fit()
lr.summary()

- In both cases above the T-statistic is 19.783.

Recall that the coefficient estimate β^ for the linear regression of Y onto X witout an intercept is given by (3.38). Under what circumstance is the coefficient estimate for the regression of X onto Y the same as the coefficient estimate for the regression of Y onto X ?

Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is different from the coefficient estimate for the regression of Y onto X.

Generate an example in Python with n = 100 observations in which the coefficient estimate for the regression of X onto Y is the same as the coefficient estimate for the regression of Y onto X.

1) Recall that the coefficient estimate β^ for the linear regression of Y onto X witout an intercept is given by (3.38). Under what circumstance is the coefficient estimate for the regression of X onto Y the same as the coefficient estimate for the regression of Y onto X?

The coefficient estimate for the regression of Y onto X is

$$\hat{\beta} = \frac{\sum_ix_iy_i}{\sum_jx_j^2}$$The coefficient estimate for the regression of X onto Y is

$$\hat{\beta}' = \frac{\sum_ix_iy_i}{\sum_jy_j^2}$$The coefficients are the same iff $\sum_jx_j^2 = \sum_jy_j^2$

```
In [ ]:
```np.random.seed(1)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100)
assert np.sum(x**2) != np.sum(y**2)

```
In [ ]:
```np.random.seed(1)
x = np.arange(100)
y = x.copy()
np.random.shuffle(x)
np.random.shuffle(y)
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
x[:10]
y[:10]
if np.sum(np.square(x)) == np.sum(np.square(y)):
print('The sums of squares are equal.')

```
In [ ]:
```lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()

```
In [ ]:
```lr = smf.ols(formula='x ~ y', data=data).fit()
lr.summary()

In this exercise you will create some simulated data and will fit simple linear regression models to it. Make sure to set the seed prior to starting part (1) to ensure consistent results.

Create a vector, x, containing 100 observations drawn from a N(0, 1) distribution. This represents a feature, X.

Create a vector, eps, containing 100 observations drawn from a N(0, 0.25) distribution i.e. a normal distribution with mean zero and variance 0.25.

Using x and eps, generate a vector y according to the model

Y = −1 + 0.5X + e

What is the length of the vector `y`

? What are the values of β0 and β1 in this linear model?

Create a scatterplot displaying the relationship between x and y. Comment on what you observe.

Fit a least squares linear model to predict y using

`x`

. Comment on the model obtained. How do β0 and β1 compare to β0 and β1?Display the least squares line on the scatterplot obtained in (4). Draw the population regression line on the plot, in a different color. Create an appropriate legend.

Now fit a polynomial regression model that predicts

`y`

using`x`

and`x^2`

. Is there evidence that the quadratic term improves the model fit? Explain your answer.Repeat (1)–(6) after modifying the data generation process in such a way that there is less noise in the data. The model (3.39) should remain the same. You can do this by decreasing the variance of the normal distribution used to generate the error term

`e`

in (2). Describe your results.Repeat (1)–(6) after modifying the data generation process in such a way that there is more noise in the data. The model (3.39) should remain the same. You can do this by increasing the variance of the normal distribution used to generate the error term in (b). Describe your results.

What are the confidence intervals for β0 and β1 based on the original data set, the noisier data set, and the less noisy data set? Comment on your results.

```
In [ ]:
```np.random.seed(1)
x = 1 * np.random.randn(100) + 0

```
In [ ]:
```eps = 0.25 * np.random.randn(100) + 0

3) Using x and eps, generate a vector y according to the model

```
Y = −1 + 0.5X + eps
```

What is the length of the vector y? What are the values of β0 and β1 in this linear model?

```
In [ ]:
```y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])

```
In [ ]:
```f'Length of Vector y: {y.shape[0]}'

- β0: y-intercept (-1)
- β1: slope of regression line (0.5)

4) Create a scatterplot displaying the relationship between x and y. Comment on what you observe.

```
In [ ]:
```ax = data.plot(kind='scatter', x='x', y='y')
ax.set_title('y vs x', fontsize=size['title'])
ax.set_xlabel('x', fontsize=size['label'])
ax.set_ylabel('y', fontsize=size['label'])
plt.show();

`y`

using `x`

. Comment on the model obtained. How do β^0 and β^1 compare to β0 and β1?

```
In [ ]:
```lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()

- predicted intercept is -0.9969 (actual = -1)
- predicted x-coefficient is 0.4897 (actual = 0.5)
- The F-statistic is not equal to zero, so a relationship exists between x and y.
- This is confirmed by the P-value of zero.

- The model has an $R^2$ value of 0.749, which is fair.
- The T-statistic is large indicating the y coefficient estimate is both large and precise.

```
In [ ]:
```exact_x = np.linspace(-2, 2, 100)
exact_y = -1 + 0.5 * exact_x
exact_data = pd.DataFrame(np.c_[exact_x, exact_y],
columns=['x', 'y'])
plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)

```
In [ ]:
```original_confidence = (lr.conf_int(0.05)
.rename(columns={0: 'Lower', 1: 'Upper'}))

```
In [ ]:
```lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()

```
In [ ]:
```np.random.seed(1)
x = 1 * np.random.randn(100) + 0
eps = 0.05 * np.random.randn(100) + 0
y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()

```
In [ ]:
```plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)

```
In [ ]:
```low_error_confidence = (lr.conf_int(0.05)
.rename(columns={0: 'Lower', 1: 'Upper'}))

- predicted intercept is -0.9926 (actual = -1)
- predicted x-coefficient is 0.5048 (actual = 0.5)
- The F-statistic is not equal to zero, so a relationship exists between x and y.
- This is confirmed by the P-value of zero.

- The model has an $R^2$ value of 0.989, which is very good.
- Acurate predictions could be made with this model.

- The T-statistic is large indicating the y coefficient estimate is both large and precise.

```
In [ ]:
```lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()

```
In [ ]:
```np.random.seed(1)
x = 1 * np.random.randn(100) + 0
eps = 1 * np.random.randn(100) + 0
y = -1 + 0.5 * x + eps
data = pd.DataFrame(np.c_[x, y], columns=['x', 'y'])
lr = smf.ols(formula='y ~ x', data=data).fit()
lr.summary()

- The F-statistic is not equal to zero, so a relationship exists between x and y.
- This is confirmed by the P-value of zero.

- The model has an $R^2$ value of 0.244, which is poor.
- This model does not represent the data.

- The T-statistics are not large indicating the x coefficient estimate is imprecise.

```
In [ ]:
```plot_regression(data, lr, ['x'], 'y', exact_reg=exact_data)

```
In [ ]:
```high_error_confidence = (lr.conf_int(0.05)
.rename(columns={0: 'Lower', 1: 'Upper'}))

```
In [ ]:
```lr = smf.ols(formula='y ~ x + np.square(x)', data=data).fit()
lr.summary()

```
In [ ]:
```print('Original Confidence Intervals')
original_confidence
print('Low Error Confidence Intervals')
low_error_confidence
print('High Error Confidence Intervals')
high_error_confidence

Perform the following commands:

np.random.seed(8)

x1 = np.random.rand(100)

x2 = .5 * x1 + np.random.rand(100) / 10

y = 2 + 2

x1 + .3x2 + np.random.randn(100)

The last line corresponds to creating a linear model in which y is a function of x1 and x2. Write out the form of the linear model. What are the regression coefficients?

What is the correlation between x1 and x2? Create a scatterplot displaying the relationship between the variables.

Using this data, fit a least squares regression to predict y using x1 and x2. Describe the results obtained. What are β0, β1, and β2? How do these relate to the true β0, β1, and β2? Can you reject the null hypothesis Ho:β1 = 0? How about the null hypothesis Ho:β2 = 0?

Now fit a least squares regression to predict y using only x1. Comment on your results. Can you reject the null hypothesis Ho: β1 = 0?

Now fit a least squares regression to predict y using only x2. Comment on your results. Can you reject the null hypothesis Ho: β1 = 0?

Do the results obtained in (3)–(5) contradict each other? Explain your answer.

Now suppose we obtain one additional observation, which was unfortunately mismeasured.

x1=c(x1 , 0.1)

x2=c(x2 , 0.8)

y=c(y,6)

Re-fit the linear models from (3) to (5) using this new data. What effect does this new observation have on the each of the models? In each model, is this observation an outlier? A high-leverage point? Both? Explain your answers.

```
In [ ]:
```np.random.seed(8)
x1 = np.random.rand(100)
x2 = 0.5 * x1 + np.random.rand(100) / 10
y = 2 + 2 * x1 + 0.3 * x2 + np.random.randn(100)
data = pd.DataFrame(np.c_[x1, x2, y], columns=['x1', 'x2', 'y'])

y = β0 + 2β1 + 0.3β2 + error

```
In [ ]:
```data.corr()
correlation_grid(data)

```
In [ ]:
```lr = smf.ols(formula='y ~ x1 + x2', data=data).fit()
lr.summary()

- The model does not acurately represent the data.
- Coefficients:
- β0: 1.9615 (actual: 1)
- β1: 5.9895 (actual: 2)
- β2: -6.3538 (actual: 0.3)

- The x1 variable has a P-value of 0 and is statistically significant.
- The null hypothesis my be rejected.

- The x2 variable has a P-value greater than 0.05 implying it is not statistically significant.
- The null hypothesis may be not rejected.

`y`

using only `x1`

. Comment on your results. Can you reject the null hypothesis H0:β1=0 ?

```
In [ ]:
```lr = smf.ols(formula='y ~ x1', data=data).fit()
lr.summary()

```
In [ ]:
```lr = smf.ols(formula='y ~ x2', data=data).fit()
lr.summary()

6) Do the results obtained in (3)–(5) contradict each other? Explain your answer.

7) Now suppose we obtain one additional observation, which was unfortunately mismeasured.

x1=c(x1, 0.1)

x2=c(x2, 0.8)

y=c(y, 6)

Re-fit the linear models from (3) to (5) using this new data. What effect does this new observation have on the each of the models? In each model, is this observation an outlier? A high-leverage point? Both? Explain your answers.

```
In [ ]:
```data.loc[100] = [0.1, 0.8, 6]
lr = smf.ols(formula='y ~ x1 + x2', data=data).fit()
lr.summary()
plot_residuals(lr)

```
In [ ]:
```lr = smf.ols(formula='y ~ x1', data=data).fit()
lr.summary()
plot_residuals(lr)

```
In [ ]:
```lr = smf.ols(formula='y ~ x2', data=data).fit()
lr.summary()
plot_residuals(lr)

This problem involves the `Boston`

data set. We will now try to predict per capita crime rate
using the other variables in this data set. In other words, per capita
crime rate is the response, and the other variables are the predictors.

For each predictor, fit a simple linear regression model to predict the response. Describe your results. In which of the models is there a statistically significant association between the predictor and the response? Create some plots to back up your assertions.

Fit a multiple regression model to predict the response using all of the predictors. Describe your results. For which predictors can we reject the null hypothesis H: β = 0?

How do your results from (1) compare to your results from (2)? Create a plot displaying the univariate regression coefficients from (1) on the x-axis, and the multiple regression coefficients from (2) on the y-axis. That is, each predictor is displayed as a single point in the plot. Its coefficient in a simple linear regression model is shown on the x-axis, and its coefficient estimate in the multiple linear regression model is shown on the y-axis.

Is there evidence of non-linear association between any of the predictors and the response? To answer this question, for each predictor X, fit a model of the form Y = β0 + β1X + β2X^2 + β3X^3 + E.

```
In [ ]:
```correlation_heatmap(boston, 'Boston')

```
In [ ]:
```correlation_grid(boston, title='Boston')

```
In [ ]:
```single_params = pd.Series()
features = [x for x in boston.columns
if x not in ('crim')]
for feature in features:
print('{}{}\n{}'.format('\n' * 2, '*' * 80, feature))
lr = smf.ols(formula=f'crim ~ {feature}', data=boston).fit()
lr.summary()
single_params.loc[feature] = lr.params.loc[feature]
boston.plot(kind='scatter', x='crim', y=feature)

```
In [ ]:
```features = ' + '.join([x for x in boston.columns
if x not in ('crim')])
lr = smf.ols(formula=f'crim ~ {features}', data=boston).fit()
lr.summary()

```
In [ ]:
```mult_params = lr.params.iloc[1:]
models = pd.DataFrame({'multiple': mult_params, 'single': single_params})
models

```
In [ ]:
```ax = models.plot(kind='scatter', x='multiple', y='single')
ax.set_title('Multiple Regression vs Single Regression',
fontsize=size['title'])
ax.set_xlabel('Multiple', fontsize=size['label'])
ax.set_ylabel('Single', fontsize=size['label'])
plt.show();

```
In [ ]:
```features = [x for x in boston.columns
if x not in ('crim')]
for feature in features:
print('{}{}\n{}'.format('\n' * 2, '*' * 80, feature))
model = f'{feature} + np.square({feature}) + np.power({feature}, 3)'
lr = smf.ols(formula=f'crim ~ {model}', data=boston).fit()
lr.summary()