In [ ]:
# Pandas Data Library
!pip install --upgrade pandas
In [ ]:
# Matplotlib Visualization
!pip install --upgrade matplotlib
In [ ]:
# Statsmodel LM library
!pip install --upgrade statsmodels
In [ ]:
# import notebook libraries
import pandas as pd
import matplotlib.pyplot as plt
# this allows plots to appear directly in the notebook
%matplotlib inline
In [ ]:
# Read the Advertising data set into a pandas DataFrame, using the first CSV
# column as the row index.
# NOTE(review): the original location
# (http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv) is on a retired host;
# this points at the copy on the book's current site over https. Confirm the
# file is identical (columns TV, radio, newspaper, sales).
data = pd.read_csv('https://www.statlearning.com/s/Advertising.csv', index_col=0)
# preview the first rows
data.head()
In [ ]:
# Shape of the data frame, displayed as a (rows, columns) tuple
data.shape
In [ ]:
# Visualize the relationship between each feature and sales with one
# scatterplot per feature, side by side on a shared y-axis.
features = ['TV', 'radio', 'newspaper']
# Size the figure where it is created instead of via the first plot call.
fig, axs = plt.subplots(1, len(features), sharey=True, figsize=(16, 8))
for feature, ax in zip(features, axs):
    data.plot(kind='scatter', x=feature, y='sales', ax=ax)
In [ ]:
# Formula interface of statsmodels
import statsmodels.formula.api as smf

# Build an ordinary-least-squares model of sales on TV and fit it in one chained call
lm = smf.ols(
    formula='sales ~ TV',
    data=data,
).fit()

# Show the fitted coefficients
lm.params
In [ ]:
# The new observation must be supplied as a DataFrame with a 'TV' column:
# here, a single row with TV = 50
X_new = pd.DataFrame([50], columns=['TV'])
X_new.head()
In [ ]:
# Ask the fitted model for its prediction at the new value(s) in X_new
lm.predict(exog=X_new)
In [ ]:
# Manually calculate the prediction for TV = 50: intercept + slope * 50.
# The coefficients are read from the fitted model rather than typed in as
# rounded literals, so the result tracks the model and matches lm.predict().
lm.params['Intercept'] + lm.params['TV'] * 50
In [ ]:
# Two-row frame holding the smallest and largest observed TV values
X_new = pd.DataFrame({
    'TV': [data['TV'].min(), data['TV'].max()],
})
X_new.head()
In [ ]:
# Predict at the x values in X_new and keep the result for later use
preds = lm.predict(exog=X_new)
# display the stored predictions
preds
In [ ]:
# First, plot the observed data and keep the Axes it was drawn on
ax = data.plot(kind='scatter', x='TV', y='sales')
# Then draw the least squares line on that same Axes.
# Pass the 1-D 'TV' column (not the whole DataFrame) so x and y are both 1-D
# and the call does not rely on implicit DataFrame-to-array coercion.
ax.plot(X_new['TV'], preds, c='red', linewidth=2)