In [1]:
%matplotlib inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
In [2]:
# Advertising data located from link: http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv
# Keep the source location in one named constant so it is easy to spot and swap.
# NOTE(review): this is a remote read over plain HTTP -- confirm the host is
# still reachable, or point this at a local copy of the CSV.
ADVERTISING_URL = 'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv'

# The first CSV column is used as the row index rather than as a data column.
adv = pd.read_csv(ADVERTISING_URL, index_col=0)
In [3]:
# Preview the first five rows to sanity-check the load
adv.head(n=5)
Out[3]:
In [4]:
# Create a scatter matrix plot using pandas: one panel per pair of columns.
# `pd.scatter_matrix` was deprecated in pandas 0.20 and removed in later
# releases, so the top-level name raises AttributeError on current pandas;
# `pd.plotting.scatter_matrix` is the supported entry point.
# The trailing semicolon hides the array-of-Axes repr in the cell output.
pd.plotting.scatter_matrix(adv);
Out[4]:
In [5]:
# Simple linear regression of Sales on TV, fitted by ordinary least squares
est_m = smf.ols('Sales ~ TV', data=adv).fit()
In [6]:
# Regression table for the TV-only model (0.05 is the default significance
# level used for the reported confidence intervals)
est_m.summary(alpha=0.05)
Out[6]:
* TV advertising has an impact on sales.
In [7]:
# Build an evenly spaced grid of 100 TV values spanning the observed range,
# then evaluate the fitted TV-only model on that grid.
tv_lo, tv_hi = adv['TV'].min(), adv['TV'].max()
x_prime = pd.DataFrame({'TV': np.linspace(tv_lo, tv_hi, 100)})
y_hat = est_m.predict(x_prime)
In [8]:
# Raw observations with the fitted regression line drawn on top
plt.title("Example of Heteroskedasticity")
plt.xlabel("TV")
plt.ylabel("Sales")
plt.scatter(adv['TV'], adv['Sales'], alpha=0.3)
plt.plot(x_prime, y_hat, color='r', linewidth=2, alpha=0.9)
Out[8]:
In [9]:
# Residuals-vs-fitted plot for the TV-only model, drawn on a fresh figure
plt.figure()
fitted_sales = est_m.predict(adv)
residuals = est_m.resid
plt.scatter(fitted_sales, residuals, alpha=0.3)
plt.xlabel("Predicted Sales")
plt.ylabel("Residuals")
Out[9]:
In [10]:
# Same single-predictor model, but with the response log-transformed inside
# the formula
est_l = smf.ols('np.log(Sales) ~ TV', data=adv).fit()
In [11]:
# Predictions of the log-response model on the TV grid.
# Note: this rebinds `y_hat`, which previously held the TV-only model's
# predictions on the original Sales scale.
y_hat = est_l.predict(exog=x_prime)
In [12]:
# Log-scale observations with the log-model fit overlaid, on a fresh figure
plt.figure()
plt.title("Log Transformation of y")
plt.xlabel("TV")
plt.ylabel("log(Sales)")
log_sales = np.log(adv['Sales'])
plt.scatter(adv['TV'], log_sales, alpha=0.3)
plt.plot(x_prime, y_hat, color='r', linewidth=2, alpha=0.9)
# Fix the vertical window explicitly instead of relying on autoscaling
plt.ylim(1.5, 3.5)
Out[12]:
In [13]:
# Multiple regression: Sales on all three advertising channels
est_multi = smf.ols('Sales ~ TV + Radio + Newspaper', data=adv).fit()
In [14]:
# Regression table for the three-channel model (default 0.05 significance level)
est_multi.summary(alpha=0.05)
Out[14]:
In [15]:
# Single-predictor model using Newspaper only, for comparison with the
# three-channel fit
est_news = smf.ols('Sales ~ Newspaper', data=adv).fit()
In [16]:
# Regression table for the Newspaper-only model (default 0.05 significance level)
est_news.summary(alpha=0.05)
Out[16]:
In [18]:
# Two-predictor model: the three-channel formula with Newspaper left out
est_tv_radio = smf.ols('Sales ~ TV + Radio', data=adv).fit()
In [19]:
# Regression table for the TV + Radio model (default 0.05 significance level)
est_tv_radio.summary(alpha=0.05)
Out[19]:
In [ ]: