In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Notebook-wide display configuration.
# Show DataFrames with the plain-text repr rather than HTML tables.
pd.set_option('display.notebook_repr_html', False)
# Limit how many columns / rows pandas prints for a frame.
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)
# Default figure size. Set through pyplot's rc() because only
# `matplotlib.pyplot` is imported here; no `mpl` alias exists in this notebook.
plt.rc('figure', figsize=(10, 8))
In [2]:
# Read the advertising dataset from the local data directory, using the
# CSV's first column as the row index.
data = pd.read_csv('./data/Advertising.csv', index_col=[0])
# Preview the first rows of the loaded frame.
data.head()
Out[2]:
In [3]:
# One scatter panel per advertising channel: channel spend on the x-axis,
# Sales on the y-axis, all three panels side by side in a single row.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for ax, channel in zip(axes, ('TV', 'Radio', 'Newspaper')):
    ax.scatter(data[channel], data['Sales'], c='r')
    ax.set_title(channel)
fig.tight_layout()
In [4]:
import statsmodels.formula.api as sm
Simple linear regression
$ Y \approx \beta_0 + \beta_1X $
$ \text{sales} \approx \beta_0 + \beta_1 \times \text{TV} $
In [5]:
# Specify one single-predictor OLS model per advertising channel, each
# regressing Sales on that channel alone.
tv_model = sm.ols(formula='Sales ~ TV', data=data)
radio_model = sm.ols(formula='Sales ~ Radio', data=data)
np_model = sm.ols(formula='Sales ~ Newspaper', data=data)

# Fit every model; the fitted results are what the later plotting and
# summary cells read from.
tv_fitted = tv_model.fit()
radio_fitted = radio_model.fit()
np_fitted = np_model.fit()
In [6]:
# For each channel, draw the observed points (red circles) and overlay the
# fitted values of that channel's single-predictor regression (blue line).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
panels = (
    ('TV', tv_fitted),
    ('Radio', radio_fitted),
    ('Newspaper', np_fitted),
)
for ax, (channel, fitted) in zip(axes, panels):
    ax.plot(data[channel], data['Sales'], 'ro')
    ax.plot(data[channel], fitted.fittedvalues, 'b')
    ax.set_title(channel)
fig.tight_layout()
In [7]:
# Show the regression summary for the fitted 'Sales ~ TV' model.
tv_fitted.summary()
Out[7]:
In [8]:
# Show the regression summary for the fitted 'Sales ~ Radio' model.
radio_fitted.summary()
Out[8]:
In [9]:
# Show the regression summary for the fitted 'Sales ~ Newspaper' model.
np_fitted.summary()
Out[9]:
In [10]:
# Multiple regression: Sales modelled on all three advertising channels at
# once, then fitted and summarised in the same cell.
all_model = sm.ols(
    formula='Sales ~ TV + Radio + Newspaper',
    data=data,
)
all_fitted = all_model.fit()
all_fitted.summary()
Out[10]:
In [11]:
# Pairwise correlations between the columns of the advertising data.
data.corr()
Out[11]: