In [6]:
# %pylab populates the interactive namespace with numpy/matplotlib names and
# renders figures inline. Later cells use `plt` and `pylab` without importing
# them explicitly, so they depend on this line having run first.
%pylab inline
# Apply the 'ggplot' style sheet to the figures drawn below.
pylab.style.use('ggplot')
import pandas as pd
In [7]:
# Fetch the Advertising CSV; its first column is used as the row index.
advertising_url = 'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv'
data = pd.read_csv(advertising_url, index_col=0)
# Preview the first rows of the loaded frame.
data.head()
Out[7]:
In [8]:
import seaborn as sns
# Pairwise plots of every column except the 'Sales' column.
predictors = data.drop('Sales', axis=1)
sns.pairplot(predictors)
Out[8]:
In [9]:
# Scatter each advertising channel against Sales, side by side on a shared
# y-axis, so the three feature/response relationships can be compared.
fig, axs = plt.subplots(nrows=1, ncols=3, sharey=True)
tv_ax, radio_ax, newspaper_ax = axs
# The figure size is supplied through the first plot call only.
data.plot.scatter(x='TV', y='Sales', ax=tv_ax, figsize=(16, 8))
data.plot.scatter(x='Radio', y='Sales', ax=radio_ax)
data.plot.scatter(x='Newspaper', y='Sales', ax=newspaper_ax)
Out[9]:
In [13]:
# Same three-panel layout as the scatter plots, drawn with seaborn's regplot
# for each predictor against Sales.
fig, axs = pylab.subplots(nrows=1, ncols=3, figsize=(16, 8), sharey=True)
tv_ax, radio_ax, newspaper_ax = axs
sns.regplot(x='TV', y='Sales', data=data, ax=tv_ax)
sns.regplot(x='Radio', y='Sales', data=data, ax=radio_ax)
sns.regplot(x='Newspaper', y='Sales', data=data, ax=newspaper_ax)
Out[13]:
In [19]:
# statsmodels' formula API takes R-style model formulas as strings.
import statsmodels.formula.api as smf
# Specify the single-predictor model (Sales explained by TV), then fit it.
tv_model = smf.ols(formula='Sales ~ TV', data=data)
lm1 = tv_model.fit()
# Show the fitted coefficients.
lm1.params
Out[19]:
In [18]:
# The TV coefficient is the fitted change in Sales per one-unit change in TV,
# so it describes an incremental association, not total sales.
# NOTE(review): scaling by 100 presumes TV is recorded in $1000s and Sales in
# 1000s of units -- confirm against the dataset description.
extra_units_per_100_dollars = lm1.params['TV'] * 100
print('An additional $100 of TV advertising is associated with about %s more units sold.'
      % round(extra_units_per_100_dollars))
In [20]:
# R-squared of the fitted 'Sales ~ TV' model.
lm1.rsquared
Out[20]:
In [21]:
# Two-predictor model: Sales regressed on TV and Radio.
tv_radio_model = smf.ols(formula='Sales ~ TV + Radio', data=data)
lm2 = tv_radio_model.fit()
# Show the fitted coefficients.
lm2.params
Out[21]:
In [22]:
# R-squared of the fitted 'Sales ~ TV + Radio' model.
lm2.rsquared
Out[22]:
In [23]:
# Three-predictor model: Sales regressed on TV, Radio and Newspaper.
all_channels_model = smf.ols(formula='Sales ~ TV + Radio + Newspaper', data=data)
lm3 = all_channels_model.fit()
# Show the fitted coefficients.
lm3.params
Out[23]:
In [24]:
# R-squared of the fitted 'Sales ~ TV + Radio + Newspaper' model.
lm3.rsquared
Out[24]:
In [ ]: