Linear Regression by Example



In [6]:

    
%pylab inline
pylab.style.use('ggplot')
import pandas as pd









    



Populating the interactive namespace from numpy and matplotlib



In [7]:

    
# Location of the CSV file that holds the data set analysed below.
ADVERTISING_CSV_URL = 'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv'

# The first CSV column is used as the DataFrame index rather than as a feature.
data = pd.read_csv(ADVERTISING_CSV_URL, index_col=0)

# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Check for Collinearity



In [8]:

    
import seaborn as sns

# Pairwise scatter plots of the predictor columns only: the response column
# ('Sales') is dropped first so the grid shows how the predictors relate to
# each other.
predictors = data.drop('Sales', axis=1)
sns.pairplot(predictors)









    Out[8]:





<seaborn.axisgrid.PairGrid at 0x28136469208>



In [9]:

    
# Visualize the relationship between each feature and the response with one
# scatter panel per feature. The y-axis is shared so the Sales scale is
# directly comparable across panels.
# The figure size is fixed when the figure is created, instead of being passed
# to only the first DataFrame.plot call and resizing the figure as a side effect.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
for ax, feature in zip(axs, ['TV', 'Radio', 'Newspaper']):
    data.plot(kind='scatter', x=feature, y='Sales', ax=ax)









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x28137205f60>



In [13]:

    
# One panel per feature: scatter of the feature against Sales with seaborn's
# fitted regression line drawn on top. The y-axis is shared across panels.
# `plt.subplots` is used (rather than `pylab.subplots`) to match the other
# plotting cell in this notebook.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
for ax, feature in zip(axs, ['TV', 'Radio', 'Newspaper']):
    sns.regplot(data=data, x=feature, y='Sales', ax=ax)









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x28137601630>



In [19]:

    
# statsmodels' formula interface accepts R-style model formulas.
import statsmodels.formula.api as smf

# Specify the model (Sales explained by TV alone), then fit it by ordinary
# least squares.
tv_only_model = smf.ols(formula='Sales ~ TV', data=data)
lm1 = tv_only_model.fit()

# Display the estimated intercept and the TV slope.
lm1.params









    Out[19]:





Intercept    7.032594
TV           0.047537
dtype: float64



In [18]:

    
# Scale the fitted TV slope by 100 and round it for display.
# NOTE(review): reading this as "units sold per $100" assumes TV and Sales are
# recorded in the same multiple (e.g. both in thousands) - confirm against the
# data set's documentation. The message wording ("TV sales") may also be
# intended as "TV advertising".
tv_slope = lm1.params['TV']
rounded_effect = round(tv_slope * 100)
print('Spending $100 on TV sales results in %s unit sales.' % rounded_effect)









    



Spending $100 on TV sales results in 5.0 unit sales.



In [20]:

    
# Coefficient of determination (R-squared) of the TV-only fit.
lm1.rsquared









    Out[20]:





0.61187505085007121



In [21]:

    
# Extend the model with Radio as a second predictor alongside TV.
tv_radio_formula = 'Sales ~ TV + Radio'
tv_radio_model = smf.ols(formula=tv_radio_formula, data=data)
lm2 = tv_radio_model.fit()

# Display the estimated intercept and both slopes.
lm2.params









    Out[21]:





Intercept    2.921100
TV           0.045755
Radio        0.187994
dtype: float64



In [22]:

    
# R-squared of the TV + Radio fit; compare with lm1.rsquared to see what
# adding Radio contributes.
lm2.rsquared









    Out[22]:





0.89719426108289568



In [23]:

    
# Full model: all three predictors (TV, Radio and Newspaper).
all_features_formula = 'Sales ~ TV + Radio + Newspaper'
all_features_model = smf.ols(formula=all_features_formula, data=data)
lm3 = all_features_model.fit()

# Display the estimated intercept and the three slopes.
lm3.params









    Out[23]:





Intercept    2.938889
TV           0.045765
Radio        0.188530
Newspaper   -0.001037
dtype: float64



In [24]:

    
# R-squared of the three-predictor fit; compare with lm2.rsquared to see what
# adding Newspaper contributes.
lm3.rsquared









    Out[24]:





0.89721063817895208



In [ ]:

	TV	Radio	Newspaper	Sales
1	230.1	37.8	69.2	22.1
2	44.5	39.3	45.1	10.4
3	17.2	45.9	69.3	9.3
4	151.5	41.3	58.5	18.5
5	180.8	10.8	58.4	12.9