In [1]:
# Import pandas in the same cell that first uses it so this cell runs on a
# fresh kernel (it raised NameError when `pd` was referenced before any import).
import pandas as pd

# Load the advertising dataset from the current working directory.
advt = pd.read_csv("Advertising.csv")

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the advertising CSV from the working directory into a DataFrame.
advt = pd.read_csv(filepath_or_buffer="Advertising.csv")

In [4]:
# Preview the first five rows of the raw frame.
advt.head(n=5)


Out[4]:
Unnamed: 0 TV Radio Newspaper Sales
0 1 230.1 37.8 69.2 22.1
1 2 44.5 39.3 45.1 10.4
2 3 17.2 45.9 69.3 9.3
3 4 151.5 41.3 58.5 18.5
4 5 180.8 10.8 58.4 12.9

In [5]:
# Preview the last five rows of the raw frame.
advt.tail(n=5)


Out[5]:
Unnamed: 0 TV Radio Newspaper Sales
195 196 38.2 3.7 13.8 7.6
196 197 94.2 4.9 8.1 9.7
197 198 177.0 9.3 6.4 12.8
198 199 283.6 42.0 66.2 25.5
199 200 232.1 8.6 8.7 13.4

In [ ]:


In [6]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
advt.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
Unnamed: 0    200 non-null int64
TV            200 non-null float64
Radio         200 non-null float64
Newspaper     200 non-null float64
Sales         200 non-null float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB

In [7]:
# Keep only the media-spend columns and the Sales column; this drops the
# "Unnamed: 0" column that came in from the CSV.
advt = advt.loc[:, ["TV", "Radio", "Newspaper", "Sales"]]

In [8]:
# Confirm the column selection by previewing the first five rows.
advt.head(n=5)


Out[8]:
TV Radio Newspaper Sales
0 230.1 37.8 69.2 22.1
1 44.5 39.3 45.1 10.4
2 17.2 45.9 69.3 9.3
3 151.5 41.3 58.5 18.5
4 180.8 10.8 58.4 12.9

In [9]:
# Summary statistics per column: count, mean, std, min, quartiles, and max.
advt.describe()


Out[9]:
TV Radio Newspaper Sales
count 200.000000 200.000000 200.000000 200.000000
mean 147.042500 23.264000 30.554000 14.022500
std 85.854236 14.846809 21.778621 5.217457
min 0.700000 0.000000 0.300000 1.600000
25% 74.375000 9.975000 12.750000 10.375000
50% 149.750000 22.900000 25.750000 12.900000
75% 218.825000 36.525000 45.100000 17.400000
max 296.400000 49.600000 114.000000 27.000000

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib


Using matplotlib backend: MacOSX

In [12]:
# Plot the distribution of the Sales column.
sns.distplot(advt["Sales"])


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x11898d358>

In [13]:
# Plot the distribution of the Sales column.
# NOTE(review): this call is identical to the one in the preceding cell —
# confirm whether a different column was intended here or the cell can be removed.
sns.distplot(advt.Sales)


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b6a3b38>

In [14]:
# Joint distribution of Newspaper spend against Sales.
# Pass the vectors by keyword: newer seaborn releases do not accept x/y
# positionally, while the keyword form also works on older releases.
sns.jointplot(x=advt.Newspaper, y=advt.Sales)


Out[14]:
<seaborn.axisgrid.JointGrid at 0x11b6a0a20>

In [15]:
# Correlation between TV spend and Sales.
advt["TV"].corr(advt["Sales"])


Out[15]:
0.78222442486160615

In [16]:
# Correlation between Newspaper spend and Sales.
advt["Newspaper"].corr(advt["Sales"])


Out[16]:
0.22829902637616528

In [17]:
# Pairwise correlation matrix across all columns (Pearson, the pandas default).
advt.corr(method="pearson")


Out[17]:
TV Radio Newspaper Sales
TV 1.000000 0.054809 0.056648 0.782224
Radio 0.054809 1.000000 0.354104 0.576223
Newspaper 0.056648 0.354104 1.000000 0.228299
Sales 0.782224 0.576223 0.228299 1.000000

In [18]:
# Visualize the pairwise correlation matrix as a heatmap.
sns.heatmap(data=advt.corr())


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d2ae4e0>

In [19]:
# Build a linear regression model.

# Sales is the response variable and TV is the predictor.
import statsmodels.formula.api as smf

In [21]:
# Fit an ordinary least squares regression of Sales on TV via the formula API.
lm = smf.ols(formula="Sales ~ TV", data=advt).fit()

In [22]:
# Fitted coefficients: the intercept and the TV slope.
lm.params


Out[22]:
Intercept    7.032594
TV           0.047537
dtype: float64

In [23]:
# 95% confidence intervals for the fitted coefficients (alpha=0.05 is the default).
lm.conf_int(alpha=0.05)


Out[23]:
0 1
Intercept 6.129719 7.935468
TV 0.042231 0.052843

In [24]:
# R-squared of the fitted model.
lm.rsquared


Out[24]:
0.61187505085007099

In [ ]: