In [0]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Sharper inline figures on high-DPI ("retina") displays.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default plot styling globally.
sns.set()

In [0]:
# Load the social-ads dataset; the first CSV column (labelled "segment"
# in the preview below) becomes the DataFrame index.
df = pd.read_csv("https://raw.githubusercontent.com/theleadio/datascience_demo/master/social_ads.csv", index_col=0)

In [14]:
# Preview the first five rows to confirm the data loaded as expected.
df.head()


Out[14]:
google facebook instagram sales size is_large area area_suburban area_urban
segment
1 230.1 37.8 69.2 22.1 large 1 rural 0 0
2 44.5 39.3 45.1 10.4 small 0 urban 0 1
3 17.2 45.9 69.3 9.3 small 0 rural 0 0
4 151.5 41.3 58.5 18.5 small 0 urban 0 1
5 180.8 10.8 58.4 12.9 large 1 suburban 1 0

In [15]:
# Preview the last five rows (index runs to 200, so 200 segments total).
df.tail()


Out[15]:
google facebook instagram sales size is_large area area_suburban area_urban
segment
196 38.2 3.7 13.8 7.6 small 0 suburban 1 0
197 94.2 4.9 8.1 9.7 small 0 urban 0 1
198 177.0 9.3 6.4 12.8 small 0 suburban 1 0
199 283.6 42.0 66.2 25.5 small 0 rural 0 0
200 232.1 8.6 8.7 13.4 large 1 rural 0 0

In [16]:
# Column dtypes and non-null counts: 4 float, 3 int, 2 object columns,
# no missing values in any of the 200 rows.
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 9 columns):
google           200 non-null float64
facebook         200 non-null float64
instagram        200 non-null float64
sales            200 non-null float64
size             200 non-null object
is_large         200 non-null int64
area             200 non-null object
area_suburban    200 non-null int64
area_urban       200 non-null int64
dtypes: float64(4), int64(3), object(2)
memory usage: 15.6+ KB

In [18]:
# Figure sizing and "talk" context for readable labels.
sns.set(rc={'figure.figsize':(12,10)})
sns.set_context("talk", font_scale=1)

# Correlation heatmap of the numeric columns.  df also holds object
# columns ('size', 'area'); pandas >= 2.0 raises if asked to correlate
# them implicitly, so numeric_only=True keeps this cell runnable while
# producing the same matrix older pandas produced by silently dropping them.
sns.heatmap(df.corr(numeric_only=True), cmap='Blues', annot=True)


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f002deb6f60>

In [23]:
# Visualize each ad channel against sales with side-by-side scatterplots
# sharing the y-axis.
# figsize belongs on the subplots() call: passing figsize to df.plot()
# when an existing Axes is supplied is at best ignored and at worst
# resizes the figure as a side effect of one panel.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8));
df.plot(kind='scatter', x='google', y='sales', ax=axs[0]);
df.plot(kind='scatter', x='facebook', y='sales', ax=axs[1]);
df.plot(kind='scatter', x='instagram', y='sales', ax=axs[2]);
# google shows the clearest upward trend; instagram is the weakest
# (hardest channel to predict sales from).


'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.

In [30]:
# Questions this analysis tries to answer:
# - Is there a relationship between ad spend and sales, and how strong is it?
# - Which ad channels contribute to sales?
# - What is the effect of each ad channel on sales?
# - How can we predict sales from spend?

# Start with simple linear regression, y = m*x + c
# (y = response/sales, x = feature/ad spend).

# Standard import for the R-like "formula notation" interface.
import statsmodels.formula.api as smf

# Specify the model, then fit it, in two explicit steps.
google_model = smf.ols(formula='sales ~ google', data=df)
lm = google_model.fit()

# Show the fitted intercept and slope.
lm.params

# Interpretation: with zero spend you still sell ~7.03 (thousand) units,
# and each extra $1,000 on google adds ~0.0475 thousand (~47.5) units.


Out[30]:
Intercept    7.032594
google       0.047537
dtype: float64

In [28]:
# Simple regression of sales on facebook spend.
facebook_model = smf.ols(formula='sales ~ facebook', data=df)
lm = facebook_model.fit()

# Fitted intercept and slope for the facebook-only model.
lm.params


Out[28]:
Intercept    9.311638
facebook     0.202496
dtype: float64

In [32]:
# Simple regression of sales on instagram spend.
instagram_model = smf.ols(formula='sales ~ instagram', data=df)
lm = instagram_model.fit()

# Fitted intercept and slope for the instagram-only model.
lm.params


Out[32]:
Intercept    12.351407
instagram     0.054693
dtype: float64

In [34]:
# If the boss spends 100 (thousand dollars) on google, how many sales
# should he expect?  Plug the budget into the fitted line from the
# google-only model: sales = slope * spend + intercept.
google_slope = 0.047537
google_intercept = 7.032594
budget = 100
budget * google_slope + google_intercept
# ~11.79, i.e. roughly 11.8 thousand units (all figures are in thousands).


Out[34]:
11.786294

In [43]:
# Standard import for the R-like "formula notation" interface.
import statsmodels.formula.api as smf

# Re-fit the google-only model: a later cell overwrote `lm` with the
# instagram model, so it must be re-created here before predicting.
lm = smf.ols(formula='sales ~ google', data=df).fit()

# Fitted coefficients.
lm.params

# Predict via the statsmodels API: wrap the new spend value in a
# DataFrame whose column name matches the formula term.
new_spend = pd.DataFrame({'google': [100]})
lm.predict(new_spend)


Out[43]:
0    11.786258
dtype: float64

In [48]:
# Visualize the fitted regression line (with confidence band) for
# google spend vs. sales.
sns.lmplot(x='google',y='sales',data=df,height=7, aspect=1.5);
# trailing semicolon suppresses the FacetGrid text repr



In [50]:
# R-squared of the google-only model: ~0.61 of the variance in sales
# is explained by google spend alone.
lm.rsquared


Out[50]:
0.611875050850071

In [55]:
# Multiple regression: sales on all three ad channels at once.
all_channels = 'sales ~ google + facebook + instagram'
lm = smf.ols(formula=all_channels, data=df).fit()

# Per-channel coefficients.
lm.params


Out[55]:
Intercept    2.938889
google       0.045765
facebook     0.188530
instagram   -0.001037
dtype: float64

In [57]:
# Re-fit the instagram-only model (duplicate of the earlier cell,
# repeated so its R-squared can be read off in the next cell).
instagram_only = smf.ols(formula='sales ~ instagram', data=df)
lm = instagram_only.fit()

# Intercept and slope.
lm.params


Out[57]:
Intercept    12.351407
instagram     0.054693
dtype: float64

In [59]:
lm.rsquared


Out[59]:
0.05212044544430516

In [61]:
# Re-fit the full three-channel model (overwrites the instagram-only `lm`).
lm = smf.ols(
    formula='sales ~ google + facebook + instagram',
    data=df,
).fit()

# Per-channel coefficients.
lm.params

# The near-zero (slightly negative) instagram coefficient suggests
# instagram spend adds little once google and facebook are accounted for.


Out[61]:
Intercept    2.938889
google       0.045765
facebook     0.188530
instagram   -0.001037
dtype: float64

In [63]:
lm.rsquared


Out[63]:
0.8972106381789522

In [66]:
# Same three-feature regression, now with scikit-learn.
from sklearn.linear_model import LinearRegression

# Design matrix X (features) and target vector y.
feature_cols = ['google', 'facebook', 'instagram']
X = df[feature_cols]
y = df['sales']

# Usual sklearn pattern: import, instantiate, fit.
lm = LinearRegression()
lm.fit(X, y)

# Same coefficients as the statsmodels fit — identical math, different
# library (machine-learning API vs. statistical-modelling API).
print("intercept", lm.intercept_)
print("coefficients:", lm.coef_)


intercept 2.938889369459412
coefficients: [ 0.04576465  0.18853002 -0.00103749]

In [73]:
# Forecast sales for spend of google=100, facebook=25, instagram=25
# (all in thousands of dollars).
# Pass a DataFrame carrying the training column names instead of a bare
# nested list: sklearn models fitted on a DataFrame warn about missing
# feature names on plain arrays, and named columns make the feature
# order explicit rather than positional.
lm.predict(pd.DataFrame([[100, 25, 25]], columns=['google', 'facebook', 'instagram']))
# ~12.2 (thousand) units.  Useful for "given 150k, where should we
# spend?" what-if analysis, but linear regression is sensitive to
# outliers, so the data must be cleaned carefully first.


Out[73]:
array([12.20266701])

In [75]:
# R-squared of the sklearn fit on the training data (capital X = the
# full feature matrix); matches statsmodels' lm.rsquared (~0.897)
# for the same three-feature model.
lm.score(X,y)


Out[75]:
0.8972106381789521

In [78]:
# Add the binary market-size dummy to the feature set.
feature_cols = ['google', 'facebook', 'instagram', 'is_large']
X = df[feature_cols]
y = df['sales']

# Instantiate and fit.
lm = LinearRegression()
lm.fit(X, y)

# Pair each feature name with its fitted coefficient.
list(zip(feature_cols, lm.coef_))

# is_large ~ 0.057: a large market adds about 57 units (values are in
# thousands).  Dummy coefficients like this only capture linear shifts;
# non-linear effects need a non-linear model.


Out[78]:
[('google', 0.04571982092436277),
 ('facebook', 0.1887281431342785),
 ('instagram', -0.001097679448351624),
 ('is_large', 0.05742385085482783)]

In [80]:
# Full feature set: the three channels plus the market-size and
# area dummies (rural is the baseline area).
feature_cols = ['google', 'facebook', 'instagram', 'is_large', 'area_urban', 'area_suburban']
X = df[feature_cols]
y = df['sales']

# Instantiate and fit.
lm = LinearRegression()
lm.fit(X, y)

print("intercept", lm.intercept_)
# Feature/coefficient pairs — the last expression is what the cell displays.
list(zip(feature_cols, lm.coef_))


intercept 2.8741909890879107
Out[80]:
[('google', 0.04574401036331374),
 ('facebook', 0.1878666955252581),
 ('instagram', -0.0010876977267108012),
 ('is_large', 0.07739660749747936),
 ('area_urban', 0.26813802165220113),
 ('area_suburban', -0.10656299015958548)]