In [0]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Sharper inline figures on high-DPI ("retina") displays.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default plot styling globally.
sns.set()

In [0]:
# Load the social-ads dataset; the first CSV column (labelled "segment"
# in the preview below) becomes the DataFrame index.
df = pd.read_csv("https://raw.githubusercontent.com/theleadio/datascience_demo/master/social_ads.csv", index_col=0)

In [14]:
# Preview the first five rows to confirm the data loaded as expected.
df.head()


Out[14]:
google facebook instagram sales size is_large area area_suburban area_urban
segment
1 230.1 37.8 69.2 22.1 large 1 rural 0 0
2 44.5 39.3 45.1 10.4 small 0 urban 0 1
3 17.2 45.9 69.3 9.3 small 0 rural 0 0
4 151.5 41.3 58.5 18.5 small 0 urban 0 1
5 180.8 10.8 58.4 12.9 large 1 suburban 1 0

In [15]:
# Preview the last five rows (index runs to 200, so 200 segments total).
df.tail()


Out[15]:
google facebook instagram sales size is_large area area_suburban area_urban
segment
196 38.2 3.7 13.8 7.6 small 0 suburban 1 0
197 94.2 4.9 8.1 9.7 small 0 urban 0 1
198 177.0 9.3 6.4 12.8 small 0 suburban 1 0
199 283.6 42.0 66.2 25.5 small 0 rural 0 0
200 232.1 8.6 8.7 13.4 large 1 rural 0 0

In [16]:
# Column dtypes and non-null counts: 4 float, 3 int, 2 object columns,
# no missing values in any of the 200 rows.
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 9 columns):
google           200 non-null float64
facebook         200 non-null float64
instagram        200 non-null float64
sales            200 non-null float64
size             200 non-null object
is_large         200 non-null int64
area             200 non-null object
area_suburban    200 non-null int64
area_urban       200 non-null int64
dtypes: float64(4), int64(3), object(2)
memory usage: 15.6+ KB

In [18]:
# Figure sizing and "talk" context for readable labels.
sns.set(rc={'figure.figsize':(12,10)})
sns.set_context("talk", font_scale=1)

# Correlation heatmap of the numeric columns.  df also holds object
# columns ('size', 'area'); pandas >= 2.0 raises if asked to correlate
# them implicitly, so numeric_only=True keeps this cell runnable while
# producing the same matrix older pandas produced by silently dropping them.
sns.heatmap(df.corr(numeric_only=True), cmap='Blues', annot=True)


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f002deb6f60>

In [23]:
# Visualize each ad channel against sales with side-by-side scatterplots
# sharing the y-axis.
# figsize belongs on the subplots() call: passing figsize to df.plot()
# when an existing Axes is supplied is at best ignored and at worst
# resizes the figure as a side effect of one panel.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8));
df.plot(kind='scatter', x='google', y='sales', ax=axs[0]);
df.plot(kind='scatter', x='facebook', y='sales', ax=axs[1]);
df.plot(kind='scatter', x='instagram', y='sales', ax=axs[2]);
# google shows the clearest upward trend; instagram is the weakest
# (hardest channel to predict sales from).


'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.

In [30]:
# Questions this analysis tries to answer:
# - Is there a relationship between ad spend and sales, and how strong is it?
# - Which ad channels contribute to sales?
# - What is the effect of each ad channel on sales?
# - How can we predict sales from spend?

# Start with simple linear regression, y = m*x + c
# (y = response/sales, x = feature/ad spend).

# Standard import for the R-like "formula notation" interface.
import statsmodels.formula.api as smf

# Specify the model, then fit it, in two explicit steps.
google_model = smf.ols(formula='sales ~ google', data=df)
lm = google_model.fit()

# Show the fitted intercept and slope.
lm.params

# Interpretation: with zero spend you still sell ~7.03 (thousand) units,
# and each extra $1,000 on google adds ~0.0475 thousand (~47.5) units.


Out[30]:
Intercept    7.032594
google       0.047537
dtype: float64

In [28]:
# Simple regression of sales on facebook spend.
facebook_model = smf.ols(formula='sales ~ facebook', data=df)
lm = facebook_model.fit()

# Fitted intercept and slope for the facebook-only model.
lm.params


Out[28]:
Intercept    9.311638
facebook     0.202496
dtype: float64

In [32]:
# Simple regression of sales on instagram spend.
instagram_model = smf.ols(formula='sales ~ instagram', data=df)
lm = instagram_model.fit()

# Fitted intercept and slope for the instagram-only model.
lm.params


Out[32]:
Intercept    12.351407
instagram     0.054693
dtype: float64

In [34]:
# If the boss spends 100 (thousand dollars) on google, how many sales
# should he expect?  Plug the budget into the fitted line from the
# google-only model: sales = slope * spend + intercept.
google_slope = 0.047537
google_intercept = 7.032594
budget = 100
budget * google_slope + google_intercept
# ~11.79, i.e. roughly 11.8 thousand units (all figures are in thousands).


Out[34]:
11.786294

In [43]:
# Standard import for the R-like "formula notation" interface.
import statsmodels.formula.api as smf

# Re-fit the google-only model: a later cell overwrote `lm` with the
# instagram model, so it must be re-created here before predicting.
lm = smf.ols(formula='sales ~ google', data=df).fit()

# Fitted coefficients.
lm.params

# Predict via the statsmodels API: wrap the new spend value in a
# DataFrame whose column name matches the formula term.
new_spend = pd.DataFrame({'google': [100]})
lm.predict(new_spend)


Out[43]:
0    11.786258
dtype: float64

In [48]:
# Visualize the fitted regression line (with confidence band) for
# google spend vs. sales.
sns.lmplot(x='google',y='sales',data=df,height=7, aspect=1.5);
# trailing semicolon suppresses the FacetGrid text repr



In [50]:
# R-squared of the google-only model: ~0.61 of the variance in sales
# is explained by google spend alone.
lm.rsquared


Out[50]:
0.611875050850071

In [55]:
# Multiple regression: sales on all three ad channels at once.
all_channels = 'sales ~ google + facebook + instagram'
lm = smf.ols(formula=all_channels, data=df).fit()

# Per-channel coefficients.
lm.params


Out[55]:
Intercept    2.938889
google       0.045765
facebook     0.188530
instagram   -0.001037
dtype: float64

In [57]:
# Re-fit the instagram-only model (duplicate of the earlier cell,
# repeated so its R-squared can be read off in the next cell).
instagram_only = smf.ols(formula='sales ~ instagram', data=df)
lm = instagram_only.fit()

# Intercept and slope.
lm.params


Out[57]:
Intercept    12.351407
instagram     0.054693
dtype: float64

In [59]:
lm.rsquared


Out[59]:
0.05212044544430516

In [61]:
# Re-fit the full three-channel model (overwrites the instagram-only `lm`).
lm = smf.ols(
    formula='sales ~ google + facebook + instagram',
    data=df,
).fit()

# Per-channel coefficients.
lm.params

# The near-zero (slightly negative) instagram coefficient suggests
# instagram spend adds little once google and facebook are accounted for.


Out[61]:
Intercept    2.938889
google       0.045765
facebook     0.188530
instagram   -0.001037
dtype: float64

In [63]:
lm.rsquared


Out[63]:
0.8972106381789522

In [66]:
# Same three-feature regression, now with scikit-learn.
from sklearn.linear_model import LinearRegression

# Design matrix X (features) and target vector y.
feature_cols = ['google', 'facebook', 'instagram']
X = df[feature_cols]
y = df['sales']

# Usual sklearn pattern: import, instantiate, fit.
lm = LinearRegression()
lm.fit(X, y)

# Same coefficients as the statsmodels fit — identical math, different
# library (machine-learning API vs. statistical-modelling API).
print("intercept", lm.intercept_)
print("coefficients:", lm.coef_)


intercept 2.938889369459412
coefficients: [ 0.04576465  0.18853002 -0.00103749]

In [73]:
# Forecast sales for spend of google=100, facebook=25, instagram=25
# (all in thousands of dollars).
# Pass a DataFrame carrying the training column names instead of a bare
# nested list: sklearn models fitted on a DataFrame warn about missing
# feature names on plain arrays, and named columns make the feature
# order explicit rather than positional.
lm.predict(pd.DataFrame([[100, 25, 25]], columns=['google', 'facebook', 'instagram']))
# ~12.2 (thousand) units.  Useful for "given 150k, where should we
# spend?" what-if analysis, but linear regression is sensitive to
# outliers, so the data must be cleaned carefully first.


Out[73]:
array([12.20266701])

In [75]:
# R-squared of the sklearn fit on the training data (capital X = the
# full feature matrix); matches statsmodels' lm.rsquared (~0.897)
# for the same three-feature model.
lm.score(X,y)


Out[75]:
0.8972106381789521

In [78]:
# Add the binary market-size dummy to the feature set.
feature_cols = ['google', 'facebook', 'instagram', 'is_large']
X = df[feature_cols]
y = df['sales']

# Instantiate and fit.
lm = LinearRegression()
lm.fit(X, y)

# Pair each feature name with its fitted coefficient.
list(zip(feature_cols, lm.coef_))

# is_large ~ 0.057: a large market adds about 57 units (values are in
# thousands).  Dummy coefficients like this only capture linear shifts;
# non-linear effects need a non-linear model.


Out[78]:
[('google', 0.04571982092436277),
 ('facebook', 0.1887281431342785),
 ('instagram', -0.001097679448351624),
 ('is_large', 0.05742385085482783)]

In [80]:
# Full feature set: the three channels plus the market-size and
# area dummies (rural is the baseline area).
feature_cols = ['google', 'facebook', 'instagram', 'is_large', 'area_urban', 'area_suburban']
X = df[feature_cols]
y = df['sales']

# Instantiate and fit.
lm = LinearRegression()
lm.fit(X, y)

print("intercept", lm.intercept_)
# Feature/coefficient pairs — the last expression is what the cell displays.
list(zip(feature_cols, lm.coef_))


intercept 2.8741909890879107
Out[80]:
[('google', 0.04574401036331374),
 ('facebook', 0.1878666955252581),
 ('instagram', -0.0010876977267108012),
 ('is_large', 0.07739660749747936),
 ('area_urban', 0.26813802165220113),
 ('area_suburban', -0.10656299015958548)]