In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
%config InlineBackend.figure_format = 'retina'
sns.set()
In [0]:
# Load the social-ads dataset straight from GitHub; the CSV's first column is
# used as the row index.
SOCIAL_ADS_URL = "https://raw.githubusercontent.com/theleadio/datascience_demo/master/social_ads.csv"
df = pd.read_csv(SOCIAL_ADS_URL, index_col=0)
In [14]:
# Preview the first rows of the loaded frame (head() shows 5 rows by default).
df.head()
Out[14]:
In [15]:
# Preview the last rows of the loaded frame (tail() shows 5 rows by default).
df.tail()
Out[15]:
In [16]:
# Print a structural summary of the frame: columns, non-null counts and dtypes.
df.info()
In [18]:
# Correlation heatmap of the dataset's numeric columns.
# Note: sns.set(rc=...) changes the global figure size, so it also applies to
# figures drawn by later cells.
sns.set(rc={'figure.figsize': (12, 10)})
sns.set_context("talk", font_scale=1)

# Restrict the frame to numeric/bool columns before correlating. pandas >= 2.0
# no longer drops non-numeric columns silently in DataFrame.corr() and raises
# instead; select_dtypes keeps the cell working on both old and new pandas.
# NOTE(review): whether the CSV actually contains text columns is not visible
# from this notebook -- the filter is a no-op if it does not.
numeric_df = df.select_dtypes(include=['number', 'bool'])

# Trailing semicolon hides the Axes repr so only the figure is shown.
sns.heatmap(numeric_df.corr(), cmap='Blues', annot=True);
Out[18]:
In [23]:
# Visualize the relationship between each feature (ad channel) and the
# response (sales) with one scatterplot per channel, sharing the y axis.
channels = ['google', 'facebook', 'instagram']

# Size the figure when it is created rather than via the first plot call.
fig, axs = plt.subplots(1, len(channels), sharey=True, figsize=(16, 8))
for ax, channel in zip(axs, channels):
    df.plot(kind='scatter', x=channel, y='sales', ax=ax)

# Author's reading of the plots: google shows the clearest trend against sales;
# instagram shows the weakest, so it gives very little predictive certainty.
In [30]:
# Questions this analysis tries to answer:
#   - Is there a relationship between ads and sales?
#   - How strong is that relationship?
#   - Which ad types contribute to sales?
#   - What is the effect of each ad type on sales?
#   - How can we predict sales?
#
# Start with simple linear regression, y = m*x + c, where y is the response
# (sales) and x is a single feature (google spend).
# `smf` is the statsmodels formula API (R-style formula notation); it is
# imported in the notebook's import cell.

# Create a fitted model in one line.
lm = smf.ols(formula='sales ~ google', data=df).fit()

# How to read the coefficients:
#   Intercept -> predicted sales with zero google spend (about 7.03).
#   google    -> change in predicted sales per one unit of google spend
#                (about 0.0475).
# The notebook's notes say all figures are in thousands, so this would mean
# roughly 47 extra units sold per extra thousand dollars spent.
# NOTE(review): units are taken from the author's notes -- confirm against the data source.
lm.params
Out[30]:
In [28]:
# Simple regression of sales on facebook spend only.
facebook_ols = smf.ols(formula='sales ~ facebook', data=df)
lm = facebook_ols.fit()

# Show the fitted intercept and slope.
lm.params
Out[28]:
In [32]:
# Simple regression of sales on instagram spend only.
instagram_ols = smf.ols(formula='sales ~ instagram', data=df)
lm = instagram_ols.fit()

# Show the fitted intercept and slope.
lm.params
Out[32]:
In [34]:
# Manual forecast: if the boss spends 100k on google, how many sales should he
# expect? Plug the spend into y = m*x + c using the coefficients reported for
# the 'sales ~ google' model.
# All figures are in thousands (per the author's note), so 100 means 100k and
# the result of about 11.8 means roughly 11-12 thousand units.
intercept = 7.032594
slope = 0.047537
google_spend = 100

google_spend * slope + intercept
Out[34]:
In [43]:
# Same 100k google forecast, this time computed by the fitted statsmodels
# model instead of by hand.
#
# The google-only model is refit here on purpose: the cells above rebind `lm`
# to the facebook and then the instagram model, so predicting a frame that has
# only a 'google' column would fail without this refit.
lm = smf.ols(formula='sales ~ google', data=df).fit()

# predict() expects a DataFrame whose column names match the formula's terms.
x_pred = pd.DataFrame({'google': [100]})
lm.predict(x_pred)
Out[43]:
In [48]:
# Draw sales against google spend with seaborn's fitted regression line.
# The trailing semicolon keeps the returned grid object's repr out of the
# cell output.
sns.lmplot(
    data=df,
    x='google',
    y='sales',
    height=7,
    aspect=1.5,
);
In [50]:
# Show the R-squared of the model currently bound to `lm`.
# When the notebook is run top to bottom, that is the 'sales ~ google' fit
# from the statsmodels prediction cell above.
lm.rsquared
Out[50]:
In [55]:
# Multiple regression: sales explained by all three ad channels together.
all_channels_formula = 'sales ~ google + facebook + instagram'
lm = smf.ols(formula=all_channels_formula, data=df).fit()

# Show the intercept and one coefficient per channel.
lm.params
Out[55]:
In [57]:
# Refit the instagram-only model so that `lm` (and the R-squared shown in the
# next cell) refers to it rather than to the three-channel model.
instagram_ols = smf.ols(formula='sales ~ instagram', data=df)
lm = instagram_ols.fit()

# Show the fitted intercept and slope.
lm.params
Out[57]:
In [59]:
# R-squared of the model currently bound to `lm`; in top-to-bottom order this
# is the 'sales ~ instagram' fit from the cell directly above.
lm.rsquared
Out[59]:
In [61]:
# Refit the three-channel model so that `lm` (and the R-squared shown in the
# next cell) refers to it again.
all_channels_formula = 'sales ~ google + facebook + instagram'
lm = smf.ols(formula=all_channels_formula, data=df).fit()

# Author's reading of the coefficients: spending a lot on instagram does not
# give much return.
lm.params
Out[61]:
In [63]:
# R-squared of the model currently bound to `lm`; in top-to-bottom order this
# is the 'sales ~ google + facebook + instagram' fit from the cell directly above.
lm.rsquared
Out[63]:
In [66]:
# Fit the same three-channel linear model with scikit-learn.
# LinearRegression is imported in the notebook's import cell.

# Create X (feature matrix) and y (response).
feature_cols = ['google', 'facebook', 'instagram']
X = df[feature_cols]
y = df['sales']

# Usual sklearn pattern: instantiate, then fit.
# Note: `lm` is rebound here from a statsmodels result to an sklearn estimator,
# so the cells below must use the sklearn API (predict / score / coef_).
lm = LinearRegression()
lm.fit(X, y)

# Print intercept and coefficients.
# Author's note: the result is the same as the statsmodels fit above; this is
# the machine-learning route, the previous one the statistical route.
print("intercept", lm.intercept_)
print("coefficients:", lm.coef_)
In [73]:
# Forecast sales for a budget split of 100k google, 25k facebook, 25k instagram.
# Author's notes:
#   - the forecast comes out at about 12k units sold for this split;
#   - this is how to answer "with 150k in total, where should we spend?" --
#     try different splits and compare the predicted outcome;
#   - linear regression is very sensitive to outliers, so the data must be
#     cleaned well first.
#
# The model was fitted on a DataFrame, so predict with a DataFrame carrying the
# same column names: this avoids scikit-learn's "X does not have valid feature
# names" warning and makes the column-to-channel mapping explicit.
budget = pd.DataFrame(
    [[100, 25, 25]],
    columns=['google', 'facebook', 'instagram'],
)
lm.predict(budget)
Out[73]:
In [75]:
# Score the fitted sklearn model on X and y (when run top to bottom, the same
# data it was trained on). For LinearRegression, score() is the coefficient of
# determination, i.e. comparable to the statsmodels `rsquared` values above.
# Naming note: capital X holds several feature columns; lowercase y is the
# single response column.
lm.score(X,y)
Out[75]:
In [78]:
# Extend the feature set with the 'is_large' market-size column and refit.
feature_cols = ['google', 'facebook', 'instagram', 'is_large']
X, y = df[feature_cols], df['sales']

# fit() returns the estimator itself, so instantiate and fit in one step.
lm = LinearRegression().fit(X, y)

# Pair each feature name with its fitted coefficient.
# Author's reading: being in a large market adds about 57 units of sales.
# This additive interpretation only holds because the model is linear; a
# non-linear relationship would need a non-linear model.
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]
Out[78]:
In [80]:
# Extend the feature set further with the two area columns and refit.
feature_cols = [
    'google',
    'facebook',
    'instagram',
    'is_large',
    'area_urban',
    'area_suburban',
]
X, y = df[feature_cols], df['sales']

# fit() returns the estimator itself, so instantiate and fit in one step.
lm = LinearRegression().fit(X, y)

# Print the intercept, then pair each feature name with its coefficient.
print("intercept", lm.intercept_)
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]
Out[80]: