In [1]:
import ga_utils as g
import pandas as pd
import auth as auth
import matplotlib
import config as config
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
%matplotlib inline

In [2]:
# Run the authentication flow from the local `auth` module. The call's return
# value is the cell's displayed result (a googleapiclient Resource in the saved output).
# NOTE(review): presumably this also sets up the client used by `ga_utils` — confirm.
auth.main()


Out[2]:
<googleapiclient.discovery.Resource at 0x11a021190>

In [3]:
# Identifier and reporting window shared by every query in this cell, defined
# once so the four requests cannot drift apart.
# NOTE(review): "94275425" is presumably the Google Analytics view/profile ID — confirm.
GA_VIEW_ID = "94275425"
GA_DATE_RANGE = {'start_date': '2015-05-01', 'end_date': '2016-05-20'}

# NOTE(review): these two variable names look swapped relative to the helpers
# that fill them (`pageviews` <- get_all_pages, `pages` <- get_all_pageviews).
# They are left as-is because later cells read `pages`; `pageviews` is not
# used by any later cell visible here.
pageviews = g.get_all_pages(GA_VIEW_ID, **GA_DATE_RANGE)
pages = g.get_all_pageviews(GA_VIEW_ID, **GA_DATE_RANGE)

sessions = g.get_sessions(GA_VIEW_ID, **GA_DATE_RANGE)
transactions = g.get_transactions_by_day(GA_VIEW_ID, **GA_DATE_RANGE)

In [29]:
# Cast every input to integers, then place the three sources side by side.
# axis=1 aligns rows on the index, so all three must share the same row index.
metric_frames = [
    pages.astype(int),
    transactions['ga:transactions'].astype(int),
    sessions['ga:sessions'].astype(int),
]
df = pd.concat(metric_frames, axis=1)

In [34]:
# Strip the 'ga:' prefix so columns can be referenced as plain identifiers
# (attribute access, statsmodels formulas). Rebinding instead of inplace=True
# keeps the cell safe to re-run and leaves the concatenated frame unmodified.
df = df.rename(columns=lambda name: name.replace('ga:', ''))

# Rows are excluded by their exact pageview counts.
# NOTE(review): presumably these are traffic outliers — confirm, and record why
# these three days were removed.
EXCLUDED_PAGEVIEW_COUNTS = [87293, 122848, 73522]
df = df[~df['pageviews'].isin(EXCLUDED_PAGEVIEW_COUNTS)]

# Display only (the result is not assigned): rows ordered from busiest to quietest.
df.sort_values(by='pageviews', ascending=False)


Out[34]:
date pageviews transactions sessions
143 20150921 51808 22 43080
308 20160304 46934 26 4145
309 20160305 45128 32 4396
310 20160306 40495 62 4343
316 20160312 37935 27 4465
156 20151004 36564 48 5426
144 20150922 34592 27 23314
317 20160313 32378 49 3942
323 20160319 31819 40 5109
315 20160311 30883 14 4320
321 20160317 28967 25 3808
313 20160309 27661 30 3688
319 20160315 27090 28 2889
325 20160321 25696 44 3402
324 20160320 25009 68 3800
153 20151001 24949 31 4167
311 20160307 24652 25 3715
320 20160316 24254 28 3508
155 20151003 24065 27 4115
154 20151002 23796 28 3935
318 20160314 23361 28 3083
322 20160318 22478 43 3172
62 20150702 22293 91 6966
157 20151005 22012 29 3772
149 20150927 20289 38 2721
312 20160308 19913 25 3236
152 20150930 19785 31 2959
151 20150929 19637 27 3253
326 20160322 19456 26 4628
228 20151215 18292 11 8417
... ... ... ... ...
9 20150510 4984 18 2831
382 20160517 4984 23 1516
7 20150508 4974 11 2645
371 20160506 4937 18 1681
39 20150609 4866 9 2305
57 20150627 4827 4 2220
8 20150509 4808 10 3391
42 20150612 4723 13 1832
13 20150514 4694 8 3420
3 20150504 4676 19 2161
199 20151116 4644 9 1981
372 20160507 4630 13 1545
41 20150611 4581 14 1897
383 20160518 4521 19 1439
10 20150511 4508 15 2612
43 20150613 4495 18 1621
38 20150608 4409 11 2199
384 20160519 4380 17 1395
6 20150507 4264 6 2129
35 20150605 4116 7 2636
139 20150917 4066 13 1686
16 20150517 4033 11 2178
12 20150513 3980 12 2339
11 20150512 3952 17 2145
15 20150516 3889 9 2465
14 20150515 3741 8 2755
17 20150518 3439 9 1772
36 20150606 3377 0 2617
37 20150607 3348 0 2183
385 20160520 2640 8 848

383 rows × 4 columns


In [35]:
# Side-by-side scatter panels with a shared y-axis: transactions against each
# candidate predictor (pageviews on the left, sessions on the right).
fig, axs = plt.subplots(1, 2, sharey=True)
left_ax, right_ax = axs
df.plot.scatter(x="pageviews", y='transactions', ax=left_ax, figsize=(16, 8))
df.plot.scatter(x="sessions", y='transactions', ax=right_ax, figsize=(16, 8))


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x11e30c990>

In [36]:
# Simple linear regression: transactions explained by pageviews alone.
# Model specification and fitting are kept as separate steps.
simple_model = smf.ols(formula='transactions ~ pageviews', data=df)
lm = simple_model.fit()

In [37]:
# Fitted coefficients of the simple model: the intercept and the pageviews slope.
lm.params


Out[37]:
Intercept    12.782877
pageviews     0.000734
dtype: float64

In [38]:
# Hand-computed prediction for a day with 50 pageviews: intercept + slope * 50.
# Coefficients are looked up by label rather than by integer key, because
# integer keys on a label-indexed Series rely on deprecated positional fallback.
lm.params['Intercept'] + lm.params['pageviews'] * 50


Out[38]:
12.819569426260056

In [39]:
# Two-row frame holding the smallest and largest observed pageview counts;
# two points are enough to draw the fitted straight line.
observed_pageviews = df['pageviews']
X_new = pd.DataFrame({'pageviews': [observed_pageviews.min(), observed_pageviews.max()]})
X_new.head()


Out[39]:
pageviews
0 2640
1 51808

In [40]:
# Model predictions for the rows of X_new (the minimum and maximum observed
# pageviews); these are the two endpoints of the regression line.
preds = lm.predict(X_new)
preds


Out[40]:
array([ 14.7202616 ,  50.80259094])

In [41]:
# Observed points with the fitted regression line drawn on the same axes.
# The Axes returned by the scatter call is reused explicitly for the line.
ax = df.plot(kind="scatter", x="pageviews", y='transactions', color=g.colours['blue'])
ax.plot(X_new, preds, linewidth=2, color=g.colours['red'])


Out[41]:
[<matplotlib.lines.Line2D at 0x11e3904d0>]

In [42]:
# Confidence interval (lower bound in column 0, upper bound in column 1) for each
# coefficient of the simple model. Called with no arguments, so the library's
# default confidence level applies (95% per the statsmodels documentation).
lm.conf_int()


Out[42]:
0 1
Intercept 10.618107 14.947646
pageviews 0.000553 0.000915

In [43]:
# p-value for each coefficient of the simple model.
lm.pvalues


Out[43]:
Intercept    6.882779e-27
pageviews    1.817306e-14
dtype: float64

In [44]:
# R-squared of the simple model: the share of variance in transactions that
# pageviews alone accounts for.
lm.rsquared


Out[44]:
0.14300401522255457

In [45]:
# Multiple regression: transactions explained by sessions and pageviews together.
multi_formula = 'transactions ~ sessions + pageviews'
lm_multi = smf.ols(formula=multi_formula, data=df).fit()

# Fitted coefficients: the intercept plus one slope per predictor.
lm_multi.params


Out[45]:
Intercept    14.100595
sessions     -0.000885
pageviews     0.000931
dtype: float64