In [18]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Widen the text representation so wide frames are not wrapped early.
pd.set_option('display.width', 500)
# Show up to 100 columns before pandas truncates the display.
pd.set_option('display.max_columns', 100)
# Render DataFrames as HTML tables in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns #sets up styles and gives us more plotting options
In [19]:
# Website under analysis; hard-coded here in place of real user input.
URL = "tmrw.co" # User-entered website
In [20]:
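# Time period: 17th Jan - 16th April (arbitrary)
# API credentials
# NOTE(review): account identifiers are better kept in environment/config than in the notebook - confirm these are safe to share.
# Email address: 705762800217-compute@developer.gserviceaccount.com
# Key ID: 948ee8e2a420ef14a5d5a29bd35104fe2f1e6ed4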
In [21]:
# Open the file. The data was requested via the API Explorer using the following request parameters:
#Account: TMRW Tech Hub
#Property: TMRW
#View: All Web Site Data
#ids: ga:123303369
#start-date: 2017-01-15
#end-date: yesterday
#metrics
#ga:sessions
#ga:percentNewSessions
#ga:bounceRate
#ga:pageviewsPerSession
#ga:avgSessionDuration
#ga:goal1ConversionRate
#ga:goal1Completions
#dimensions
#ga:city
#ga:userAgeBracket
#sort
#ga:goal1ConversionRate
Skipped steps: check statistical validity; filter out spam traffic and our own dev/marketing IPs.
Algorithm of actions:
Bounce Rate
Avg. Session Duration
Goal 1 Completions
Goal 1 Conversion Rate
Pages / Session
Take Key_metrics and check the volume of traffic and conversions: which segment is the most extreme? Conversion bucket = a new array holding the maximum, e.g. `TMRW_users_city.max()`.
Define key metrics: Key_metrics = location/age. It can also be location/gender or age/gender, but those require a separate API call.
Open file
Visualise
In [22]:
# Read the exported CSV into a DataFrame and show it.
TMRW_users = pd.read_csv(filepath_or_buffer="files/TMRW_geo_loc_API.csv")
# TMRW_users[TMRW_users.Age=='55-64']
TMRW_users
Out[22]:
In [ ]:
In [23]:
# Swap the exported headers for short names, order the rows by conversion
# rate (ascending) and keep only the rows with more than 80 sessions.
TMRW_users.columns = [
    'city', 'age',
    'new_sessions', 'sessions', 'bounce_rate', 'asd',
    'goal1', 'goal1CR', 'PPS',
]
TMRW_users = TMRW_users.sort_values(by='goal1CR')
TMRW_users_filter = TMRW_users.loc[TMRW_users['sessions'] > 80]
TMRW_users_filter
Out[23]:
In [24]:
# Summary statistics of the filtered rows.
TMRW_users_filter.describe()
# TODO: will need to convert the 'asd' numbers into a time value
#TMRW_users_filter_new =pd.to_datetime(TMRW_users_filter['asd'], format='%H:%M')
Out[24]:
In [25]:
# Sanity check: flag the data set when the total number of sessions is below 80.
if TMRW_users['sessions'].sum() < 80:
    print("Error")
# TODO: remove segments whose session count is too small
# TODO: remove segments with a single conversion
# 1. Too small sessions: look at the mean number of sessions per row
TMRW_users['sessions'].describe()['mean']
Out[25]:
In [26]:
# Algorithm for bucketing the segments into sections.
# Buckets by goal1CR: keep the three filtered rows with the highest goal-1
# conversion rate.
TMRW_users_goal1CR = TMRW_users_filter.nlargest(3, 'goal1CR')
# Average the numeric metrics per city and per age bracket.
# numeric_only=True is needed because the frame also holds text columns
# (city / age); pandas >= 2.0 raises TypeError when asked to average them
# instead of silently dropping them as older versions did.
# NOTE: this is an unweighted mean of rates (an average of proportions), so it
# is only an approximation - acceptable for now.
TMRW_users_goal1CR_gCity = TMRW_users_goal1CR.groupby(['city']).mean(numeric_only=True)
TMRW_users_goal1CR_gAge = TMRW_users_goal1CR.groupby(['age']).mean(numeric_only=True)
TMRW_users_goal1CR_gCity
Out[26]:
In [38]:
# Age value of the row whose index label is 11.
# NOTE(review): 11 is a hard-coded label; .loc raises KeyError unless that row
# is one of the rows held in TMRW_users_goal1CR - confirm this is still intended.
TMRW_users_goal1CR.loc[11,'age']
Out[38]:
In [28]:
# The 'goal1' column of the per-age averages.
TMRW_users_goal1CR_gAge['goal1']
Out[28]:
In [35]:
# Largest 'age' value among the top-converting rows.
# NOTE(review): the original cell was two unfinished fragments (`.loc[]` and
# `'age'.max`) that did not parse; this is the most likely intended
# expression - confirm.
TMRW_users_goal1CR.loc[:, 'age'].max()
Out[35]:
In [30]:
# The most converting audience.
# nlargest() returns its rows in descending order, so the first row is the
# best converting city/age segment. Skip the message when no row is available.
if not TMRW_users_goal1CR.empty:
    top_segment = TMRW_users_goal1CR.iloc[0]
    print("%s, %s" % (top_segment['city'], top_segment['age'])
          + " is most converting Demographic category")

# Histogram of the conversion rates of the top converting buckets.
#x = TMRW_users_goal1CR_gAge.index
y = TMRW_users_goal1CR.loc[:, 'goal1CR']
plt.hist(y)
plt.title("Top converting buckets")
plt.xlabel("Conv rate")
plt.ylabel("Frequency")
plt.show()
In [31]:
# Three filtered rows with the highest bounce rate.
TMRW_users_bounce_rate = TMRW_users_filter.nlargest(n=3, columns='bounce_rate')
TMRW_users_bounce_rate
# TODO: if the largest traffic source is 'not_set', output an error instead
Out[31]:
In [70]:
# Three filtered rows with the most pages per session.
TMRW_users_PPS = TMRW_users_filter.nlargest(n=3, columns='PPS')
TMRW_users_PPS
Out[70]:
In [10]:
# Total every numeric metric per city.
# numeric_only=True keeps the text 'age' column out of the aggregation;
# pandas >= 2.0 would otherwise concatenate the age strings of each city.
TMRW_users_agcities = TMRW_users.groupby(["city"]).sum(numeric_only=True)
TMRW_users_agcities
Out[10]:
In [10]:
# Goal-1 completions per city.
# The columns are renamed to short names right after loading, so the column is
# 'goal1'; the exported header 'Goal 1 Completions' no longer exists and raised
# KeyError when the notebook was run top to bottom.
selected = TMRW_users_agcities.loc[:, 'goal1']
selected
Out[10]:
In [16]:
# Pie chart of each city's share of the values held in `selected`.
labels = selected.index
sizes = selected
# colours are taken from http://tools.medialab.sciences-po.fr/iwanthue/
# (matplotlib cycles through them when there are more wedges than colours)
colors = ['#1f394d','#2a7585', '#163c45', '#004a6e']
# One zero offset per wedge. plt.pie raises ValueError when `explode` does not
# have the same length as the data, so derive it from the data instead of
# hard-coding exactly four cities.
explode = (0,) * len(sizes)
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=False, startangle=90)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Cities by conversion')
plt.show()
# Conversion traffic
In [11]:
# Generate text
# TODO: the segment name is not filled in yet, so the message starts with a space.
print (" segment converts best")
In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the per-device export, give the columns short names and index the
# table by device so rows can be looked up by name.
input_mob = pd.read_csv(filepath_or_buffer='files/TMRW_mob.csv')
input_mob.columns = [
    'device',
    'sessions', '%news', 'new_users', 'bounce_rate',
    'PPS', 'ASD', 'goal1CR', 'goal1',
]
input_mob = input_mob.set_index(keys='device')
def p2f(x):
    """Convert a percentage such as '12.5%' into a fraction (0.125).

    Parameters
    ----------
    x : str or number
        Percentage text, with or without a '%' sign and surrounding
        whitespace. A bare number is read as percentage points (50 -> 0.5).

    Returns
    -------
    float
        The value divided by 100.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # str() lets already-numeric cells through; the outer strips remove
    # whitespace on either side of the '%' sign, which strip('%') alone leaves.
    text = str(x).strip().strip('%').strip()
    return float(text) / 100
# Show the device-indexed table.
input_mob
Out[120]:
In [121]:
# Extra conversions mobile would record at the desktop conversion rate:
# mobile sessions * desktop goal-1 rate - current mobile goal-1 value,
# truncated to a whole number.
conv_increase = int(
    float(input_mob.loc['mobile', 'sessions'])
    * p2f(input_mob.loc['desktop', 'goal1CR'])
    - float(input_mob.loc['mobile', 'goal1'])
)
In [157]:
# Compare the mobile and desktop goal-1 conversion rates and, when desktop
# converts clearly better, chart the mobile sessions against mobile goal-1
# values together with the potential uplift.
output_chart_data = input_mob.loc['mobile',['sessions','goal1']]

# convert percentages into float (each rate is read from its own device row;
# the original had the two rows swapped)
mob_CR = p2f(input_mob.loc['mobile','goal1CR'])
desk_CR = p2f(input_mob.loc['desktop','goal1CR'])


def funnel_cart():
    """Placeholder for a future funnel chart; currently does nothing."""
    return


# calculate difference: how many times better desktop converts than mobile.
# A mobile rate of zero would divide by zero, so treat it as infinitely worse
# whenever desktop converts at all.
if mob_CR == 0:
    mob_dif = float('inf') if desk_CR > 0 else 0.0
else:
    mob_dif = desk_CR / mob_CR

# The mobile insight is only valid when desktop converts more than 1.5x as
# well as mobile.
is_valid = mob_dif > 1.5

mob_conv = input_mob.loc['mobile','goal1CR']

# Build the chart only for a valid insight; previously is_valid was never read,
# so the chart and the message were produced even when the check failed.
# http://stackoverflow.com/questions/21397549/stack-bar-plot-in-matplotlib-and-add-label-to-each-section-and-suggestions
# http://pandas.pydata.org/pandas-docs/stable/visualization.html#bar-plots
if is_valid:
    #funnel_chart()
    ax = output_chart_data.plot.barh(stacked=True)
    ax.set_xlabel('Visits')
    ax.set_title('Only %s of all mobile visitors end up completing a conversion' % mob_conv)
    # TODO: need to specify the analytics time period
    print("Have %s more conversions per month by optmiising mobile UX" % conv_increase)
    plt.show()
In [146]:
## Continue here
# Inspect the mobile 'sessions' / 'goal1' values selected for the chart.
output_chart_data
Out[146]:
In [ ]:
In [ ]: