In [336]:
# Analytics Reporting v4 call this body belongs to (kept for reference only):
#   POST https://analyticsreporting.googleapis.com/v4/reports:batchGet?fields=reports(columnHeader%2Cdata(rows%2Ctotals))&key={YOUR_API_KEY}
# The URL used to be assigned to `request` and was overwritten by the very next
# statement, so it is recorded as a comment instead of a dead assignment.
request = {
    "reportRequests": [
        {
            "viewId": "123303369",
            "dateRanges": [
                {"startDate": "2017-01-01", "endDate": "2017-04-30"}
            ],
            "metrics": [
                {"expression": "ga:sessions"},
                {"expression": "ga:sessionDuration"},
                # instead of "ga:goal1Completions" use "goal_to_use_in_request" variable from tracking-tags code
                {"expression": "ga:goal1Completions"},
                {"expression": "ga:bounceRate"},
            ],
            "dimensions": [
                {"name": "ga:city"},
                {"name": "ga:userAgeBracket"},
            ],
        }
    ]
}
In [337]:
import json
# Read the saved report JSON from disk.
with open('data/TMRW_user_groups.json') as file:
    input_ugroups = json.loads(file.read())

# Dimension names listed in the first report's column header.
input_ugroups_dimensions = input_ugroups['reports'][0]['columnHeader']['dimensions']
dimension_count = len(input_ugroups_dimensions)

# Raw metric header entries of the first report (reduced to names further down).
input_ugroups_metrics = input_ugroups['reports'][0]['columnHeader']['metricHeader']['metricHeaderEntries']
def create_metric_list(raw_data):
    """Return the 'name' value of every entry in raw_data, in order.

    raw_data is an iterable of mappings that each contain a 'name' key.
    """
    return [entry['name'] for entry in raw_data]
# Reduce the metric header entries to just their 'name' values.
input_ugroups_metrics = create_metric_list(input_ugroups_metrics)
# Create input data
# Row list of the first report; the following cells iterate over these rows.
input_ugroups_data = input_ugroups['reports'][0]['data']['rows']
input_ugroups_data
Out[337]:
In [338]:
# Reshape every report row in place into {"<dim>, <dim>": {"values": [...]}} and
# collect the value rows (label first, then the metrics as floats) in values_list.
values_list = []
for group in input_ugroups_data:
    if 'dimensions' in group:
        # Join all dimension values into one label instead of hard-coding two.
        new_dim_name = ", ".join(group.pop('dimensions'))
        metrics = group.pop('metrics')[0]
        metrics['values'] = [new_dim_name] + [float(v) for v in metrics['values']]
        group[new_dim_name] = metrics
    else:
        # Row was already reshaped by an earlier run of this cell; reuse it
        # rather than failing on the missing 'dimensions' key.
        (new_dim_name, metrics), = group.items()
    # The row dict and values_list share the same list object.
    values_list.append(metrics['values'])
values_list
Out[338]:
In [339]:
# Define each metric dict
# Maps every group label to its four metrics, each rounded to two decimals.
# Index 0 of a 'values' list holds the label, so the metrics start at index 1.
ugroups_data = {}
for ugroup in input_ugroups_data:
    for gr, payload in ugroup.items():
        vals = payload['values']
        ugroups_data[gr] = {
            'sessions': round(float(vals[1]), 2),
            'bounce_rate': round(float(vals[3]), 2),
            'conversions': round(float(vals[2]), 2),
            'conversion_rate': round(float(vals[4]), 2),
        }
ugroups_data
Out[339]:
In [340]:
import collections
from collections import OrderedDict
# Column names for the DataFrame: 'city_age' followed by the metric keys in the
# order they first appear in ugroups_data (relies on dicts keeping insertion order).
# NOTE(review): ugroups_data reads 'conversions' from values[2] and 'bounce_rate'
# from values[3], while the key order used here lists 'bounce_rate' before
# 'conversions' - confirm the column order really matches the value rows.
seen_columns = OrderedDict()
for metrics in ugroups_data.values():
    for metric in metrics:
        seen_columns.setdefault('city_age')
        seen_columns.setdefault(metric)
columns = list(seen_columns)
columns
Out[340]:
In [341]:
import pandas as pd
In [342]:
from io import StringIO

# Build the frame from the value rows and round-trip it through JSON
# (orient='split'), as before. The JSON text is wrapped in StringIO because
# passing a literal JSON string to read_json is deprecated in recent pandas.
df = pd.DataFrame(values_list, columns=columns)
table_data = pd.read_json(StringIO(df.to_json(orient='split')), orient='split')
table_data['conversion_rate'] = table_data['conversion_rate'].round(2)

# Keep only groups with conversions and order them by sessions.
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
# The index is renumbered 0..n-1 so the positional loops further down
# (table_data.sessions[i] for i in range(len(table_data))) find every row.
table_data = (
    table_data[table_data['conversions'] > 0]
    .sort_values('sessions')
    .reset_index(drop=True)
)
table_data
Out[342]:
In [469]:
# create a list of sessions
# The column is read positionally: table_data was filtered (conversions > 0) and
# re-sorted, so its index labels need not be 0..n-1 and a label lookup such as
# table_data.sessions[i] can raise KeyError or pick the wrong row.
samples_sessions = sorted(table_data['sessions'].values)
samples_sessions
Out[469]:
In [470]:
#cluster point 1(min)
# Reference point for the first cluster: the smallest session value shifted up by a
# fixed offset of 60 (the notebook does not state how the offset was chosen).
cl1_point = min(samples_sessions) + 60
cl1_point
Out[470]:
In [471]:
#cluster point 2(middle)
# Element at the halfway index of the sorted list, shifted up by 60.
mediana_number = len(samples_sessions) // 2
cl2_point = 60 + samples_sessions[mediana_number]
cl2_point
Out[471]:
In [472]:
#cluster point 3(max)
# Reference point for the third cluster: the largest session value shifted down by a
# fixed offset of 240 (the notebook does not state how the offset was chosen).
cl3_point = max(samples_sessions) - 240
cl3_point
Out[472]:
In [473]:
import math
In [474]:
# calculate distance between cl1 and each point in samples_sessions
# sqrt of the squared difference, one entry per row of table_data.
distance_cl1 = [
    math.sqrt(math.pow(cl1_point - samples_sessions[idx], 2))
    for idx in range(len(table_data))
]
distance_cl1
Out[474]:
In [475]:
# Distance between cl2_point and each point in samples_sessions
# (sqrt of the squared difference, one entry per row of table_data).
distance_cl2 = [
    math.sqrt(math.pow(cl2_point - samples_sessions[idx], 2))
    for idx in range(len(table_data))
]
distance_cl2
Out[475]:
In [476]:
# Distance between cl3_point and each point in samples_sessions
# (sqrt of the squared difference, one entry per row of table_data).
distance_cl3 = [
    math.sqrt(math.pow(cl3_point - samples_sessions[idx], 2))
    for idx in range(len(table_data))
]
distance_cl3
Out[476]:
In [478]:
#create cluster1 which include the smallest distance to cl1_point
# Keeps the cl1 distance of every point that is strictly closer to cl1_point
# than to cl2_point; only the length of this list is used by later cells.
cluster1 = [
    distance_cl1[pos]
    for pos in range(len(samples_sessions))
    if distance_cl1[pos] < distance_cl2[pos]
]
cluster1
Out[478]:
In [479]:
# Starting after the first len(cluster1) points, keep the cl2 distance of every
# point that is strictly closer to cl2_point than to cl3_point.
cluster2 = [
    distance_cl2[pos]
    for pos in range(len(cluster1), len(samples_sessions))
    if distance_cl2[pos] < distance_cl3[pos]
]
cluster2
Out[479]:
In [480]:
# Starting after the points counted for cluster1 and cluster2, keep the cl3
# distance of every point that is strictly closer to cl3_point than to cl2_point.
cluster3 = [
    distance_cl3[pos]
    for pos in range(len(cluster1) + len(cluster2), len(samples_sessions))
    if distance_cl3[pos] < distance_cl2[pos]
]
cluster3
Out[480]:
In [483]:
#mark first cluster as "low" and define a values for it
# The first len(cluster1) sorted session values become ["low", value] pairs.
cl1 = [["low", value] for value in samples_sessions[:len(cluster1)]]
cl1
Out[483]:
In [487]:
#create a table for cluster and value
# Two columns: the cluster label and the session value it was assigned to.
table_cl1 = pd.DataFrame(cl1,columns = ['cluster_sess','sessions'])
table_cl1
Out[487]:
In [488]:
# The next len(cluster2) sorted session values (right after the "low" ones)
# become ["mid", value] pairs.
cl2 = [
    ["mid", value]
    for value in samples_sessions[len(cluster1):len(cluster2) + len(cluster1)]
]
cl2
Out[488]:
In [489]:
# "mid" rows as a table; the index continues where the "low" table's positions end.
table_cl2 = pd.DataFrame(cl2,index = range(len(cluster1),len(cluster2)+len(cluster1)), columns = ['cluster_sess','sessions'])
table_cl2
Out[489]:
In [490]:
# Every remaining sorted session value (after the "low" and "mid" ones)
# becomes a ["high", value] pair.
cl3 = [
    ["high", value]
    for value in samples_sessions[len(cluster2) + len(cluster1):]
]
cl3
Out[490]:
In [491]:
# "high" rows as a table; the index covers the positions left after "low" and "mid".
table_cl3 = pd.DataFrame(cl3,index = range(len(cluster2)+len(cluster1),len(samples_sessions)), columns = ['cluster_sess','sessions'])
table_cl3
Out[491]:
In [493]:
#join clusters into one table
# Stacks the three label tables in the order low, mid, high.
result_clusters = pd.concat([table_cl1,table_cl2,table_cl3])
result_clusters
Out[493]:
In [494]:
# Attach the session cluster label to every table_data row.
# A session value that occurs more than once would match many-to-many and
# multiply rows, so one label is kept per value: the first one, which is the
# lowest cluster because result_clusters is stacked low, mid, high.
result_sess = result_clusters.drop_duplicates(subset='sessions').merge(table_data, on='sessions')
result_sess
Out[494]:
Cluster analysis by conversion rate
In [495]:
# Sorted conversion rates, each rounded to two decimals.
# The column is read positionally: table_data was filtered (conversions > 0) and
# re-sorted, so a label lookup such as table_data.conversion_rate[i] can raise
# KeyError or pick the wrong row.
samples_cr = sorted(round(rate, 2) for rate in table_data['conversion_rate'].values)
samples_cr
Out[495]:
In [496]:
# Reference point for the first conversion-rate cluster: the smallest rate shifted up
# by a fixed offset of 0.6 (the notebook does not state how the offset was chosen).
# This rebinds cl1_point, which earlier held the session-based point.
cl1_point=min(samples_cr) + 0.6
cl1_point
Out[496]:
In [498]:
# Element at the halfway index of the sorted rates, shifted down by 0.6.
mediana_number = len(samples_cr) // 2
cl2_point = samples_cr[mediana_number] - 0.6
cl2_point
Out[498]:
In [499]:
# Reference point for the third conversion-rate cluster: the largest rate shifted
# down by a fixed offset of 1 (the notebook does not state how the offset was chosen).
cl3_point=max(samples_cr)-1
cl3_point
Out[499]:
In [500]:
# Distance between cl1_point and each point in samples_cr
# (sqrt of the squared difference, one entry per row of table_data).
distance_cl1 = [
    math.sqrt(math.pow(cl1_point - samples_cr[idx], 2))
    for idx in range(len(table_data))
]
distance_cl1
Out[500]:
In [501]:
# Distance between cl2_point and each point in samples_cr
# (sqrt of the squared difference, one entry per row of table_data).
distance_cl2 = [
    math.sqrt(math.pow(cl2_point - samples_cr[idx], 2))
    for idx in range(len(table_data))
]
distance_cl2
Out[501]:
In [502]:
# Distance between cl3_point and each point in samples_cr
# (sqrt of the squared difference, one entry per row of table_data).
distance_cl3 = [
    math.sqrt(math.pow(cl3_point - samples_cr[idx], 2))
    for idx in range(len(table_data))
]
distance_cl3
Out[502]:
In [503]:
# Keeps the cl1 distance of every rate that is strictly closer to cl1_point
# than to cl2_point; only the length of this list is used by later cells.
cluster1 = [
    distance_cl1[pos]
    for pos in range(len(samples_cr))
    if distance_cl1[pos] < distance_cl2[pos]
]
cluster1
Out[503]:
In [504]:
# Starting after the first len(cluster1) rates, keep the cl2 distance of every
# rate that is strictly closer to cl2_point than to cl3_point.
cluster2 = [
    distance_cl2[pos]
    for pos in range(len(cluster1), len(samples_cr))
    if distance_cl2[pos] < distance_cl3[pos]
]
cluster2
Out[504]:
In [505]:
# Starting after the rates counted for cluster1 and cluster2, keep the cl3
# distance of every rate that is strictly closer to cl3_point than to cl2_point.
cluster3 = [
    distance_cl3[pos]
    for pos in range(len(cluster1) + len(cluster2), len(samples_cr))
    if distance_cl3[pos] < distance_cl2[pos]
]
cluster3
Out[505]:
In [506]:
# The first len(cluster1) sorted conversion rates become ["low", value] pairs.
cl1 = [["low", value] for value in samples_cr[:len(cluster1)]]
cl1
Out[506]:
In [507]:
# Two columns: the cluster label and the conversion rate it was assigned to.
table_cl1 = pd.DataFrame(cl1,columns = ['cluster_cr','conversion_rate'])
table_cl1
Out[507]:
In [508]:
# The next len(cluster2) sorted conversion rates (right after the "low" ones)
# become ["mid", value] pairs.
cl2 = [
    ["mid", value]
    for value in samples_cr[len(cluster1):len(cluster2) + len(cluster1)]
]
cl2
Out[508]:
In [509]:
# "mid" rows as a table; the index continues where the "low" table's positions end.
table_cl2 = pd.DataFrame(cl2,index = range(len(cluster1),len(cluster2)+len(cluster1)), columns = ['cluster_cr','conversion_rate'])
table_cl2
Out[509]:
In [510]:
# Every remaining sorted conversion rate (after the "low" and "mid" ones)
# becomes a ["high", value] pair.
cl3 = [
    ["high", value]
    for value in samples_cr[len(cluster2) + len(cluster1):]
]
cl3
Out[510]:
In [511]:
# "high" rows as a table; the index covers the positions left after "low" and "mid".
# The upper bound uses samples_cr, the list cl3 was built from, rather than the
# session list used by the session-based cells.
table_cl3 = pd.DataFrame(cl3,index = range(len(cluster2)+len(cluster1),len(samples_cr)), columns = ['cluster_cr','conversion_rate'])
table_cl3
Out[511]:
In [512]:
# Stacks the three conversion-rate label tables in the order low, mid, high.
# This rebinds result_clusters, which earlier held the session-based labels.
result_clusters = pd.concat([table_cl1,table_cl2,table_cl3])
result_clusters
Out[512]:
In [513]:
# Attach the conversion-rate cluster label to every table_data row.
# Rounded rates repeat easily; a repeated rate would match many-to-many and
# multiply rows, so one label is kept per value: the first one, which is the
# lowest cluster because result_clusters is stacked low, mid, high.
result_cr = result_clusters.drop_duplicates(subset='conversion_rate').merge(table_data, on='conversion_rate')
result_cr
Out[513]:
In [514]:
# Combine both labelings per group. Columns present on both sides get pandas'
# default suffixes: _x for result_cr (left) and _y for result_sess (right).
final = result_cr.merge(result_sess, on = 'city_age')
final
Out[514]:
In [515]:
# Keep the group label, both cluster labels and the left-hand (_x) metric columns.
final = final[["city_age","cluster_cr","cluster_sess","conversion_rate_x","conversions_x","sessions_x"]]
final
Out[515]:
In [516]:
import pandas as pd
In [517]:
# Combined label: conversion-rate cluster followed by session cluster, e.g. "midhigh".
# assign() returns a new frame, which avoids writing into a column-subset frame
# (the source of pandas' SettingWithCopyWarning).
final = final.assign(group=final["cluster_cr"] + final["cluster_sess"])
final
Out[517]:
In [518]:
import scipy
import numpy as np

# Summarise the rows labelled "midhigh": joined group names, total sessions,
# total conversions and the mean conversion rate.
group_midhigh = []
sess_mh = []
convs_mh = []
cr_mh = []
for i in range(len(final)):
    # .iloc reads by position, so the loop does not depend on the index labels.
    if final["group"].iloc[i] == "midhigh":
        sess_mh.append(final["sessions_x"].iloc[i])
        convs_mh.append(final["conversions_x"].iloc[i])
        cr_mh.append(final["conversion_rate_x"].iloc[i])
        group_midhigh.append(final["city_age"].iloc[i])
sess_midhigh = sum(sess_mh)
convs_midhigh = sum(convs_mh)
# numpy.mean replaces scipy.mean, a deprecated alias of the same NumPy function
# that newer SciPy releases no longer expose.
cr_midhigh = np.mean(cr_mh, 0)
group_midhigh = ";".join(group_midhigh)
group1 = [group_midhigh, sess_midhigh, convs_midhigh, cr_midhigh]
group1
Out[518]:
In [519]:
import numpy as np

# Summarise the rows labelled "highmid": joined group names, total sessions,
# total conversions and the mean conversion rate.
group_highmid = []
sess_hm = []
convs_hm = []
cr_hm = []
for i in range(len(final)):
    # .iloc reads by position, so the loop does not depend on the index labels.
    if final["group"].iloc[i] == "highmid":
        sess_hm.append(final["sessions_x"].iloc[i])
        convs_hm.append(final["conversions_x"].iloc[i])
        cr_hm.append(final["conversion_rate_x"].iloc[i])
        group_highmid.append(final["city_age"].iloc[i])
sess_highmid = sum(sess_hm)
convs_highmid = sum(convs_hm)
# numpy.mean replaces scipy.mean, a deprecated alias of the same NumPy function
# that newer SciPy releases no longer expose.
cr_highmid = np.mean(cr_hm, 0)
group_highmid = ";".join(group_highmid)
group2 = [group_highmid, sess_highmid, convs_highmid, cr_highmid]
group2
Out[519]:
In [520]:
import numpy as np

# Summarise the rows labelled "highlow": joined group names, total sessions,
# total conversions and the mean conversion rate.
group_highlow = []
sess_hl = []
convs_hl = []
cr_hl = []
for i in range(len(final)):
    # .iloc reads by position, so the loop does not depend on the index labels.
    if final["group"].iloc[i] == "highlow":
        sess_hl.append(final["sessions_x"].iloc[i])
        convs_hl.append(final["conversions_x"].iloc[i])
        cr_hl.append(final["conversion_rate_x"].iloc[i])
        group_highlow.append(final["city_age"].iloc[i])
sess_highlow = sum(sess_hl)
convs_highlow = sum(convs_hl)
# numpy.mean replaces scipy.mean, a deprecated alias of the same NumPy function
# that newer SciPy releases no longer expose.
cr_highlow = np.mean(cr_hl, 0)
group_highlow = ";".join(group_highlow)
group3 = [group_highlow, sess_highlow, convs_highlow, cr_highlow]
group3
Out[520]:
In [521]:
import numpy as np

# Summarise every row whose label is none of "highlow", "highmid", "midhigh":
# joined group names, total sessions, total conversions, mean conversion rate.
group = []
sess_group = []
convs_group = []
cr_group = []
for i in range(len(final)):
    # .iloc reads by position, so the loop does not depend on the index labels.
    if final["group"].iloc[i] not in ("highlow", "highmid", "midhigh"):
        sess_group.append(final["sessions_x"].iloc[i])
        convs_group.append(final["conversions_x"].iloc[i])
        cr_group.append(final["conversion_rate_x"].iloc[i])
        group.append(final["city_age"].iloc[i])
sess = sum(sess_group)
convs = sum(convs_group)
# numpy.mean replaces scipy.mean, a deprecated alias of the same NumPy function
# that newer SciPy releases no longer expose.
cr = np.mean(cr_group)
group = ";".join(group)
group4 = [group, sess, convs, cr]
group4
Out[521]:
In [522]:
# One summary row per group: [joined names, sessions, conversions, mean conversion rate].
results = [group1,group2,group3,group4]
results
Out[522]:
In [523]:
# Final summary table built from the four group rows.
final_table = pd.DataFrame(results, columns = ["group","sessions","conversions","conversion_rate"])
final_table
Out[523]:
In [ ]: