In [38]:
# NOTE(review): the original bound `request` to the endpoint URL string and
# then immediately rebound `request` to the body dict, losing the URL.
# Keep the endpoint under its own name so both survive.
# Google Analytics Reporting API v4 batchGet endpoint (API key placeholder).
request_url = "POST https://analyticsreporting.googleapis.com/v4/reports:batchGet?fields=reports(columnHeader%2Cdata(rows%2Ctotals))&key={YOUR_API_KEY}"

# Report request body: sessions, session duration, goal-1 completions and
# bounce rate, broken down by city and user age bracket, Jan-Apr 2017.
request = {
    "reportRequests": [
        {
            "viewId": "123303369",
            "dateRanges": [
                {
                    "startDate": "2017-01-01",
                    "endDate": "2017-04-30"
                }
            ],
            "metrics": [
                {"expression": "ga:sessions"},
                {"expression": "ga:sessionDuration"},
                {"expression": "ga:goal1Completions"},
                {"expression": "ga:bounceRate"}
            ],
            "dimensions": [
                {"name": "ga:city"},
                {"name": "ga:userAgeBracket"}
            ]
        }
    ]
}
In [39]:
# Load a saved Analytics Reporting API v4 response from disk.
# NOTE(review): path is relative to the notebook's working directory —
# assumes a local data/ folder exists; confirm before re-running.
import json
with open('data/TMRW_user_groups.json') as file:
    input_ugroups = json.load(file)
#input_ugroups
# Define dimensions list (presumably ga:city and ga:userAgeBracket, matching
# the request above — verify against the saved file).
input_ugroups_dimensions = input_ugroups['reports'][0]['columnHeader']['dimensions']
dimension_count = len(input_ugroups_dimensions)
# Define metrics list: raw metricHeaderEntries dicts, reduced to names later.
input_ugroups_metrics = input_ugroups['reports'][0]['columnHeader']['metricHeader']['metricHeaderEntries']
def create_metric_list(raw_data):
    """Return the 'name' field of every metric header entry, in order."""
    return [entry['name'] for entry in raw_data]
# Replace the raw header entries with just their metric names.
input_ugroups_metrics = create_metric_list(input_ugroups_metrics)
# Create input data: one API row per (city, age bracket) combination.
input_ugroups_data = input_ugroups['reports'][0]['data']['rows']
input_ugroups_data
Out[39]:
In [40]:
# Reshape each API row in place: re-key the metrics dict under a combined
# "city, age bracket" label and collect per-group value rows for a DataFrame.
values_list = []
for group in input_ugroups_data:
    # Combined dimension label, e.g. "London, 25-34".
    new_dim_name = group['dimensions'][0] + ", " + group['dimensions'][1]
    # NOTE(review): mutates input_ugroups_data in place — re-running this
    # cell without re-loading the JSON will KeyError on 'dimensions'.
    group[new_dim_name] = group['metrics'][0]
    del group['dimensions']
    del group['metrics']
    #conv_rate = round(float(int(group[new_dim_name]['values'][2])/int(group[new_dim_name]['values'][0])*100),2)
    # Metric values arrive as strings; coerce to float, then append the label
    # so each row becomes [v0, v1, v2, v3, "city, age"].
    # NOTE(review): rows keep the API's metric order (per the request:
    # sessions, sessionDuration, goal1Completions, bounceRate) — confirm this
    # matches the column names assigned in later cells.
    group[new_dim_name]['values'] = list(map(float,group[new_dim_name]['values']))
    group[new_dim_name]['values'].append(new_dim_name)
    values_list.append(group[new_dim_name]['values'])
    #list(map(float,group[new_dim_name]['values'])).append(new_dim_name)
    #values_list = values_list.apply(lambda x: x[0][0][1])
    #group[new_dim_name]['values'].append(conv_rate)
    #print(group[new_dim_name])
values_list
#input_ugroups_data
Out[40]:
In [41]:
# Define each metric dict: per-group summary keyed by the "city, age" label.
ugroups_data = {}
for ugroup in input_ugroups_data:
    #print (ugroup)
    # After the previous cell each `ugroup` dict has exactly one key: the
    # combined label.
    for gr in ugroup:
        ugroups_data[gr] = {'sessions':0,
                            'bounce_rate':0,
                            'conversions':0,
                            'conversion_rate':0,
                            'city_age':'',}
        # NOTE(review): values[1] is read as conversions and values[2] as
        # bounce rate here, but the request ordered the metrics as sessions,
        # sessionDuration, goal1Completions, bounceRate — verify the mapping
        # against the actual saved data.
        ugroups_data[gr]['sessions'] = round(float(ugroup[gr]['values'][0]),2)
        ugroups_data[gr]['conversions'] = round(float(ugroup[gr]['values'][1]),2)
        ugroups_data[gr]['bounce_rate'] = round(float(ugroup[gr]['values'][2]),2)
        ugroups_data[gr]['conversion_rate'] = round(float(ugroup[gr]['values'][3]),2)
        # NOTE(review): this stores the list of ALL labels seen so far, not
        # this group's own label — looks unintended; confirm downstream use.
        ugroups_data[gr]['city_age'] = list(ugroups_data.keys())
#ugroups_data
In [42]:
import collections
from collections import OrderedDict

# Gather the metric names used across all groups, de-duplicated while
# preserving first-seen order (these become the DataFrame column names).
columns = []
for group_label in ugroups_data:
    columns.extend(ugroups_data[group_label].keys())
columns = list(OrderedDict.fromkeys(columns))
columns
Out[42]:
In [43]:
import pandas as pd
In [44]:
df = pd.DataFrame(values_list, columns = columns)
# Round-trip through JSON (orient='split') to normalise dtypes.
# NOTE(review): the original also evaluated df.to_json(orient='split') on its
# own line and discarded the result — that dead statement is removed here.
# NOTE(review): pandas >= 2.1 deprecates read_json on a raw string; wrap the
# payload in io.StringIO if the environment is upgraded.
table_data = pd.read_json(df.to_json(orient='split'), orient='split')
table_data.conversion_rate = round(table_data.conversion_rate,2)
# Keep only groups that actually converted.
table_data = table_data[table_data.conversions > 0]
table_data
Out[44]:
In [45]:
# One-element lists of session counts, sorted ascending (the shape the
# k-means SAMPLES array below expects).
# BUG FIX: the original indexed table_data.sessions[i] positionally, but
# table_data was filtered (conversions > 0), so its integer index has gaps
# and label-based lookups by position can raise KeyError. Iterate the Series
# values directly instead.
samples_sessions = [[session_count] for session_count in table_data.sessions]
samples_sessions = sorted(samples_sessions, key=lambda x: x[0])
samples_sessions
Out[45]:
In [46]:
# Index of the median sample (lower median when the count is even).
mediana_number = len(samples_sessions) // 2
mediana_number
Out[46]:
In [47]:
import random
import math
# k-means configuration for the 1-D session-count samples.
NUM_CLUSTERS = 3
TOTAL_DATA = len(samples_sessions)
# samples_sessions is sorted, so min sits at index 0 and max at the end;
# .index() returns the FIRST occurrence if values repeat.
LOWEST_SAMPLE_POINT = samples_sessions.index(min(samples_sessions)) #element 9 of SAMPLES.
Middle_SAMPLE_POINT = mediana_number
HIGHEST_SAMPLE_POINT = samples_sessions.index(max(samples_sessions)) #element 6 of SAMPLES.
BIG_NUMBER = math.pow(10, 10)  # sentinel "infinity" for distance comparisons
SAMPLES = samples_sessions
data1 = []       # DataPoint instances — module-level k-means state
centroids = []   # Centroid instances
class DataPoint:
    """A 1-D sample with a mutable cluster assignment."""

    def __init__(self, x):
        self.x = x

    def set_x(self, value):
        self.x = value

    def get_x(self):
        return self.x

    def set_cluster(self, cluster_number):
        # Attribute name kept as in the original (clusterNumber).
        self.clusterNumber = cluster_number

    def get_cluster(self):
        return self.clusterNumber
class Centroid:
    """A 1-D cluster centre."""

    def __init__(self, x):
        self.x = x

    def set_x(self, value):
        self.x = value

    def get_x(self):
        return self.x
In [48]:
def initialize_centroids():
    """Seed one centroid at the lowest, median and highest sample values."""
    for seed_index in (LOWEST_SAMPLE_POINT, Middle_SAMPLE_POINT, HIGHEST_SAMPLE_POINT):
        centroids.append(Centroid(SAMPLES[seed_index][0]))
    print("Centroids initialized at:")
    # The original printed centroids[0..2] explicitly; loop over the same
    # three indices.
    for k in range(3):
        print("(", centroids[k].get_x(), ")")
    print()
    return
#print(initialize_centroids())
#print(centroids.append(Centroid(SAMPLES[HIGHEST_SAMPLE_POINT][0])))
In [49]:
def initialize_datapoints():
    """Wrap each sample in a DataPoint; seed the lowest/median/highest points
    with their matching cluster and leave the rest unassigned (None)."""
    # Later keys win on collision, so LOWEST overrides Middle and HIGHEST,
    # matching the original if/elif priority.
    seed_cluster = {HIGHEST_SAMPLE_POINT: 2,
                    Middle_SAMPLE_POINT: 1,
                    LOWEST_SAMPLE_POINT: 0}
    for i in range(TOTAL_DATA):
        point = DataPoint(SAMPLES[i][0])
        point.set_cluster(seed_cluster.get(i))
        data1.append(point)
    return
return
In [50]:
def get_distance(dataPointX, centroidX):
    """Absolute (Euclidean, 1-D) distance between a point and a centroid.

    Equivalent to the original sqrt((a - b)**2) but avoids the needless
    square/sqrt round-trip, which can overflow for very large magnitudes.
    Always returns a float.
    """
    return math.fabs(centroidX - dataPointX)
In [51]:
def recalculate_centroids():
    """Move each centroid to the mean of the points currently in its cluster.

    BUG FIX: the running totals were initialised once, outside the cluster
    loop, so centroid j's "mean" accumulated every preceding cluster's points
    as well. They now reset per cluster. A cluster with no members keeps its
    current position.
    """
    for j in range(NUM_CLUSTERS):
        totalX = 0
        totalInCluster = 0
        for k in range(len(data1)):
            if data1[k].get_cluster() == j:
                totalX += data1[k].get_x()
                totalInCluster += 1
        if totalInCluster > 0:
            centroids[j].set_x(totalX / totalInCluster)
    return

# NOTE(review): the original executed print(recalculate_centroids()) here,
# which only printed None (before any data points exist); removed.
In [52]:
def update_clusters():
    """Assign every point to its nearest centroid.

    Returns 1 if any assignment changed (so k-means must iterate again),
    otherwise 0.
    BUG FIX: the original called set_cluster(currentCluster) BEFORE comparing
    the old and new assignments, so the "did it move?" check could never fire
    and the algorithm stopped after a single pass. Capture the previous
    cluster first.
    """
    isStillMoving = 0
    for i in range(TOTAL_DATA):
        bestMinimum = BIG_NUMBER
        currentCluster = 0
        for j in range(NUM_CLUSTERS):
            distance = get_distance(data1[i].get_x(), centroids[j].get_x())
            if distance < bestMinimum:
                bestMinimum = distance
                currentCluster = j
        previousCluster = data1[i].get_cluster()
        if previousCluster is None or previousCluster != currentCluster:
            data1[i].set_cluster(currentCluster)
            isStillMoving = 1
    return isStillMoving
In [53]:
def perform_kmeans():
    """Alternate centroid recalculation and reassignment until stable."""
    initialize_centroids()
    initialize_datapoints()
    keep_going = True
    while keep_going:
        recalculate_centroids()
        keep_going = bool(update_clusters())
    return

perform_kmeans()
In [54]:
def print_results():
    """Return [cluster_id, value] pairs, ordered by cluster index."""
    return [
        [data1[j].get_cluster(), data1[j].get_x()]
        for i in range(NUM_CLUSTERS)
        for j in range(TOTAL_DATA)
        if data1[j].get_cluster() == i
    ]

print_results()
Out[54]:
In [55]:
# Two columns: the cluster id and the session count it was assigned from.
cluster_sessions = pd.DataFrame(print_results(),columns = ['#cluster_for_sess','sessions'])
cluster_sessions
Out[55]:
In [56]:
# Join the session-cluster labels back onto the table via the session count.
# BUG FIX: DataFrame.sort() was removed in pandas 0.20 — use sort_values().
result_sessions = table_data.sort_values('sessions').merge(cluster_sessions, on = 'sessions')
result_sessions
Out[56]:
In [ ]:
In [57]:
# One-element lists of conversion rates, sorted ascending.
# BUG FIX: positional lookups table_data.conversion_rate[i] can raise
# KeyError because table_data's index has gaps after the conversions > 0
# filter; iterate the Series values directly instead.
samples_cr = [[rate] for rate in table_data.conversion_rate]
samples_cr = sorted(samples_cr, key=lambda x: x[0])
samples_cr
Out[57]:
In [58]:
import random
import math
# Second k-means configuration, now over conversion rates.
NUM_CLUSTERS = 3
TOTAL_DATA = len(samples_cr)
LOWEST_SAMPLE_POINT = samples_cr.index(min(samples_cr)) #element 9 of SAMPLES.
# NOTE(review): reuses the median index computed from samples_sessions; the
# lengths match (same filtered table) but recomputing here would be clearer.
Middle_SAMPLE_POINT = mediana_number
HIGHEST_SAMPLE_POINT = samples_cr.index(max(samples_cr)) #element 6 of SAMPLES.
BIG_NUMBER = math.pow(10, 10)  # sentinel "infinity"
SAMPLES = samples_cr
# Rebind to fresh lists so this run does not mix with the sessions run.
data1 = []
centroids = []
# NOTE(review): re-declares the DataPoint class from the sessions run,
# unchanged — consider keeping a single shared definition.
class DataPoint:
    """A 1-D sample with a mutable cluster assignment."""

    def __init__(self, x):
        self.x = x

    def set_x(self, value):
        self.x = value

    def get_x(self):
        return self.x

    def set_cluster(self, cluster_number):
        # Attribute name kept as in the original (clusterNumber).
        self.clusterNumber = cluster_number

    def get_cluster(self):
        return self.clusterNumber
# NOTE(review): duplicate of the earlier Centroid definition.
class Centroid:
    """A 1-D cluster centre."""

    def __init__(self, x):
        self.x = x

    def set_x(self, value):
        self.x = value

    def get_x(self):
        return self.x
In [59]:
def initialize_centroids():
    """Seed one centroid at the lowest, median and highest sample values."""
    for seed_index in (LOWEST_SAMPLE_POINT, Middle_SAMPLE_POINT, HIGHEST_SAMPLE_POINT):
        centroids.append(Centroid(SAMPLES[seed_index][0]))
    print("Centroids initialized at:")
    # The original printed centroids[0..2] explicitly; loop over the same
    # three indices.
    for k in range(3):
        print("(", centroids[k].get_x(), ")")
    print()
    return
#print(initialize_centroids())
#print(centroids.append(Centroid(SAMPLES[HIGHEST_SAMPLE_POINT][0])))
In [60]:
def initialize_datapoints():
    """Wrap each sample in a DataPoint; seed the lowest/median/highest points
    with their matching cluster and leave the rest unassigned (None)."""
    # Later keys win on collision, so LOWEST overrides Middle and HIGHEST,
    # matching the original if/elif priority.
    seed_cluster = {HIGHEST_SAMPLE_POINT: 2,
                    Middle_SAMPLE_POINT: 1,
                    LOWEST_SAMPLE_POINT: 0}
    for i in range(TOTAL_DATA):
        point = DataPoint(SAMPLES[i][0])
        point.set_cluster(seed_cluster.get(i))
        data1.append(point)
    return
In [61]:
def get_distance(dataPointX, centroidX):
    """Absolute (Euclidean, 1-D) distance between a point and a centroid.

    Equivalent to the original sqrt((a - b)**2) but avoids the needless
    square/sqrt round-trip, which can overflow for very large magnitudes.
    Always returns a float.
    """
    return math.fabs(centroidX - dataPointX)
In [62]:
def recalculate_centroids():
    """Move each centroid to the mean of the points currently in its cluster.

    BUG FIX: the running totals were initialised once, outside the cluster
    loop, so centroid j's "mean" accumulated every preceding cluster's points
    as well. They now reset per cluster. A cluster with no members keeps its
    current position.
    """
    for j in range(NUM_CLUSTERS):
        totalX = 0
        totalInCluster = 0
        for k in range(len(data1)):
            if data1[k].get_cluster() == j:
                totalX += data1[k].get_x()
                totalInCluster += 1
        if totalInCluster > 0:
            centroids[j].set_x(totalX / totalInCluster)
    return

# NOTE(review): the original executed print(recalculate_centroids()) here,
# which only printed None; removed.
In [72]:
def update_clusters():
    """Assign every point to its nearest centroid.

    Returns 1 if any assignment changed (so k-means must iterate again),
    otherwise 0.
    BUG FIX: the original called set_cluster(currentCluster) BEFORE comparing
    the old and new assignments, so the "did it move?" check could never fire
    and the algorithm stopped after a single pass. Capture the previous
    cluster first.
    """
    isStillMoving = 0
    for i in range(TOTAL_DATA):
        bestMinimum = BIG_NUMBER
        currentCluster = 0
        for j in range(NUM_CLUSTERS):
            distance = get_distance(data1[i].get_x(), centroids[j].get_x())
            if distance < bestMinimum:
                bestMinimum = distance
                currentCluster = j
        previousCluster = data1[i].get_cluster()
        if previousCluster is None or previousCluster != currentCluster:
            data1[i].set_cluster(currentCluster)
            isStillMoving = 1
    return isStillMoving
In [87]:
def perform_kmeans():
    """Alternate centroid recalculation and reassignment until stable."""
    initialize_centroids()
    initialize_datapoints()
    keep_going = True
    while keep_going:
        recalculate_centroids()
        keep_going = bool(update_clusters())
    return

perform_kmeans()
In [29]:
def print_results():
    """Return [cluster_id, value] pairs, ordered by cluster index."""
    return [
        [data1[j].get_cluster(), data1[j].get_x()]
        for i in range(NUM_CLUSTERS)
        for j in range(TOTAL_DATA)
        if data1[j].get_cluster() == i
    ]

print_results()
Out[29]:
In [ ]:
In [88]:
# NOTE(review): rebuilds samples_cr identically to the earlier cell — one of
# the two cells is redundant.
# BUG FIX: positional lookups table_data.conversion_rate[i] can raise
# KeyError because table_data's index has gaps after the conversions > 0
# filter; iterate the Series values directly instead.
samples_cr = [[rate] for rate in table_data.conversion_rate]
samples_cr = sorted(samples_cr, key=lambda x: x[0])
samples_cr
Out[88]:
In [89]:
# Sanity check: number of conversion-rate samples feeding the next run.
len(samples_cr)
Out[89]:
In [ ]:
import random
import math
# Third k-means configuration (conversion rates again; uses a `data` list).
NUM_CLUSTERS = 3
# BUG FIX: these values were hard-coded for exactly nine samples
# (TOTAL_DATA = 9, middle = 4, highest = 8) and would IndexError or mis-seed
# the centroids for any other sample count. Derive them from the sorted
# sample list instead — index 0 is the minimum and the last index the
# maximum because samples_cr is sorted ascending.
TOTAL_DATA = len(samples_cr)
LOWEST_SAMPLE_POINT = 0
Middle_SAMPLE_POINT = TOTAL_DATA // 2
HIGHEST_SAMPLE_POINT = TOTAL_DATA - 1
BIG_NUMBER = math.pow(10, 10)  # sentinel "infinity"
SAMPLES = samples_cr
data = []
centroids = []
# NOTE(review): third declaration of DataPoint in this notebook, unchanged —
# consider keeping a single shared definition.
class DataPoint:
    """A 1-D sample with a mutable cluster assignment."""

    def __init__(self, x):
        self.x = x

    def set_x(self, value):
        self.x = value

    def get_x(self):
        return self.x

    def set_cluster(self, cluster_number):
        # Attribute name kept as in the original (clusterNumber).
        self.clusterNumber = cluster_number

    def get_cluster(self):
        return self.clusterNumber
# NOTE(review): third declaration of Centroid in this notebook, unchanged.
class Centroid:
    """A 1-D cluster centre."""

    def __init__(self, x):
        self.x = x

    def set_x(self, value):
        self.x = value

    def get_x(self):
        return self.x
def initialize_centroids():
    """Seed one centroid at the lowest, median and highest sample values."""
    for seed_index in (LOWEST_SAMPLE_POINT, Middle_SAMPLE_POINT, HIGHEST_SAMPLE_POINT):
        centroids.append(Centroid(SAMPLES[seed_index][0]))
    print("Centroids initialized at:")
    # The original printed centroids[0..2] explicitly; loop over the same
    # three indices.
    for k in range(3):
        print("(", centroids[k].get_x(), ")")
    print()
    return
def initialize_datapoints():
    """Wrap each sample in a DataPoint; seed the lowest/median/highest points
    with their matching cluster and leave the rest unassigned (None)."""
    # Later keys win on collision, so LOWEST overrides Middle and HIGHEST,
    # matching the original if/elif priority.
    seed_cluster = {HIGHEST_SAMPLE_POINT: 2,
                    Middle_SAMPLE_POINT: 1,
                    LOWEST_SAMPLE_POINT: 0}
    for i in range(TOTAL_DATA):
        point = DataPoint(SAMPLES[i][0])
        point.set_cluster(seed_cluster.get(i))
        data.append(point)
    return
def get_distance(dataPointX, centroidX):
    """Absolute (Euclidean, 1-D) distance between a point and a centroid.

    Also removes the stray trailing comma the original left in the parameter
    list. Equivalent to sqrt((a - b)**2) without the square/sqrt round-trip,
    which can overflow for very large magnitudes. Always returns a float.
    """
    return math.fabs(centroidX - dataPointX)
def recalculate_centroids():
    """Move each centroid to the mean of the points currently in its cluster.

    BUG FIXES: (1) the running totals were initialised once, outside the
    cluster loop, so centroid j's "mean" accumulated every preceding
    cluster's points as well — they now reset per cluster; (2) the unused
    totalY accumulator (left over from the 2-D tutorial this code came from)
    is removed. A cluster with no members keeps its current position.
    """
    for j in range(NUM_CLUSTERS):
        totalX = 0
        totalInCluster = 0
        for k in range(len(data)):
            if data[k].get_cluster() == j:
                totalX += data[k].get_x()
                totalInCluster += 1
        if totalInCluster > 0:
            centroids[j].set_x(totalX / totalInCluster)
    return
def update_clusters():
    """Assign every point to its nearest centroid.

    Returns 1 if any assignment changed (so k-means must iterate again),
    otherwise 0.
    BUG FIX: the original called set_cluster(currentCluster) BEFORE comparing
    the old and new assignments, so the "did it move?" check could never fire
    and the algorithm stopped after a single pass. Capture the previous
    cluster first.
    """
    isStillMoving = 0
    for i in range(TOTAL_DATA):
        bestMinimum = BIG_NUMBER
        currentCluster = 0
        for j in range(NUM_CLUSTERS):
            distance = get_distance(data[i].get_x(), centroids[j].get_x())
            if distance < bestMinimum:
                bestMinimum = distance
                currentCluster = j
        previousCluster = data[i].get_cluster()
        if previousCluster is None or previousCluster != currentCluster:
            data[i].set_cluster(currentCluster)
            isStillMoving = 1
    return isStillMoving
def perform_kmeans():
    """Alternate centroid recalculation and reassignment until stable."""
    initialize_centroids()
    initialize_datapoints()
    keep_going = True
    while keep_going:
        recalculate_centroids()
        keep_going = bool(update_clusters())
    return

perform_kmeans()
def print_results():
    """Print each cluster's members (side effects only; returns None)."""
    for i in range(NUM_CLUSTERS):
        print("Cluster ", i, " includes:")
        for j in range(TOTAL_DATA):
            if data[j].get_cluster() == i:
                print("(", data[j].get_x(), ")")
        print()
    return

# BUG FIX: perform_kmeans() was invoked a second time right here; every extra
# run appends three more centroids and TOTAL_DATA more points to the
# module-level lists, corrupting subsequent passes. Only report the results.
print_results()
In [109]:
def print_results():
    """Print cluster membership AND return [cluster_id, value] rows.

    BUG FIX: this redefinition previously returned None, which breaks the
    later cell that builds a DataFrame from print_results(). It now returns
    the [cluster_id, value] pairs while keeping the printed report, so both
    consumers work.
    """
    result_list = []
    for i in range(NUM_CLUSTERS):
        print("Cluster ", i, " includes:")
        for j in range(TOTAL_DATA):
            if data[j].get_cluster() == i:
                print("(", data[j].get_x(), ")")
                result_list.append([data[j].get_cluster(), data[j].get_x()])
        print()
    return result_list

# NOTE(review): dropped another redundant perform_kmeans() re-run that was
# here — each re-run appended duplicate centroids/datapoints.
print_results()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [32]:
# Build a (cluster id, conversion rate) table from the k-means result.
# NOTE(review): this depends on print_results() RETURNING row pairs, but the
# most recent definition above prints and returns None — pd.DataFrame(None)
# would not produce the intended table. Confirm which definition is live in
# the kernel (execution counts are out of order).
cluster_cr = pd.DataFrame(print_results(),columns = ['#cluster_for_cr','conversion_rate'])
cluster_cr
Out[32]:
In [33]:
# Join the conversion-rate cluster labels back onto the table.
# BUG FIX: DataFrame.sort() was removed in pandas 0.20 — use sort_values().
result_cr = table_data.sort_values('conversion_rate').merge(cluster_cr, on = 'conversion_rate')
result_cr
Out[33]:
In [34]:
# Combine the two cluster assignments on the shared "city, age" label column.
finish = result_sessions.merge(result_cr, on = "city_age")
finish
Out[34]:
In [36]:
# Final view: label, both cluster ids, and the key metrics. The _x suffixes
# come from the merge — these values originate from the result_sessions side.
finish = finish[['city_age','#cluster_for_sess','#cluster_for_cr','sessions_x','conversion_rate_x',]]
finish
Out[36]:
In [ ]: