In [121]:
#data_city
#Account: TMRW Tech Hub
#Property: TMRW
#View: All Web Site Data
#ids: ga:123303369
#start-date: 2017-02-01
#end-date: 2017-04-30
#metrics
#ga:sessions
#ga:bounceRate
#ga:goal1ConversionRate
#ga:goal1Completions
#dimensions
#ga:city
#filter
#ga:sessions>30;ga:city!=(not set)
In [122]:
import json
from pprint import pprint
# Load the exported report and keep only its data rows.
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# being left to the platform's locale default.
with open('data/TMRW_user_groups.json', encoding='utf-8') as file:
    report = json.load(file)

# Rows of the first report; later cells read 'dimensions' and 'metrics' from each row.
data_city = report['reports'][0]['data']['rows']
pprint(data_city)
In [173]:
def create_dict(x):
    """Index report rows by their first two dimension values.

    Args:
        x: iterable of row dicts. Each row needs a 'dimensions' sequence with
            at least two entries and a 'metrics' sequence whose first entry
            has a 'values' sequence with at least four entries.

    Returns:
        dict mapping the tuple (dimensions[0], dimensions[1]) to a 4-tuple
        holding the first four entries of metrics[0]['values'].
    """
    lookup = {}
    for row in x:
        values = row['metrics'][0]['values']
        metrics = (values[0], values[1], values[2], values[3])
        dims = row['dimensions']
        lookup[dims[0], dims[1]] = metrics
    return lookup
# Build the (dimension, dimension) -> metrics lookup from the loaded rows and pretty-print it.
city_dict = create_dict(data_city)
pprint(city_dict)
In [220]:
import numpy as np
def create_dict(x):
    """Collect the 'values' list of every metrics entry of every row.

    Note: despite the name, the result is a list, not a dict.

    Args:
        x: iterable of row dicts, each with a 'metrics' sequence of dicts
            that carry a 'values' entry.

    Returns:
        list with one element per metrics entry, in row order. The elements
        are the original 'values' objects, not copies.
    """
    return [entry['values'] for row in x for entry in row['metrics']]
# Uses the create_dict defined in this cell, which returns a list of
# per-row metric value lists (the variable name notwithstanding).
points_dict = create_dict(data_city)
points_dict
Out[220]:
In [219]:
# Print every metric value of every sample, one per line.
# The inner loop walks the current row's own values, so it stays correct
# whatever the number of rows or the number of metrics per row.
new = []
for n in points_dict:
    for value in n:
        print(value)
# Nothing is appended to `new` in this cell, so this displays an empty list.
new
In [143]:
# Alias that the clustering cells read as SAMPLES.
samples1 = points_dict

# Print every metric of every sample as a float. Iterating over the rows and
# their values directly covers the last row too and never indexes past the
# end of a row.
for row in samples1:
    for value in row:
        a = float(value)
        print(a)
In [ ]:
In [126]:
#for z in s['metrics']:
#print(z)
#points_dict.append(z['values'])
#return points_dict
#points_dict = (create_dict(data_city))
#points_dict
In [133]:
# Show the first four metrics of each sample as floats, each wrapped in a
# one-element list.
for sample in samples1:
    for col in range(4):
        a = [float(sample[col])]
        print(a)
In [66]:
# Smallest row total across all samples, where a row total is the sum of the
# row's first four metrics. float() accepts both numbers and numeric strings,
# so this works whether the samples came from JSON text or from numeric data.
# default=None keeps the cell from raising when there are no samples.
minn = min(
    (float(x[0]) + float(x[1]) + float(x[2]) + float(x[3]) for x in samples1),
    default=None,
)
minn
Out[66]:
In [99]:
import random
import math
# Number of clusters; the clustering functions loop over range(NUM_CLUSTERS).
NUM_CLUSTERS = 2
# Number of rows of SAMPLES that are turned into data points.
TOTAL_DATA = 7
LOWEST_SAMPLE_POINT = 5 # index into SAMPLES of the row that seeds cluster 0
HIGHEST_SAMPLE_POINT = 3 # index into SAMPLES of the row that seeds cluster 1
# Starting value for the "best distance so far" search; assumed to be larger
# than any real distance in the data.
BIG_NUMBER = math.pow(10, 10)
SAMPLES = samples1
# Filled with DataPoint objects by initialize_datapoints().
data1 = []
# Filled with Centroid objects by initialize_centroids().
centroids = []
class DataPoint:
    """A sample with four coordinates (x, y, z, f) and a cluster assignment.

    The cluster assignment is None until set_cluster() is called.
    """

    def __init__(self, x, y, z, f):
        """Store the four coordinates; the point starts without a cluster."""
        self.x = x
        self.y = y
        self.z = z
        self.f = f
        # Initialised here so get_cluster() is safe to call before
        # set_cluster() instead of raising AttributeError.
        self.clusterNumber = None

    def set_x(self, x):
        """Replace the x coordinate."""
        self.x = x

    def get_x(self):
        """Return the x coordinate."""
        return self.x

    def set_y(self, y):
        """Replace the y coordinate."""
        self.y = y

    def get_y(self):
        """Return the y coordinate."""
        return self.y

    def set_z(self, z):
        """Replace the z coordinate."""
        self.z = z

    def get_z(self):
        """Return the z coordinate."""
        return self.z

    def set_f(self, f):
        """Replace the f coordinate."""
        self.f = f

    def get_f(self):
        """Return the f coordinate."""
        return self.f

    def set_cluster(self, clusterNumber):
        """Record the index of the cluster this point belongs to (or None)."""
        self.clusterNumber = clusterNumber

    def get_cluster(self):
        """Return the assigned cluster index, or None if unassigned."""
        return self.clusterNumber
class Centroid:
    """Cluster centre described by four coordinates: x, y, z and f."""

    def __init__(self, x, y, z, f):
        """Create a centroid at the given coordinates."""
        self.x, self.y, self.z, self.f = x, y, z, f

    # --- x ---------------------------------------------------------------
    def set_x(self, x):
        """Move the centroid along x."""
        self.x = x

    def get_x(self):
        """Current x coordinate."""
        return self.x

    # --- y ---------------------------------------------------------------
    def set_y(self, y):
        """Move the centroid along y."""
        self.y = y

    def get_y(self):
        """Current y coordinate."""
        return self.y

    # --- z ---------------------------------------------------------------
    def set_z(self, z):
        """Move the centroid along z."""
        self.z = z

    def get_z(self):
        """Current z coordinate."""
        return self.z

    # --- f ---------------------------------------------------------------
    def set_f(self, f):
        """Move the centroid along f."""
        self.f = f

    def get_f(self):
        """Current f coordinate."""
        return self.f
In [100]:
def initialize_centroids():
    """Seed the two centroids from SAMPLES and print their coordinates.

    centroids[0] is built from SAMPLES[LOWEST_SAMPLE_POINT] and centroids[1]
    from SAMPLES[HIGHEST_SAMPLE_POINT]; the first four entries of each row
    become the x, y, z and f coordinates. The seed rows are chosen by hand
    through those two constants.

    Returns:
        None.
    """
    # Empty the list in place first. Without this, a second call (re-running
    # the cell, or perform_kmeans after a manual call) would append new seeds
    # behind the old centroids, while every consumer keeps reading the stale
    # entries at indices 0 and 1.
    centroids.clear()
    for seed_index in (LOWEST_SAMPLE_POINT, HIGHEST_SAMPLE_POINT):
        seed = SAMPLES[seed_index]
        centroids.append(Centroid(seed[0], seed[1], seed[2], seed[3]))

    print("Centroids initialized at:")
    for centroid in centroids:
        print("(", centroid.get_x(), ", ", centroid.get_y(), ", ", centroid.get_z(), ", ", centroid.get_f(), ")")
    print()
# The function prints its own report and returns None, so its result is not
# passed to print() (that would add a stray "None" line to the output).
initialize_centroids()
In [101]:
def initialize_datapoints():
    """Build one DataPoint per sample and store them in data1.

    The first TOTAL_DATA rows of SAMPLES are converted; the first four entries
    of each row become the x, y, z and f coordinates. The rows at
    LOWEST_SAMPLE_POINT and HIGHEST_SAMPLE_POINT are pre-assigned to clusters
    0 and 1 respectively (the clusters seeded from those same rows); every
    other point starts with cluster None.

    Returns:
        None.
    """
    # Empty the list in place first so that repeated runs do not leave points
    # from an earlier run in data1, where they would still be counted when
    # centroids are averaged.
    data1.clear()
    for i in range(TOTAL_DATA):
        sample = SAMPLES[i]
        newPoint = DataPoint(sample[0], sample[1], sample[2], sample[3])
        if i == LOWEST_SAMPLE_POINT:
            newPoint.set_cluster(0)
        elif i == HIGHEST_SAMPLE_POINT:
            newPoint.set_cluster(1)
        else:
            newPoint.set_cluster(None)
        data1.append(newPoint)
In [102]:
def get_distance(dataPointX, dataPointY,dataPointZ, dataPointF,centroidX, centroidY,centroidZ, centroidF):
    """Return the Euclidean distance between a 4-D point and a 4-D centroid.

    Args:
        dataPointX, dataPointY, dataPointZ, dataPointF: point coordinates.
        centroidX, centroidY, centroidZ, centroidF: centroid coordinates.

    Returns:
        float: square root of the sum of the squared coordinate differences.
    """
    # Differences are listed in y, x, z, f order so the squares are added in
    # a fixed, documented order (floating-point addition is order-sensitive).
    deltas = (
        centroidY - dataPointY,
        centroidX - dataPointX,
        centroidZ - dataPointZ,
        centroidF - dataPointF,
    )
    return math.sqrt(sum(math.pow(delta, 2) for delta in deltas))
In [103]:
def recalculate_centroids():
    """Move each centroid to the mean of the points assigned to it.

    For every cluster index below NUM_CLUSTERS, the x, y, z and f coordinates
    of all points in data1 whose cluster equals that index are averaged and
    written to centroids[index]. A cluster with no points keeps its current
    centroid.

    Returns:
        None.
    """
    for j in range(NUM_CLUSTERS):
        # The sums and the count start from zero for every cluster; carrying
        # them over would fold the previous clusters' points into this mean.
        totalX = totalY = totalZ = totalF = 0
        totalInCluster = 0

        for point in data1:
            if point.get_cluster() == j:
                totalX += point.get_x()
                totalY += point.get_y()
                totalZ += point.get_z()
                totalF += point.get_f()
                totalInCluster += 1

        if totalInCluster > 0:
            centroids[j].set_x(totalX / totalInCluster)
            centroids[j].set_y(totalY / totalInCluster)
            centroids[j].set_z(totalZ / totalInCluster)
            centroids[j].set_f(totalF / totalInCluster)
# The function returns None, so its result is not passed to print()
# (that would only emit a stray "None" line).
recalculate_centroids()
In [104]:
def update_clusters():
    """Assign each data point to its nearest centroid.

    For each of the first TOTAL_DATA points in data1, the distance to every
    centroid is computed with get_distance and the point is moved to the
    closest one. On a tie the lowest cluster index wins.

    Returns:
        int: 1 if at least one point changed cluster (including points that
        had no cluster yet), otherwise 0.
    """
    isStillMoving = 0

    for i in range(TOTAL_DATA):
        point = data1[i]
        bestMinimum = BIG_NUMBER
        currentCluster = 0

        for j in range(NUM_CLUSTERS):
            distance = get_distance(point.get_x(), point.get_y(), point.get_z(), point.get_f(), centroids[j].get_x(), centroids[j].get_y(), centroids[j].get_z(), centroids[j].get_f())
            if distance < bestMinimum:
                bestMinimum = distance
                currentCluster = j

        # Compare against the previous assignment BEFORE overwriting it;
        # assigning first would make this check always false and the caller
        # would stop iterating after a single pass.
        if point.get_cluster() is None or point.get_cluster() != currentCluster:
            point.set_cluster(currentCluster)
            isStillMoving = 1

    return isStillMoving
In [134]:
def perform_kmeans():
    """Run the clustering: seed centroids and points, then iterate.

    Each iteration recomputes the centroids and reassigns the points; the
    loop ends as soon as update_clusters() reports a falsy result.

    Returns:
        None.
    """
    initialize_centroids()
    initialize_datapoints()

    while True:
        recalculate_centroids()
        if not update_clusters():
            break
def print_results():
    """Print, cluster by cluster, the coordinates of the member points.

    Only the first TOTAL_DATA entries of data1 are inspected. A blank line
    follows each cluster's listing.

    Returns:
        None.
    """
    for cluster in range(NUM_CLUSTERS):
        print("Cluster ", cluster, " includes:")
        for idx in range(TOTAL_DATA):
            point = data1[idx]
            if point.get_cluster() != cluster:
                continue
            print("(", point.get_x(), ", ", point.get_y(),", ", point.get_z(), ", ", point.get_f(), ")")
        print()
# Run the clustering, then list the members of each cluster.
perform_kmeans()
print_results()
In [106]:
# One [Sessions, BounceRate, Goal1Completions, Goal1ConversionRate] row for
# each of the labels 1..7 of `data`.
# NOTE(review): `data` is not defined in any visible live cell; presumably it
# is the DataFrame from the commented-out read_csv cell. Confirm before
# relying on Restart & Run All.
samples1 = [
    [
        data.Sessions[label],
        data.BounceRate[label],
        data.Goal1Completions[label],
        data.Goal1ConversionRate[label],
    ]
    for label in range(1, 8)
]
samples1
Out[106]:
In [ ]:
In [99]:
#data = pd.read_csv("data/TMRW_users.csv")
#data = data[data.City_Age!="(not set)"]
#data = data[data.Sessions>30]
#data = data[data.Goal1Completions>0]
#data