notebook.community

Edit and run



In [121]:

    
#data_city

#Account: TMRW Tech Hub
#Property: TMRW
#View: All Web Site Data
#ids: ga:123303369
#start-date: 2017-02-01
#end-date: 2017-04-30

#metrics
#ga:sessions
#ga:bounceRate
#ga:goal1ConversionRate
#ga:goal1Completions

#dimensions
#ga:city

#filter
#ga:sessions>30;ga:city!=(not set)



In [122]:

    
import json
from pprint import pprint

with open('data/TMRW_user_groups.json') as file: 
    data_city = json.load(file)

data_city = data_city['reports'][0]['data']['rows']
data_city
pprint(data_city)









    



[{'dimensions': ['Croydon', '18-24'],
  'metrics': [{'values': ['101',
                          '41.584158415841586',
                          '4',
                          '3.9603960396039604']}]},
 {'dimensions': ['Croydon', '25-34'],
  'metrics': [{'values': ['334',
                          '47.90419161676647',
                          '17',
                          '5.089820359281437']}]},
 {'dimensions': ['Croydon', '35-44'],
  'metrics': [{'values': ['223',
                          '43.04932735426009',
                          '7',
                          '3.1390134529147984']}]},
 {'dimensions': ['Croydon', '45-54'],
  'metrics': [{'values': ['90',
                          '45.55555555555556',
                          '2',
                          '2.2222222222222223']}]},
 {'dimensions': ['Croydon', '55-64'],
  'metrics': [{'values': ['32', '53.125', '1', '3.125']}]},
 {'dimensions': ['London', '18-24'],
  'metrics': [{'values': ['167',
                          '49.700598802395206',
                          '8',
                          '4.790419161676647']}]},
 {'dimensions': ['London', '25-34'],
  'metrics': [{'values': ['842',
                          '59.14489311163895',
                          '24',
                          '2.8503562945368173']}]},
 {'dimensions': ['London', '35-44'],
  'metrics': [{'values': ['482',
                          '54.77178423236515',
                          '14',
                          '2.904564315352697']}]},
 {'dimensions': ['London', '45-54'],
  'metrics': [{'values': ['205',
                          '55.60975609756098',
                          '3',
                          '1.4634146341463417']}]},
 {'dimensions': ['London', '55-64'],
  'metrics': [{'values': ['37', '48.64864864864865', '0', '0.0']}]}]



In [173]:

    
def create_dict(x):
    city_dict = {}
    for s in x:
        
        city_dict[s['dimensions'][0],s['dimensions'][1]] = (s['metrics'][0]['values'][0],s['metrics'][0]['values'][1],s['metrics'][0]['values'][2],s['metrics'][0]['values'][3])
        
    
    return city_dict
        

city_dict = create_dict(data_city)
pprint(city_dict)









    



{('Croydon', '18-24'): ('101', '41.584158415841586', '4', '3.9603960396039604'),
 ('Croydon', '25-34'): ('334', '47.90419161676647', '17', '5.089820359281437'),
 ('Croydon', '35-44'): ('223', '43.04932735426009', '7', '3.1390134529147984'),
 ('Croydon', '45-54'): ('90', '45.55555555555556', '2', '2.2222222222222223'),
 ('Croydon', '55-64'): ('32', '53.125', '1', '3.125'),
 ('London', '18-24'): ('167', '49.700598802395206', '8', '4.790419161676647'),
 ('London', '25-34'): ('842', '59.14489311163895', '24', '2.8503562945368173'),
 ('London', '35-44'): ('482', '54.77178423236515', '14', '2.904564315352697'),
 ('London', '45-54'): ('205', '55.60975609756098', '3', '1.4634146341463417'),
 ('London', '55-64'): ('37', '48.64864864864865', '0', '0.0')}



In [220]:

    
import numpy as np
def create_dict(x):
    points_dict = []
    for s in x:
        #print(s)
        for z in s['metrics']:
            #list_float = np.array(z['values']) + 0
            #print(list_float)
            #print((z['values']))
            points_dict.append(z['values'])
            #for v in z['values']:
                #rint (v)

    return points_dict
        
points_dict = create_dict(data_city)
points_dict









    Out[220]:





[['101', '41.584158415841586', '4', '3.9603960396039604'],
 ['334', '47.90419161676647', '17', '5.089820359281437'],
 ['223', '43.04932735426009', '7', '3.1390134529147984'],
 ['90', '45.55555555555556', '2', '2.2222222222222223'],
 ['32', '53.125', '1', '3.125'],
 ['167', '49.700598802395206', '8', '4.790419161676647'],
 ['842', '59.14489311163895', '24', '2.8503562945368173'],
 ['482', '54.77178423236515', '14', '2.904564315352697'],
 ['205', '55.60975609756098', '3', '1.4634146341463417'],
 ['37', '48.64864864864865', '0', '0.0']]



In [219]:

    
new = []

for n in points_dict:
    for i in range(len(points_dict)-1):
        print(n[i])

        #new.append(float(n[0]))
    #new.append(n[1])
    
    #new.append = 
    #for z in n:
        
        #print (z)
        #new.append(float(z))
        
new









    



101
41.584158415841586
4
3.9603960396039604






    



---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-219-840139d10be5> in <module>()
      3 for n in points_dict:
      4     for i in range(len(points_dict)-1):
----> 5         print(n[i])
      6 
      7         #new.append(float(n[0]))

IndexError: list index out of range



In [143]:

    
samples1 =points_dict
for i in range(0,len(samples1)-1):
    #print(i)
    for k in range(0,i):
        a = float(samples1[i][k])
        print(a)









    



334.0
223.0
43.04932735426009
90.0
45.55555555555556
2.0
32.0
53.125
1.0
3.125
167.0
49.700598802395206
8.0
4.790419161676647






    



---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-143-c815e8e179fe> in <module>()
      3     #print(i)
      4     for k in range(0,i):
----> 5         a = float(samples1[i][k])
      6         print(a)

IndexError: list index out of range



In [ ]:



In [126]:

    
#for z in s['metrics']:
            #print(z)
            #points_dict.append(z['values'])

    #return points_dict
        
#points_dict = (create_dict(data_city))
#points_dict



In [133]:

    
for i in samples1:
    #print(i)
    for k in range(0,4):
        a = [float(i[k])]
        print(a)









    



[101.0]
[41.584158415841586]
[4.0]
[3.9603960396039604]
[334.0]
[47.90419161676647]
[17.0]
[5.089820359281437]
[223.0]
[43.04932735426009]
[7.0]
[3.1390134529147984]
[90.0]
[45.55555555555556]
[2.0]
[2.2222222222222223]
[32.0]
[53.125]
[1.0]
[3.125]
[167.0]
[49.700598802395206]
[8.0]
[4.790419161676647]
[842.0]
[59.14489311163895]
[24.0]
[2.8503562945368173]
[482.0]
[54.77178423236515]
[14.0]
[2.904564315352697]
[205.0]
[55.60975609756098]
[3.0]
[1.4634146341463417]
[37.0]
[48.64864864864865]
[0.0]
[0.0]



In [66]:

    
for x in samples1:
    minn = min(x[0]+x[1]+x[2]+x[3])
    #a=min(x)
    #print(a)

#min(x[0]+x[1]+x[2]+x[3] for x in gen)
minn









    Out[66]:





'.'



In [99]:

    
import random
import math

NUM_CLUSTERS = 2
TOTAL_DATA = 7
LOWEST_SAMPLE_POINT = 5 #element 0 of SAMPLES.
HIGHEST_SAMPLE_POINT = 3 #element 3 of SAMPLES.
BIG_NUMBER = math.pow(10, 10)


SAMPLES = samples1
data1 = []
centroids = []

class DataPoint:
    def __init__(self, x, y, z, f):
        self.x = x
        self.y = y
        self.z = z
        self.f = f
    
    def set_x(self, x):
        self.x = x
    
    def get_x(self):
        return self.x
    
    def set_y(self, y):
        self.y = y
    
    def get_y(self):
        return self.y
    
    def set_z(self, z):
        self.z = z
    
    def get_z(self):
        return self.z
    
    def set_f(self, f):
        self.f = f
    
    def get_f(self):
        return self.f
    
    def set_cluster(self, clusterNumber):
        self.clusterNumber = clusterNumber
    
    def get_cluster(self):
        return self.clusterNumber

class Centroid:
    def __init__(self, x, y, z, f):
        self.x = x
        self.y = y
        self.z = z
        self.f = f
    
    def set_x(self, x):
        self.x = x
    
    def get_x(self):
        return self.x
    
    def set_y(self, y):
        self.y = y
    
    def get_y(self):
        return self.y
    
    def set_z(self, z):
        self.z = z
    
    def get_z(self):
        return self.z
    
    def set_f(self, f):
        self.f = f
    
    def get_f(self):
        return self.f



In [100]:

    
def initialize_centroids():
    # Set the centoid coordinates to match the data points furthest from each other.
    # In this example, [31, 51.613, 1, 3.2260000000000004] and [758, 59.234999999999999, 22, 2.9019999999999997]
    centroids.append(Centroid(SAMPLES[LOWEST_SAMPLE_POINT][0], SAMPLES[LOWEST_SAMPLE_POINT][1],SAMPLES[LOWEST_SAMPLE_POINT][2],SAMPLES[LOWEST_SAMPLE_POINT][3]))
    centroids.append(Centroid(SAMPLES[HIGHEST_SAMPLE_POINT][0], SAMPLES[HIGHEST_SAMPLE_POINT][1],SAMPLES[HIGHEST_SAMPLE_POINT][2],SAMPLES[HIGHEST_SAMPLE_POINT][3]))
    
    print("Centroids initialized at:")
    print("(", centroids[0].get_x(), ", ", centroids[0].get_y(), ", ", centroids[0].get_z(),  ", ", centroids[0].get_f(),")")
    print("(", centroids[1].get_x(), ", ", centroids[1].get_y(), ", ", centroids[1].get_z(),  ", ", centroids[1].get_f(),")")
    print()
    return
print(initialize_centroids())









    



Centroids initialized at:
( 31 ,  51.613 ,  1 ,  3.226 )
( 758 ,  59.235 ,  22 ,  2.902 )

None



In [101]:

    
def initialize_datapoints():
    # DataPoint objects' x and y values are taken from the SAMPLE array.
    # The DataPoints associated with LOWEST_SAMPLE_POINT and HIGHEST_SAMPLE_POINT are initially
    # assigned to the clusters matching the LOWEST_SAMPLE_POINT and HIGHEST_SAMPLE_POINT centroids.
    for i in range(TOTAL_DATA):
        newPoint = DataPoint(SAMPLES[i][0], SAMPLES[i][1],SAMPLES[i][2],SAMPLES[i][3])
        
        if(i == LOWEST_SAMPLE_POINT):
            newPoint.set_cluster(0)
        elif(i == HIGHEST_SAMPLE_POINT):
            newPoint.set_cluster(1)
        else:
            newPoint.set_cluster(None)
            
        data1.append(newPoint)
    
    return



In [102]:

    
def get_distance(dataPointX, dataPointY,dataPointZ, dataPointF,centroidX, centroidY,centroidZ, centroidF):
    # Calculate Euclidean distance.
    return math.sqrt(math.pow((centroidY - dataPointY), 2) + math.pow((centroidX - dataPointX), 2) + math.pow((centroidZ - dataPointZ), 2) + math.pow((centroidF - dataPointF), 2))



In [103]:

    
def recalculate_centroids():
    totalX = 0
    totalY = 0
    totalZ = 0
    totalF = 0
    totalInCluster = 0
    
    for j in range(NUM_CLUSTERS):
        for k in range(len(data1)):
            if(data1[k].get_cluster() == j):
                totalX += data1[k].get_x()
                totalY += data1[k].get_y()
                totalZ += data1[k].get_z()
                totalF += data1[k].get_f()
                totalInCluster += 1
        
        if(totalInCluster > 0):
            centroids[j].set_x(totalX / totalInCluster)
            centroids[j].set_y(totalY / totalInCluster)
            centroids[j].set_z(totalZ / totalInCluster)
            centroids[j].set_f(totalF / totalInCluster)
    
    return

print(recalculate_centroids())









    



None



In [104]:

    
def update_clusters():
    isStillMoving = 0
    
    for i in range(TOTAL_DATA):
        bestMinimum = BIG_NUMBER
        currentCluster = 0
        
        for j in range(NUM_CLUSTERS):
            distance = get_distance(data1[i].get_x(), data1[i].get_y(), data1[i].get_z(), data1[i].get_f(), centroids[j].get_x(), centroids[j].get_y(),centroids[j].get_z(), centroids[j].get_f())
            if(distance < bestMinimum):
                bestMinimum = distance
                currentCluster = j
        
        data1[i].set_cluster(currentCluster)
        
        if(data1[i].get_cluster() is None or data1[i].get_cluster() != currentCluster):
            data1[i].set_cluster(currentCluster)
            isStillMoving = 1
    
    return isStillMoving



In [134]:

    
def perform_kmeans():
    isStillMoving = 1
    
    initialize_centroids()
    
    initialize_datapoints()
    
    while(isStillMoving):
        recalculate_centroids()
        isStillMoving = update_clusters()
    
    return

def print_results():
    for i in range(NUM_CLUSTERS):
        print("Cluster ", i, " includes:")
        for j in range(TOTAL_DATA):
            if(data1[j].get_cluster() == i):
                print("(", data1[j].get_x(), ", ", data1[j].get_y(),", ", data1[j].get_z(), ", ", data1[j].get_f(), ")")
        print()
    
    return

perform_kmeans()
print_results()









    



Centroids initialized at:
( 99.8333333333 ,  48.6841666667 ,  2.33333333333 ,  2.55216666667 )
( 282.333333333 ,  51.5923333333 ,  7.77777777778 ,  2.65933333333 )

Cluster  0  includes:
( 165 ,  53.939 ,  2 ,  1.212 )
( 82 ,  48.78 ,  1 ,  1.22 )
( 31 ,  51.613 ,  1 ,  3.226 )
( 86 ,  43.023 ,  3 ,  3.488 )

Cluster  1  includes:
( 426 ,  53.756 ,  12 ,  2.817 )
( 758 ,  59.235 ,  22 ,  2.902 )
( 204 ,  43.137 ,  6 ,  2.941 )



In [106]:

    
samples1 = [[data.Sessions[1],data.BounceRate[1],data.Goal1Completions[1],data.Goal1ConversionRate[1]],[data.Sessions[2],data.BounceRate[2],data.Goal1Completions[2],data.Goal1ConversionRate[2]],[data.Sessions[3],data.BounceRate[3],data.Goal1Completions[3],data.Goal1ConversionRate[3]],[data.Sessions[4],data.BounceRate[4],data.Goal1Completions[4],data.Goal1ConversionRate[4]],[data.Sessions[5],data.BounceRate[5],data.Goal1Completions[5],data.Goal1ConversionRate[5]],[data.Sessions[6],data.BounceRate[6],data.Goal1Completions[6],data.Goal1ConversionRate[6]],[data.Sessions[7],data.BounceRate[7],data.Goal1Completions[7],data.Goal1ConversionRate[7]]]
samples1









    Out[106]:





[[165, 53.938999999999993, 2, 1.212],
 [82, 48.780000000000001, 1, 1.22],
 [426, 53.756, 12, 2.8169999999999997],
 [758, 59.234999999999999, 22, 2.9019999999999997],
 [204, 43.137, 6, 2.9410000000000003],
 [31, 51.613, 1, 3.2260000000000004],
 [86, 43.023000000000003, 3, 3.488]]



In [ ]:



In [99]:

    
#data = pd.read_csv("data/TMRW_users.csv")
#data = data[data.City_Age!="(not set)"]
#data = data[data.Sessions>30]
#data = data[data.Goal1Completions>0]
#data