Задание по программированию: Размещение баннеров



In [1]:

    
import pandas as pd



In [2]:

    
# Read the pre-processed CSV and convert the `created_at` column from text
# to pandas datetimes in a single chained expression.
data = pd.read_csv('preprocessed_data.csv').assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'])
)



In [3]:

    
# Preview the first rows to check the column layout and parsed values.
data.head()









    Out[3]:







  
    
      
              id  user_id  venue_id   latitude    longitude           created_at
      0   984222    15824      5222  38.895112   -77.036366  2012-04-21 17:43:47
      1   984234    44652      5222  33.800745   -84.410520  2012-04-21 17:43:43
      2   984291   105054      5222  45.523452  -122.676207  2012-04-21 17:39:22
      3   984318  2146539      5222  40.764462  -111.904565  2012-04-21 17:35:46
      4   984232    93870    380645  33.448377  -112.074037  2012-04-21 17:38:18



In [4]:

    
# Clustering input: the coordinates of the first 100000 rows.
# The two columns are picked by name instead of by position (3:5), so the
# selection no longer depends on column order, and because only numeric
# columns are selected the result is a float array rather than the
# object-dtype array produced by calling `.values` on the whole mixed-type frame.
subset = data[['latitude', 'longitude']].values[:100000]



In [5]:

    
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth



In [6]:

    
# Fit mean shift on the coordinate pairs; the bandwidth is expressed in the
# same units as the columns of `subset`.
ms = MeanShift(bandwidth=0.1).fit(subset)

# Per-point cluster assignment and the coordinates of every cluster centre.
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# Each distinct label corresponds to one discovered cluster.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size

print("number of estimated clusters : %d" % n_clusters_)









    



number of estimated clusters : 3230



In [7]:

    
import matplotlib.pyplot as plt
from itertools import cycle

# Draw every cluster on figure 1: its member points as small dots and its
# centre as a large black-edged marker, both in the same colour.
plt.figure(1)
plt.clf()

# The colour codes repeat endlessly, so any number of clusters can be drawn.
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    # Boolean mask selecting the points assigned to cluster k.
    member_mask = (labels == k)
    member_points = subset[member_mask]
    center = cluster_centers[k]

    plt.plot(member_points[:, 0], member_points[:, 1], col + '.')
    plt.plot(
        center[0],
        center[1],
        'o',
        markerfacecolor=col,
        markeredgecolor='k',
        markersize=14,
    )

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()



In [8]:

    
# Tally how many points were assigned to each cluster label.
# np.unique(..., return_counts=True) replaces the hand-written counting loop.
unique_labels, member_counts = np.unique(labels, return_counts=True)
clusters_dict = dict(zip(unique_labels.tolist(), member_counts.tolist()))

# Keep only the centres whose cluster holds strictly more than this many points.
size_threshold = 15

# One flag per centre, in centre order. `.get(i, 0)` treats a centre that
# received no points as empty instead of raising KeyError.
is_large = np.array(
    [clusters_dict.get(i, 0) > size_threshold
     for i in range(len(cluster_centers))],
    dtype=bool,
)

# Boolean-mask indexing copies exactly the selected rows, so the result can
# never contain uninitialised rows.
clusters_select = cluster_centers[is_large]
count = len(clusters_select)



In [9]:

    
# Office locations, one row per office, in the same (latitude, longitude)
# column order as the clustered coordinates.
offices = np.array([
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
])



In [10]:

    
def distance(point1, point2):
    """Return the Euclidean distance between two points.

    Only the components at index 0 and 1 of each point are used; any
    further components are ignored.
    """
    delta_first = point1[0] - point2[0]
    delta_second = point1[1] - point2[1]
    squared_sum = delta_first**2 + delta_second**2
    return squared_sum**0.5



In [11]:

    
# Find the selected cluster centre that lies closest to any office.
#
# `min_dist` starts at infinity rather than 0: using 0 as the "not set yet"
# marker would make a genuine zero distance look unset and let the next
# cluster overwrite it.
answer_index = 0
min_dist = float('inf')
for i, center in enumerate(clusters_select):
    # Distance from this centre to its nearest office, computed once.
    nearest_office_dist = min(distance(office, center) for office in offices)
    # Strict comparison keeps the earliest centre when distances tie.
    if nearest_office_dist < min_dist:
        min_dist = nearest_office_dist
        answer_index = i



In [12]:

    
def write_answer(center):
    """Write the first two components of *center* to ``answer.txt``.

    Each component is converted with ``str`` and the two are separated by a
    single space. The file is opened in write mode, so existing content is
    replaced.
    """
    with open("answer.txt", "w") as out:
        out.write(' '.join((str(center[0]), str(center[1]))))



In [13]:

    
# Save the coordinates of the chosen cluster centre to answer.txt.
write_answer(clusters_select[answer_index])

	id	user_id	venue_id	latitude	longitude	created_at
0	984222	15824	5222	38.895112	-77.036366	2012-04-21 17:43:47
1	984234	44652	5222	33.800745	-84.410520	2012-04-21 17:43:43
2	984291	105054	5222	45.523452	-122.676207	2012-04-21 17:39:22
3	984318	2146539	5222	40.764462	-111.904565	2012-04-21 17:35:46
4	984232	93870	380645	33.448377	-112.074037	2012-04-21 17:38:18