Задание по программированию: Размещение баннеров


In [1]:
import pandas as pd

In [2]:
# Read the preprocessed check-in table and run the `created_at` column
# through pd.to_datetime in a single method chain.
data = (
    pd.read_csv('preprocessed_data.csv')
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)

In [3]:
# Preview the first five rows of the loaded table.
data.head(n=5)


Out[3]:
id user_id venue_id latitude longitude created_at
0 984222 15824 5222 38.895112 -77.036366 2012-04-21 17:43:47
1 984234 44652 5222 33.800745 -84.410520 2012-04-21 17:43:43
2 984291 105054 5222 45.523452 -122.676207 2012-04-21 17:39:22
3 984318 2146539 5222 40.764462 -111.904565 2012-04-21 17:35:46
4 984232 93870 380645 33.448377 -112.074037 2012-04-21 17:38:18

In [4]:
# Coordinates (latitude, longitude) of the first N_CHECKINS rows as a float array.
#
# The columns are picked by name rather than by position: calling `.values` on
# the whole frame mixes integer, float and datetime columns, which produces an
# object-dtype array, and the positional slice 3:5 silently breaks if the
# column order ever changes.
N_CHECKINS = 100000
subset = data[['latitude', 'longitude']].iloc[:N_CHECKINS].astype(float).values

In [5]:
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

In [6]:
# Cluster the coordinates with MeanShift at a fixed bandwidth of 0.1.
# fit() returns the fitted estimator, so construction and fitting are chained.
ms = MeanShift(bandwidth=0.1).fit(subset)

# Coordinates of every cluster centre and the per-point cluster assignment.
cluster_centers = ms.cluster_centers_
labels = ms.labels_

# The number of clusters is the number of distinct labels that were assigned.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size

print("number of estimated clusters : %d" % n_clusters_)


number of estimated clusters : 3230

In [7]:
import matplotlib.pyplot as plt
from itertools import cycle

# Draw every cluster's points in a colour taken from a repeating palette and
# overlay the cluster centre as a large black-edged marker of the same colour.
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for cluster_id in range(n_clusters_):
    colour = next(colors)
    in_cluster = labels == cluster_id  # boolean mask of this cluster's points
    centre_x, centre_y = cluster_centers[cluster_id]
    plt.plot(subset[in_cluster, 0], subset[in_cluster, 1], colour + '.')
    plt.plot(centre_x, centre_y, 'o', markerfacecolor=colour,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()



In [8]:
# Keep only the centres of clusters that contain more than MIN_CLUSTER_SIZE points.
MIN_CLUSTER_SIZE = 15

# Count the members of every label in one vectorised pass; np.unique returns
# the labels sorted in ascending order together with their counts.
unique_labels, label_counts = np.unique(labels, return_counts=True)

# Mapping label -> number of points in that cluster.
clusters_dict = dict(zip(unique_labels.tolist(), label_counts.tolist()))

# Labels of the sufficiently large clusters. A centre that received no points
# simply never appears here (a per-centre dictionary lookup would raise
# KeyError for it). Negative labels are excluded because they cannot index a
# centre -- presumably they only occur when MeanShift is configured to leave
# orphan points unassigned; verify against the estimator settings.
large_labels = unique_labels[(label_counts > MIN_CLUSTER_SIZE) & (unique_labels >= 0)]
count = len(large_labels)

# Index-array selection copies the chosen rows, so clusters_select is an
# independent (count, 2) array ordered by ascending cluster label.
clusters_select = cluster_centers[large_labels]

In [9]:
# Office coordinates, one row per office, stored as a (6, 2) float array.
offices = np.array([
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
])

In [10]:
def distance(point1, point2):
    """Return the Euclidean distance between two points.

    Only the components at index 0 and 1 of each point are used; the values
    are treated as plain planar coordinates.
    """
    delta_first = point1[0] - point2[0]
    delta_second = point1[1] - point2[1]
    return (delta_first ** 2 + delta_second ** 2) ** 0.5

In [11]:
# Find the selected cluster centre that lies closest to any office.
#
# min_dist starts at +infinity rather than 0: using 0 as the "not set yet"
# marker would make a genuine distance of exactly 0 look unset, and the next
# (larger) distance would then overwrite the true minimum.
answer_index = 0
min_dist = float('inf')
for i, center in enumerate(clusters_select):
    # Distance from this centre to every office; only the nearest one matters.
    distances = [distance(office, center) for office in offices]
    nearest = min(distances)
    # Strict comparison keeps the first centre when several are equally close.
    if nearest < min_dist:
        min_dist = nearest
        answer_index = i

In [12]:
def write_answer(center):
    """Write the first two components of ``center`` to ``answer.txt``.

    The file is opened in write mode (created or overwritten) and receives the
    two values separated by a single space, with no trailing newline.
    """
    with open("answer.txt", "w") as out:
        out.write(' '.join(str(coord) for coord in (center[0], center[1])))

In [13]:
# Save the selected cluster centre found at answer_index to the answer file.
write_answer(center=clusters_select[answer_index])