In [1]:
import pandas as pd
In [2]:
# Read the preprocessed CSV and convert its 'created_at' column to pandas
# datetimes in the same expression, so `data` is bound exactly once.
data = pd.read_csv('preprocessed_data.csv').assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'])
)
In [3]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()
Out[3]:
In [4]:
# Work on the first 100000 rows and the two columns at positions 3 and 4
# (presumably the coordinate columns -- TODO confirm against the CSV header).
# Selecting with .iloc *before* converting to an array avoids materialising the
# whole frame (which also holds the 'created_at' datetimes) as a single array,
# and lets the selected columns keep their own dtype.
subset = data.iloc[:100000, 3:5].values
In [5]:
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
In [6]:
# Cluster the selected points with mean shift, using a fixed bandwidth of 0.1.
# fit() returns the estimator itself, so construction and fitting are chained.
ms = MeanShift(bandwidth=0.1).fit(subset)
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# The number of clusters is the number of distinct labels that were assigned.
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
In [7]:
import matplotlib.pyplot as plt
from itertools import cycle
# Draw every cluster on figure 1: member points as small dots and the cluster
# centre as a large, black-edged circle in the same colour.
plt.figure(1)
plt.clf()

# Endless colour sequence; one colour is taken per cluster, in label order.
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k in range(n_clusters_):
    col = next(colors)
    my_members = (labels == k)  # boolean mask of the points carrying label k
    cluster_center = cluster_centers[k]

    member_x = subset[my_members, 0]
    member_y = subset[my_members, 1]
    plt.plot(member_x, member_y, col + '.')

    plt.plot(
        cluster_center[0],
        cluster_center[1],
        'o',
        markerfacecolor=col,
        markeredgecolor='k',
        markersize=14,
    )

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [8]:
# Keep only the centres of clusters that contain more than `min_cluster_size`
# points.
#
# Sizes are counted with np.bincount over the label array, padded to one slot
# per cluster centre. Compared with a hand-built dict this
#   * does not raise KeyError for a centre that ended up with no points, and
#   * ignores negative labels (points left unassigned), which would otherwise
#     be counted as a "cluster" with no matching centre.
min_cluster_size = 15

assigned_labels = labels[labels >= 0]
cluster_sizes = np.bincount(assigned_labels, minlength=len(cluster_centers))
cluster_sizes = cluster_sizes[:len(cluster_centers)]

# label -> number of member points, kept available for inspection.
clusters_dict = {
    label: int(size) for label, size in enumerate(cluster_sizes) if size > 0
}

# Boolean mask over the centres; indexing with it copies the selected rows in
# their original order.
large_enough = cluster_sizes > min_cluster_size
count = int(np.count_nonzero(large_enough))
clusters_select = cluster_centers[large_enough]
In [9]:
# Office locations, one row per office with two coordinates each
# (presumably latitude, longitude -- TODO confirm the column order matches
# the columns selected into `subset`).
offices = np.array([
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
])
In [10]:
def distance(point1, point2):
    """Return the Euclidean distance between two points.

    Only the first two components (index 0 and 1) of each point are used.
    This is a plain planar distance; no geodesic correction is applied.
    """
    delta_first = point1[0] - point2[0]
    delta_second = point1[1] - point2[1]
    squared_sum = delta_first ** 2 + delta_second ** 2
    return squared_sum ** 0.5
In [11]:
# Find the retained cluster centre that lies closest to any office.
#
# `min_dist` starts at +infinity instead of 0: with 0 as the "not set yet"
# marker, a centre at distance exactly 0 from an office would be treated as
# unset and overwritten by the next, farther centre. The strict `<` keeps the
# first centre on ties.
if len(clusters_select) == 0:
    # Fail here with a clear message rather than later with an IndexError.
    raise ValueError("no cluster centers to choose from")

answer_index = 0
min_dist = float('inf')
for i, center in enumerate(clusters_select):
    # Distance from this centre to every office; only the nearest one matters.
    distances = [distance(office, center) for office in offices]
    nearest = min(distances)
    if nearest < min_dist:
        min_dist = nearest
        answer_index = i
In [12]:
def write_answer(center):
    """Write the first two components of ``center`` to ``answer.txt``.

    The file is created (or overwritten) in the current working directory and
    holds the two values separated by a single space, with no trailing newline.
    """
    with open("answer.txt", "w") as out_file:
        fields = (str(center[0]), str(center[1]))
        out_file.write(' '.join(fields))
In [13]:
# Persist the selected cluster centre (the row of clusters_select at
# answer_index) to answer.txt.
write_answer(clusters_select[answer_index])