In [1]:
import numpy as np
import pandas as pd
In [2]:
# Read the pipe-delimited table; skipinitialspace skips the spaces that
# follow each '|' delimiter.
data = pd.read_csv('checkins.dat', sep='|', skipinitialspace=True)
# Keep only the rows that have no missing value in any column.
data = data.dropna()
In [3]:
# Preview the first rows of the loaded table.
data.head()
Out[3]:
In [4]:
# Number of rows in the loaded table (rows with missing values were dropped on load).
len(data)
Out[4]:
In [5]:
# Trim surrounding whitespace from every column name so that columns can be
# selected by their bare names (e.g. 'latitude').
cols = [name.strip() for name in data.columns]
data.columns = cols
In [6]:
# Show the column names after whitespace stripping.
data.columns
Out[6]:
In [7]:
# Only the two coordinate columns are used for clustering.
coord_columns = ['latitude', 'longitude']
X = data[coord_columns]
X.head()
Out[7]:
In [8]:
# Keep only the first 100,000 rows (positional slice).
# NOTE(review): presumably a cap to keep clustering time manageable — confirm.
X = X.iloc[:100000]
In [9]:
from sklearn.cluster import MeanShift, estimate_bandwidth
In [10]:
# Fit a mean-shift clustering model on the coordinates with a fixed bandwidth.
# NOTE(review): the bandwidth is expressed in the same units as the columns of
# X (presumably degrees) — confirm.
BANDWIDTH = 0.1
MS = MeanShift(bandwidth=BANDWIDTH)
MS.fit(X)
Out[10]:
In [11]:
# Coordinates of the cluster centers and the cluster label of every point.
cluster_centers = MS.cluster_centers_
labels = MS.labels_

# np.unique returns the distinct labels in sorted order; their count is the
# number of clusters that actually received points.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size

# Plain NumPy copy of the coordinates, used for boolean-mask indexing when plotting.
x_plt = np.array(X)
In [12]:
# Plot the clusters and their centers
import matplotlib.pyplot as plt
from itertools import cycle
# Draw every cluster in its own colour: members as dots, the center as a
# large outlined circle. Column 0 of x_plt goes on the x axis, column 1 on y.
plt.figure(1)
plt.clf()

# Endless colour sequence; one colour is consumed per cluster.
palette = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for cluster_id in range(n_clusters_):
    color = next(palette)
    in_cluster = (labels == cluster_id)
    center = cluster_centers[cluster_id]
    plt.plot(x_plt[in_cluster, 0], x_plt[in_cluster, 1], color + '.')
    plt.plot(
        center[0],
        center[1],
        'o',
        markerfacecolor=color,
        markeredgecolor='k',
        markersize=14,
    )

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [13]:
# Number of points assigned to each cluster: {cluster label -> point count}.
points = dict()
for label in labels:
    # dict.get with a default works on both Python 2 and 3, unlike the
    # Python-2-only dict.has_key()/xrange() pair.
    points[label] = points.get(label, 0) + 1
In [14]:
# Discard small clusters: only clusters with more than MIN_CLUSTER_SIZE
# points (i.e. at least 16) are kept.
MIN_CLUSTER_SIZE = 15
# Rebuild the dict from its own items instead of indexing points[0..n-1]:
# that way the filter does not assume the labels are exactly 0..n-1.
points = {
    label: count
    for label, count in points.items()
    if count > MIN_CLUSTER_SIZE
}
In [15]:
# Coordinates of the centers of the clusters that are still present in `points`.
# Each center is looked up by its own label: the labels left in `points` are
# not guaranteed to be 0..len(points)-1, so taking the first len(points)
# centers by position could select centers of clusters that were removed.
opt_points = [cluster_centers[label] for label in sorted(points)]
len(points)
Out[15]:
In [16]:
# Company offices, one [latitude, longitude] pair per office.
offices = [
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
]
offices
Out[16]:
In [17]:
# For every candidate cluster center, record the distance to its closest
# office as a (distance, center) pair.
# The distance is the plain Euclidean distance on the raw coordinate values.
distance_points = []
for opt_point in opt_points:
    # min() over all offices replaces the hand-rolled running minimum and its
    # arbitrary 10000000 starting sentinel.
    min_dist = min(
        np.sqrt((opt_point[0] - office[0]) ** 2 + (opt_point[1] - office[1]) ** 2)
        for office in offices
    )
    distance_points.append((min_dist, opt_point))
In [18]:
# Order the (distance, center) pairs by distance only. Without a key, a tie
# on distance would make Python compare the center arrays themselves, which
# raises ValueError for multi-element NumPy arrays.
distance_points.sort(key=lambda pair: pair[0])
In [19]:
# First entry of the distance-ordered list: the (distance, center) pair with
# the smallest distance. Keep that center's coordinates as an array.
nearest = distance_points[0]
ans = np.array(nearest[1])
ans[0]
Out[19]:
In [20]:
# Save the selected center as "<latitude> <longitude>".
# The second field is ans[1]; writing ans[0] twice would store the latitude
# in both positions and lose the longitude.
with open("ans.txt", "w") as f:
    f.write(str(ans[0]) + ' ' + str(ans[1]))