notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd



In [2]:

    
data = pd.read_csv('checkins.dat', sep='|',skipinitialspace=True).dropna()









    



C:\Users\Factorion\Anaconda2\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [3]:

    
data.head()









    Out[3]:







  
    
      
      id
      user_id
      venue_id
      latitude
      longitude
      created_at
    
  
  
    
      2
      984222
      15824.0
      5222.0
      38.895112
      -77.036366
      2012-04-21 17:43:47
    
    
      4
      984234
      44652.0
      5222.0
      33.800745
      -84.410520
      2012-04-21 17:43:43
    
    
      8
      984291
      105054.0
      5222.0
      45.523452
      -122.676207
      2012-04-21 17:39:22
    
    
      10
      984318
      2146539.0
      5222.0
      40.764462
      -111.904565
      2012-04-21 17:35:46
    
    
      11
      984232
      93870.0
      380645.0
      33.448377
      -112.074037
      2012-04-21 17:38:18



In [4]:

    
len(data)









    Out[4]:





396634



In [5]:

    
cols = []
for i in range(0,len(data.columns)):
    cols.append(data.columns[i].strip())
data.columns = cols



In [6]:

    
data.columns









    Out[6]:





Index([u'id', u'user_id', u'venue_id', u'latitude', u'longitude',
       u'created_at'],
      dtype='object')



In [7]:

    
X = data[['latitude','longitude']]
X.head()









    Out[7]:







  
    
      
      latitude
      longitude
    
  
  
    
      2
      38.895112
      -77.036366
    
    
      4
      33.800745
      -84.410520
    
    
      8
      45.523452
      -122.676207
    
    
      10
      40.764462
      -111.904565
    
    
      11
      33.448377
      -112.074037



In [8]:

    
X = X[:100000]



In [9]:

    
from sklearn.cluster import MeanShift, estimate_bandwidth



In [10]:

    
MS = MeanShift(bandwidth=0.1)
MS.fit(X)









    Out[10]:





MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)



In [11]:

    
labels = MS.labels_ #label каждой точки
cluster_centers = MS.cluster_centers_ #Координаты кластерных центров.

# сортировка
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
n_clusters_
x_plt = np.array(X)



In [12]:

    
# графическое изображение
import matplotlib.pyplot as plt
from itertools import cycle

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(x_plt[my_members, 0], x_plt[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()



In [13]:

    
# кол-во в каждом кластере
points = dict()
for i in xrange(0,len(labels)):
    if points.has_key(labels[i]):
        points[labels[i]] += 1
    else:
        points[labels[i]] = 1



In [14]:

    
# убирает, если < 15
for i in xrange(0,len(points)):
    if points[i]<16:
        points.pop(i)



In [15]:

    
# кооординаты "лучших центров"
opt_points = []
j = 0
for i in xrange(0,len(points)):
    opt_points.append(cluster_centers[i])
    
len(points)









    Out[15]:





593



In [16]:

    
# офисы компании
offices = [[33.751277, -118.188740],[25.867736, -80.324116],
           [51.503016, -0.075479],[52.378894, 4.885084],
           [39.366487, 117.036146],[-33.868457, 151.205134]]
offices









    Out[16]:





[[33.751277, -118.18874],
 [25.867736, -80.324116],
 [51.503016, -0.075479],
 [52.378894, 4.885084],
 [39.366487, 117.036146],
 [-33.868457, 151.205134]]



In [17]:

    
distance_points = []
i = 0
for opt_point in opt_points:
    min_dist = 10000000
    for office in offices:
        dist = np.sqrt((opt_point[0] - office[0])**2 + (opt_point[1] - office[1])**2)
        if dist<min_dist:
            min_dist=dist
    distance_points.append((min_dist,opt_point))



In [18]:

    
distance_points.sort()



In [19]:

    
ans = distance_points[0]
ans = np.array(ans[1])
ans[0]









    Out[19]:





-33.860630428571433



In [20]:

    
with open("ans.txt", "w") as f:
        f.write(str(ans[0]) + ' ' + str(ans[0]))

	id	user_id	venue_id	latitude	longitude	created_at
2	984222	15824.0	5222.0	38.895112	-77.036366	2012-04-21 17:43:47
4	984234	44652.0	5222.0	33.800745	-84.410520	2012-04-21 17:43:43
8	984291	105054.0	5222.0	45.523452	-122.676207	2012-04-21 17:39:22
10	984318	2146539.0	5222.0	40.764462	-111.904565	2012-04-21 17:35:46
11	984232	93870.0	380645.0	33.448377	-112.074037	2012-04-21 17:38:18