In [7]:
# Dataset download link: https://archive.org/details/201309_foursquare_dataset_umn
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift
In [13]:
# Remove every space character from checkins.dat and write the result back
# to the same path. This overwrites the downloaded file in place, and it also
# removes any spaces that occur inside field values, not just padding.
# The context managers guarantee the handle is closed even if the read or
# write raises.
with open("checkins.dat", 'r') as f:
    filedata = f.read()

newdata = filedata.replace(" ", "")

with open("checkins.dat", 'w') as f:
    f.write(newdata)
In [14]:
# Load the space-stripped check-ins file into a DataFrame using read_csv's
# default parsing options.
# NOTE(review): later cells read `latitude` and `longitude` columns, so the
# file presumably parses into those with the default separator — confirm
# against the downloaded file.
df_train = pd.read_csv("checkins.dat")
In [28]:
# Preview the first rows of the freshly loaded frame.
df_train.head()
Out[28]:
In [29]:
# Drop check-ins that lack a latitude or a longitude; the clustering step
# needs both values for every row.
# The previous form joined two element-wise Series comparisons with Python's
# `or`, which raises ValueError (the truth value of a Series is ambiguous),
# and it assigned the result back to the two columns, which cannot remove rows.
df_train = df_train.dropna(subset=['latitude', 'longitude'])
In [33]:
# A value compared with itself is False only when it is NaN, so each mask
# marks the rows where that coordinate is present. Keep rows where both
# coordinates pass, then preview the result.
lat_present = df_train['latitude'] == df_train['latitude']
lon_present = df_train['longitude'] == df_train['longitude']
df_train = df_train[lat_present & lon_present]
df_train.head()
Out[33]:
In [35]:
# Print the column dtypes and non-null counts of the filtered frame.
df_train.info()
In [38]:
# Cluster the check-in coordinates with mean shift.
# NOTE(review): bandwidth=0.1 is expressed in the units of the coordinate
# columns (presumably degrees) — confirm it is the intended kernel size.
ms = MeanShift(bandwidth=0.1).fit(df_train[['latitude', 'longitude']])

# Per-row cluster label and the coordinates of each cluster centre.
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# The number of distinct labels is the number of clusters found.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size
print("number of estimated clusters : %d" % n_clusters_)
In [47]:
def dist(a, b):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    Parameters
    ----------
    a, b : indexable
        Points whose first two elements are used as coordinates. Any further
        elements are ignored.

    Returns
    -------
    The square root of the summed squared differences of the two coordinates.
    No geographic correction is applied: the inputs are treated as plain
    planar coordinates.
    """
    return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5
In [63]:
# Two-column frame holding only the coordinates.
# NOTE(review): X is not referenced again in the visible cells — confirm it is
# still needed.
X = df_train[['latitude','longitude']]
In [71]:
# Count how many points carry each cluster label: clusSize[i] is the number
# of rows assigned to label i.
clusSize = np.bincount(labels)
clusSize
Out[71]:
In [136]:
# Office locations as (latitude, longitude):
# 33.751277, -118.188740 (Los Angeles)
# 25.867736, -80.324116 (Miami)
# 51.503016, -0.075479 (London)
# 52.378894, 4.885084 (Amsterdam)
# 39.366487, 117.036146 (Beijing)
# -33.868457, 151.205134 (Sydney)
offices = np.array([
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
])

# Build one [cluster index, distance] row for every (cluster, office) pair,
# considering only clusters that contain more than 15 points.
# enumerate ties k to the cluster being visited, so it always indexes
# clusSize and cluster_centers consistently.
banners = []
for k, cnt in enumerate(cluster_centers):
    if clusSize[k] <= 15:
        # The size test does not depend on the office, so skip the whole cluster.
        continue
    for office in offices:
        banners.append([k, dist(cnt, office)])

# Force an (N, 2) float array so column indexing still works when no cluster
# passes the size filter.
banners = np.asarray(banners, dtype=float).reshape(-1, 2)
banners[:20]
Out[136]:
In [140]:
# Report how many (cluster, office) rows were collected. The parenthesised
# call is valid on both Python 2 and Python 3; the bare print statement is a
# SyntaxError on Python 3.
print(banners.shape)
In [145]:
# Show the 20 rows with the smallest distance (column 1), breaking ties by
# cluster index (column 0). lexsort treats its last key as the primary one.
# This sorts the actual float values instead of reinterpreting their bytes as
# integers, and avoids the `np.float` alias that newer NumPy releases removed.
banners[np.lexsort((banners[:, 0], banners[:, 1]))][:20]
Out[145]:
In [146]:
# Show the centre coordinates of the cluster with label 251.
# NOTE(review): 251 is presumably the cluster index read off the top of the
# sorted distance table — confirm; it will change if the clustering changes.
cluster_centers[251]
Out[146]: