In [7]:
# Dataset download link: https://archive.org/details/201309_foursquare_dataset_umn
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift

In [13]:
# Remove every space character from the raw check-ins file and write the
# result back to the same path, so the later read_csv call sees unpadded fields.
# NOTE(review): this also removes the space between date and time in
# created_at (the loaded values look like '2012-04-2117:39:01') -- confirm
# that this is acceptable before relying on that column.
# The `with` blocks guarantee the handle is closed even if read/write raises.
with open("checkins.dat", 'r') as f:
    filedata = f.read()

newdata = filedata.replace(" ", "")

with open("checkins.dat", 'w') as f:
    f.write(newdata)

In [14]:
# Load the space-stripped check-ins file into a DataFrame using read_csv's
# default parsing options.
df_train = pd.read_csv("checkins.dat")

In [28]:
# Display the first rows of the freshly loaded table.
df_train.head()


Out[28]:
id user_id venue_id latitude longitude created_at
0 984301 2041916 5222 NaN NaN 2012-04-2117:39:01
1 984222 15824 5222 38.895112 -77.036366 2012-04-2117:43:47
2 984315 1764391 5222 NaN NaN 2012-04-2117:37:18
3 984234 44652 5222 33.800745 -84.410520 2012-04-2117:43:43
4 984249 2146840 5222 NaN NaN 2012-04-2117:42:58

In [29]:
# Keep only the check-ins that have both coordinates.
# The previous expression combined two boolean Series with Python's `or`,
# which raises "ValueError: The truth value of a Series is ambiguous";
# dropna(subset=...) expresses the row filter directly and is safe to re-run.
df_train = df_train.dropna(subset=['latitude', 'longitude'])

In [33]:
# Drop every row whose latitude or longitude is missing.
# This keeps the same rows as the NaN self-comparison trick (`x == x` is False
# only for NaN) but states the intent explicitly and does it in a single pass.
df_train = df_train.dropna(subset=['latitude', 'longitude'])
df_train.head()


Out[33]:
id user_id venue_id latitude longitude created_at
1 984222 15824 5222 38.895112 -77.036366 2012-04-2117:43:47
3 984234 44652 5222 33.800745 -84.410520 2012-04-2117:43:43
7 984291 105054 5222 45.523452 -122.676207 2012-04-2117:39:22
9 984318 2146539 5222 40.764462 -111.904565 2012-04-2117:35:46
10 984232 93870 380645 33.448377 -112.074037 2012-04-2117:38:18

In [35]:
# Summarise row count, dtypes and non-null counts after the filtering step.
df_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 396634 entries, 1 to 1021964
Data columns (total 6 columns):
id            396634 non-null int64
user_id       396634 non-null int64
venue_id      396634 non-null int64
latitude      396634 non-null float64
longitude     396634 non-null float64
created_at    396634 non-null object
dtypes: float64(2), int64(3), object(1)
memory usage: 19.7+ MB

In [38]:
# Cluster the check-in coordinates with mean shift.
# The bandwidth is given in the units of the input columns
# (presumably decimal degrees -- confirm against the dataset description).
ms = MeanShift(bandwidth=0.1).fit(df_train[['latitude', 'longitude']])
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# The number of distinct labels is the number of clusters that were found.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size

print("number of estimated clusters : %d" % n_clusters_)


number of estimated clusters : 5536

In [47]:
def dist(a, b):
    """Return the Euclidean distance between two 2-D points.

    Only the first two components of ``a`` and ``b`` are used; both arguments
    may be any indexable pair of numbers (list, tuple, array row).
    """
    delta_first = a[0] - b[0]
    delta_second = a[1] - b[1]
    squared_length = delta_first ** 2 + delta_second ** 2
    return squared_length ** 0.5

In [63]:
# Two-column (latitude, longitude) selection of the filtered check-ins.
X = df_train[['latitude','longitude']]

In [71]:
# Count how many check-ins were assigned to each cluster label;
# clusSize[i] is the number of rows whose label is i.
clusSize = np.bincount(labels)
clusSize


Out[71]:
array([56187, 10895, 15282, ...,     1,     1,     1])

In [136]:
# Office locations as (latitude, longitude) pairs, in the same column order
# that the cluster centres were fitted on.
offices = np.array([
    [33.751277, -118.188740],   # Los Angeles
    [25.867736, -80.324116],    # Miami
    [51.503016, -0.075479],     # London
    [52.378894, 4.885084],      # Amsterdam
    [39.366487, 117.036146],    # Beijing
    [-33.868457, 151.205134],   # Sydney
])

# Clusters are considered only if they hold strictly more than this many check-ins.
MIN_CLUSTER_SIZE = 15

# Build one [cluster index, distance to office] row for every pair of
# (sufficiently large cluster, office). The size test depends only on the
# cluster, so it is checked once per cluster rather than once per office.
banners = []
for k, cnt in enumerate(cluster_centers):
    if clusSize[k] <= MIN_CLUSTER_SIZE:
        continue
    for office in offices:
        banners.append([k, dist(cnt, office)])

# reshape(-1, 2) keeps a two-column layout even when no cluster passed the filter.
banners = np.asarray(banners, dtype=float).reshape(-1, 2)
banners[:20]


Out[136]:
array([[   0.        ,   44.74498122],
       [   0.        ,   16.14447836],
       [   0.        ,   74.69664865],
       [   0.        ,   79.73183932],
       [   0.        ,  191.03028213],
       [   0.        ,  237.22483488],
       [   1.        ,    6.19352376],
       [   1.        ,   32.57315012],
       [   1.        ,  113.37369432],
       [   1.        ,  118.41045828],
       [   1.        ,  229.11512106],
       [   1.        ,  271.68000546],
       [   2.        ,   31.62109199],
       [   2.        ,   17.59850099],
       [   2.        ,   88.08178947],
       [   2.        ,   93.10893671],
       [   2.        ,  204.68138428],
       [   2.        ,  250.55875423],
       [   3.        ,    6.29357494],
       [   3.        ,   32.47520697]])

In [140]:
# Number of (cluster, office) pairs collected; the call form of print produces
# the same text under Python 2 and also runs under Python 3.
print(banners.shape)


(7824, 2)

In [145]:
# Show the 20 (cluster index, distance) rows with the smallest distance.
# lexsort uses its LAST key as the primary one: rows are ordered by distance
# and ties are broken by cluster index. This avoids reinterpreting the float
# bits as integers (only valid for non-negative values) and the np.float alias,
# which newer NumPy releases no longer provide.
banners[np.lexsort((banners[:, 0], banners[:, 1]))][:20]


Out[145]:
array([[  2.51000000e+02,   3.02270345e-03],
       [  3.21000000e+02,   9.62460328e-03],
       [  3.16000000e+02,   2.50839856e-02],
       [  5.50000000e+01,   5.16344157e-02],
       [  4.80000000e+01,   7.46435786e-02],
       [  2.40000000e+01,   1.35583491e-01],
       [  9.80000000e+01,   1.81389543e-01],
       [  7.90000000e+01,   1.94083541e-01],
       [  8.52000000e+02,   2.33943524e-01],
       [  9.80000000e+02,   2.64099724e-01],
       [  4.61000000e+02,   2.71823908e-01],
       [  3.60000000e+01,   2.82977830e-01],
       [  5.10000000e+01,   2.92214962e-01],
       [  3.10000000e+01,   3.02268828e-01],
       [  1.10000000e+01,   3.22950983e-01],
       [  3.20000000e+01,   3.42824775e-01],
       [  1.59000000e+02,   3.44131419e-01],
       [  6.60000000e+01,   3.58840860e-01],
       [  1.00100000e+03,   3.64413549e-01],
       [  9.70000000e+01,   3.72901079e-01]])

In [146]:
# Centre of the cluster that lies closest to any office.
# The index is derived from `banners` (column 0 = cluster index, column 1 =
# distance) instead of being copied by hand from a previous output, so the cell
# stays correct if the cluster numbering changes on a re-run.
closest_cluster = int(banners[banners[:, 1].argmin(), 0])
cluster_centers[closest_cluster]


Out[146]:
array([ -33.86614607,  151.20708242])