notebook.community

Edit and run



In [7]:

    
#download link  https://archive.org/details/201309_foursquare_dataset_umn
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift



In [13]:

    
f = open("checkins.dat",'r')
filedata = f.read()
f.close()

newdata = filedata.replace(" ","")

f = open("checkins.dat",'w')
f.write(newdata)
f.close()



In [14]:

    
df_train = pd.read_csv("checkins.dat")



In [28]:

    
df_train.head()









    Out[28]:






  
    
      
      id
      user_id
      venue_id
      latitude
      longitude
      created_at
    
  
  
    
      0
      984301
      2041916
      5222
      NaN
      NaN
      2012-04-2117:39:01
    
    
      1
      984222
      15824
      5222
      38.895112
      -77.036366
      2012-04-2117:43:47
    
    
      2
      984315
      1764391
      5222
      NaN
      NaN
      2012-04-2117:37:18
    
    
      3
      984234
      44652
      5222
      33.800745
      -84.410520
      2012-04-2117:43:43
    
    
      4
      984249
      2146840
      5222
      NaN
      NaN
      2012-04-2117:42:58



In [29]:

    
df_train[['latitude','longitude']] = df_train[['latitude','longitude']][df_train.latitude == df_train.latitude or df_train.longitude == df_train.longitude]









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-29-d28cfca19337> in <module>()
----> 1 df_train[['latitude','longitude']] = df_train[['latitude','longitude']][df_train.latitude == df_train.latitude or df_train.longitude == df_train.longitude]

C:\Users\SBT-Ashrapov-IR\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\generic.pyc in __nonzero__(self)
    728         raise ValueError("The truth value of a {0} is ambiguous. "
    729                          "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
--> 730                          .format(self.__class__.__name__))
    731 
    732     __bool__ = __nonzero__

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().



In [33]:

    
df_train = df_train[df_train.latitude == df_train.latitude]
df_train = df_train[df_train.longitude == df_train.longitude]
df_train.head()









    Out[33]:






  
    
      
      id
      user_id
      venue_id
      latitude
      longitude
      created_at
    
  
  
    
      1
      984222
      15824
      5222
      38.895112
      -77.036366
      2012-04-2117:43:47
    
    
      3
      984234
      44652
      5222
      33.800745
      -84.410520
      2012-04-2117:43:43
    
    
      7
      984291
      105054
      5222
      45.523452
      -122.676207
      2012-04-2117:39:22
    
    
      9
      984318
      2146539
      5222
      40.764462
      -111.904565
      2012-04-2117:35:46
    
    
      10
      984232
      93870
      380645
      33.448377
      -112.074037
      2012-04-2117:38:18



In [35]:

    
df_train.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 396634 entries, 1 to 1021964
Data columns (total 6 columns):
id            396634 non-null int64
user_id       396634 non-null int64
venue_id      396634 non-null int64
latitude      396634 non-null float64
longitude     396634 non-null float64
created_at    396634 non-null object
dtypes: float64(2), int64(3), object(1)
memory usage: 19.7+ MB



In [38]:

    
ms = MeanShift(bandwidth=0.1)
ms.fit(df_train[['latitude','longitude']])
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)









    



number of estimated clusters : 5536



In [47]:

    
def dist(a, b):
    return ((a[0]-b[0])**2 + (a[1]-b[1])**2)**0.5



In [63]:

    
X = df_train[['latitude','longitude']]



In [71]:

    
clusSize = np.bincount(labels)
clusSize









    Out[71]:





array([56187, 10895, 15282, ...,     1,     1,     1])



In [136]:

    
# 33.751277, -118.188740 (Los Angeles)

# 25.867736, -80.324116 (Miami)

# 51.503016, -0.075479 (London)

# 52.378894, 4.885084 (Amsterdam)

# 39.366487, 117.036146 (Beijing)

# -33.868457, 151.205134 (Sydney)
offices = np.array([[33.751277, -118.188740],[25.867736, -80.324116],[51.503016, -0.075479],[52.378894, 4.885084],[39.366487, 117.036146],[-33.868457, 151.205134]])
k = 0
banners = []
for cnt in cluster_centers:
    for office in offices:        
        if (clusSize[k] > 15):
            banners.append([k, dist(cnt,office)])
    k = k + 1

banners = np.asarray(banners)
#banners_sorted = np.sort(banners.view('i4,i4'), order=['f1'], axis=0).view(np.float)
banners[:20]









    Out[136]:





array([[   0.        ,   44.74498122],
       [   0.        ,   16.14447836],
       [   0.        ,   74.69664865],
       [   0.        ,   79.73183932],
       [   0.        ,  191.03028213],
       [   0.        ,  237.22483488],
       [   1.        ,    6.19352376],
       [   1.        ,   32.57315012],
       [   1.        ,  113.37369432],
       [   1.        ,  118.41045828],
       [   1.        ,  229.11512106],
       [   1.        ,  271.68000546],
       [   2.        ,   31.62109199],
       [   2.        ,   17.59850099],
       [   2.        ,   88.08178947],
       [   2.        ,   93.10893671],
       [   2.        ,  204.68138428],
       [   2.        ,  250.55875423],
       [   3.        ,    6.29357494],
       [   3.        ,   32.47520697]])



In [140]:

    
print banners.shape



In [145]:

    
np.sort(banners.view('i8,i8'), order=['f1'], axis=0).view(np.float)[:20]









    Out[145]:





array([[  2.51000000e+02,   3.02270345e-03],
       [  3.21000000e+02,   9.62460328e-03],
       [  3.16000000e+02,   2.50839856e-02],
       [  5.50000000e+01,   5.16344157e-02],
       [  4.80000000e+01,   7.46435786e-02],
       [  2.40000000e+01,   1.35583491e-01],
       [  9.80000000e+01,   1.81389543e-01],
       [  7.90000000e+01,   1.94083541e-01],
       [  8.52000000e+02,   2.33943524e-01],
       [  9.80000000e+02,   2.64099724e-01],
       [  4.61000000e+02,   2.71823908e-01],
       [  3.60000000e+01,   2.82977830e-01],
       [  5.10000000e+01,   2.92214962e-01],
       [  3.10000000e+01,   3.02268828e-01],
       [  1.10000000e+01,   3.22950983e-01],
       [  3.20000000e+01,   3.42824775e-01],
       [  1.59000000e+02,   3.44131419e-01],
       [  6.60000000e+01,   3.58840860e-01],
       [  1.00100000e+03,   3.64413549e-01],
       [  9.70000000e+01,   3.72901079e-01]])



In [146]:

    
cluster_centers[251]









    Out[146]:





array([ -33.86614607,  151.20708242])

	id	user_id	venue_id	latitude	longitude	created_at
0	984301	2041916	5222	NaN	NaN	2012-04-2117:39:01
1	984222	15824	5222	38.895112	-77.036366	2012-04-2117:43:47
2	984315	1764391	5222	NaN	NaN	2012-04-2117:37:18
3	984234	44652	5222	33.800745	-84.410520	2012-04-2117:43:43
4	984249	2146840	5222	NaN	NaN	2012-04-2117:42:58