In [3]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
df = pd.read_pickle('../data_processeing/Yelp_Cuisine_Chinese.pkl')

In [41]:
df_spatial = df[['latitude', 'longitude', 'cuisine_Chinese']]

In [ ]:
11
#5F4690, #1D6996, #38A6A5, #0F8554, #73AF48, #EDAD08, #E17C05, #CC503E, #94346E, #6F4070, #666666

In [46]:
kmeans = KMeans(n_clusters=11, random_state=5).fit(df[['latitude', 'longitude']])

In [ ]:
for

In [47]:
df_spatial['label'] = kmeans.labels_


/gpfs1/cusp/xz1845/.conda/envs/myPy27/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [48]:
df_spatial[df_spatial['label'] == 0].plot(x='longitude', y='latitude', kind='scatter')


Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5f1ccd7e10>

In [35]:
range(11)


Out[35]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [49]:
for i in range(11):
    temp = df_spatial[df_spatial['cuisine_Chinese'] == 2]
    print len(temp[temp['label'] == i])


80
84
482
853
63
182
194
410
163
139
43

In [45]:
for i in range(11):
    temp = df_spatial[df_spatial['cuisine_Chinese'] == 2]
    print len(temp[temp['label'] == i])


80
84
482
163
63
194
182
410
853
139
43

In [39]:
for i in range(11):
    df_spatial[df_spatial['label'] == i].plot(x='longitude', y='latitude', kind='scatter')



In [29]:
df_spatial.plot(x='longitude', y='latitude', kind='scatter', c='#5F4690')


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5f222809d0>

In [26]:
kmeans.labels_


Out[26]:
array([8, 2, 5, ..., 1, 2, 0], dtype=int32)

In [50]:



Out[50]:
array([3, 2, 6, ..., 1, 2, 0], dtype=int32)

In [52]:
df_ = df_spatial.copy()

In [ ]:


In [54]:
df_ = df_[['label']]

In [57]:
df_.columns = ['spatial_label']

In [58]:
df_.to_pickle('../spatial_labels.pkl')


Out[58]:
spatial_label
EDqCEAGXVGCH4FJXgqtjqg 3
GDnbt3isfhd57T1QqU6flg 2
a1Ba6XeIOP48e64YFD0dMw 6
826djy6K_9Fp0ptqJ2_Yag 3
Mi5uhdFB9OJteXPd0_IKfw 2
Uxh0fXFH_QQBivRnIBpdiw 8
YPavuOh2XsnRbLfl0DH2lQ 2
hovoWva_UjbnyLWEbnFvBw 8
L_thK7r3K_h5M4tV7amEKQ 3
hEcn9k6ONd5n2mq0lB2aew 2
diXqpz9WOr4WLQq6uvARkg 3
se2a2QaJi8Yx_KjE1r-_sQ 8
J3H6VSIgUTlACkb_HPFA8w 2
Ja4stXdNYr39u5CZHMNtjw 2
TegeuAoYBVgQRUBiDLU7Mw 8
7caQImCyJKGHwSlsDNZHhA 1
ZUvAR3I1uhC27zQFcRXumw 8
A2pZTpFXWC38z506XIhnBQ 7
mRv3Z25F56qduMKnvnCkVA 3
Ir_QIzs-4o9ElOtiGuxJrw 6
QfWFxmXqRGixztgaZN0gOA 0
h7uxML49NQ_Wj-7zk0DSqg 5
IikkB8j2E-1QmujRsPh5wQ 3
q_KQbgnaYDlPx8EHTydcBQ 3
2px99IppAcnxR238eq_8_w 2
jDqaz1jKdeWk_fTVCzc1Pg 9
dFArFtgDj1rIahUDHU8L7Q 2
mvsnt4q3A5TzcbPFolpsZQ 2
Eq3qA7F5uZBUbcYXROzntA 8
Ld2hhA3q3cdkptwS1fsYEg 2
... ...
9Z_6rRy7Tl_C6HIgm7y6FA 2
WuPSEvu8pES752S92awAVQ 8
uB2By0pTXV2JeM_CsMg7xw 2
uLUl_dMl6a9m774LGReDVA 2
P0ytQBoqSbMG9S7Hx-PAdw 8
ofXqwIbHprYF6TP47XudOw 5
ORloM7z9R9eh-G-an5DQLA 8
fh6tKiiZlrGk6fRaxd8buA 2
2OVuHRbdb5GTHUqrCqV-Fg 2
UNG2-2YFN93Z9lIoCuGteg 1
tquWyW3Mm5ka8LJtzodALg 1
A-apTF3JfkUPR_oeazZC1Q 2
22RFH-U5gboNJgws_GBG3w 5
9iJMGMsTK-q6W5MB1_Ny3Q 3
J6tgF98vwUz4D8T-N72org 3
UkoseRjoLT4K2bEzvmyhYA 3
w5CSi-An5meLnxjKSFn0wQ 7
RzXSaz2BjPBHeJzEIUKoQw 2
Q3UkgxNNInsPcUFhsQFcrg 3
GIfZNMP0oIJCje_Xp0Bgrw 8
bVgRVpdPgwHBLa7RHb9CpQ 6
2hUENHFcBIqop8-uWvrf6A 5
J1RDyyPxhioqm8c_fi4P4Q 8
_BJ2lBc1TWwIfxNzXbaaBA 3
C_chktl5jFeRqu1C0rmScA 3
xowy4YxBXXGOYuajsxjwkA 7
g9gAuOEXBbWnLlu2uOftwA 2
-McKyjNSqS1h9dDJH3dyUA 1
eHLUQ2W_hXx61NmiL9kSVg 2
OgwN65jZebPRIPSmNpRP7A 0

27314 rows × 1 columns


In [ ]: