In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture
%matplotlib inline

In [31]:
# Load the business table once. `d` keeps every column, `df` only the
# attribute block that runs from 'AcceptsInsurance' to the cuisine label.
# NOTE: pd.read_pickle executes arbitrary code; only use it on trusted files.
d = pd.read_pickle('../data_processeing/Yelp_Cuisine_Japanese.pkl')
df = d.loc[:, 'AcceptsInsurance':'cuisine_Japanese'].copy()

# Keep the columns that are non-null for more than 70% of the rows.
fill_counts = df.count().sort_values()
if_col_keep = 1.0 * fill_counts / len(df) > 0.7
col_keep = fill_counts[if_col_keep]

# Collect the attribute prefixes (text before the first '_') of the kept
# columns. The cuisine label of this dataset is not an attribute, and the
# prefixes listed in `delete_col` are dropped (no error if one is absent).
label_col = 'cuisine_Japanese'
delete_col = ['HairSpecializesIn']
col_prefix = sorted(
    set(c.split('_')[0] for c in col_keep.index if c != label_col)
    - set(delete_col)
)

# Every attribute column (the trailing label column excluded) whose prefix
# survived the filter above.
col_with_prefix = [c for c in df.columns[:-1] if c.split('_')[0] in col_prefix]

# Flag businesses whose review count is above the median review count.
df1 = df.copy()
review_count = df1.join(d.review_count).review_count
md = review_count.median()
df1['review_count_greater_median'] = review_count > md

# Final table: basic business columns + kept attributes + flag + label.
df_basic = d[[u'categories', u'city', u'hours', u'is_open', u'latitude', u'longitude', u'name', u'neighborhood', u'postal_code', u'review_count', u'stars', u'state']]
df_final = df_basic.join(df1[col_with_prefix + ['review_count_greater_median', label_col]])

In [32]:
# Load the pickled spatial labels (pd.read_pickle must only be used on trusted files).
spatial_label = pd.read_pickle('../data_processeing/spatial_labels.pkl')

In [33]:
# Attach the spatial labels as extra column(s) of the business table; axis=1 aligns rows on the index.
df_new = pd.concat([df_final,spatial_label], axis=1)

In [35]:
# Keep rows with at least 4 stars, spatial label 7 and cuisine_Japanese equal to 2.
stars_mask = df_new['stars'] >= 4
region_mask = df_new['spatial_label'] == 7
cuisine_mask = df_new['cuisine_Japanese'] == 2
df_select = df_new[stars_mask & region_mask & cuisine_mask]

Spectral Clustering


In [36]:
# Feature matrix: the attribute columns plus the star rating, one-hot encoded.
attribute_cols = df_select.loc[:, 'AgesAllowed':'review_count_greater_median']
# The rating is turned into text so get_dummies encodes it as a category.
stars_text = df_select['stars'].apply(str)
X = pd.get_dummies(
    pd.concat([attribute_cols, stars_text], axis=1),
    dummy_na=False,
    drop_first=True
)
def true_false(x):
    """Return 1 when ``x`` compares equal to ``True``, otherwise 0."""
    return 1 if x == True else 0
# Recode the review-count flag column as 1/0 through true_false.
X['review_count_greater_median'] = X['review_count_greater_median'].apply(true_false)

In [37]:
# Try 2, 3 and 4 spectral clusters; for each print the silhouette score
# (cityblock metric) followed by the number of samples with label 0..4.
for n_clusters in range(2, 5):
    spectural_clustering = SpectralClustering(
        n_clusters=n_clusters, random_state=100, affinity='sigmoid')
    spectural_clustering.fit(X)
    labels = spectural_clustering.labels_
    print(silhouette_score(X, labels, metric='cityblock'))
    assigned = list(labels)
    for cluster_id in range(5):
        print(assigned.count(cluster_id))


0.344147842509
147
15
0
0
0
0.287056046631
10
140
12
0
0
0.263604236518
140
1
10
11
0

In [38]:
# Final spectral model with two clusters; fit() stores the labels on the estimator.
spectural_clustering = SpectralClustering(
    n_clusters=2, random_state=999, affinity='sigmoid')
spectural_clustering.fit(X)
labels_spectural = spectural_clustering.labels_

In [39]:
# (rows, columns) of the selected subset.
df_select.shape


Out[39]:
(162, 67)

KMeans


In [40]:
## function that gets silhouette scores for a clustering method
def get_silhouette_score(X, cluster_method, method='complete'):
    """Print the average silhouette score for 2 to 9 clusters.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix passed to the clustering model and to
        ``silhouette_score``.
    cluster_method : str
        One of ``'kmeans'``, ``'gaussian_mix'`` or ``'hierarchical'``.
    method : str, optional
        Linkage method handed to ``scipy.cluster.hierarchy.linkage``; only
        used when ``cluster_method == 'hierarchical'``.

    Returns
    -------
    None
        Scores are printed, one line per cluster count.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported names.
    """
    # Range of cluster counts to try.
    range_n_clusters = range(2, 10)

    # Pick the function that turns a cluster count into labels for X.
    if cluster_method == 'kmeans':
        def _labels_for(n_clusters):
            return KMeans(n_clusters=n_clusters, random_state=22).fit(X).labels_
    elif cluster_method == 'gaussian_mix':
        def _labels_for(n_clusters):
            gm = GaussianMixture(n_components=n_clusters, random_state=22).fit(X)
            return gm.predict(X)
    elif cluster_method == 'hierarchical':
        # The linkage matrix does not depend on the cluster count: build it once.
        Z = linkage(X, method)

        def _labels_for(n_clusters):
            return fcluster(Z, n_clusters, criterion='maxclust')
    else:
        raise ValueError(
            "cluster_method must be 'kmeans', 'gaussian_mix' or 'hierarchical', "
            "got {!r}".format(cluster_method))

    for n_clusters in range_n_clusters:
        cluster_labels = _labels_for(n_clusters)
        silhouette_avg = silhouette_score(X, cluster_labels, random_state=22)
        print("For n_clusters ={},".format(n_clusters)+"{} - average silhouette_score :{}".\
              format(cluster_method, silhouette_avg))

In [41]:
## Scan candidate cluster counts for KMeans and print the average silhouette score of each
get_silhouette_score(X, 'kmeans')


For n_clusters =2,kmeans - average silhouette_score :0.165822998091
For n_clusters =3,kmeans - average silhouette_score :0.187805540167
For n_clusters =4,kmeans - average silhouette_score :0.166438115101
For n_clusters =5,kmeans - average silhouette_score :0.120126963588
For n_clusters =6,kmeans - average silhouette_score :0.114257522659
For n_clusters =7,kmeans - average silhouette_score :0.120802327111
For n_clusters =8,kmeans - average silhouette_score :0.117169200864
For n_clusters =9,kmeans - average silhouette_score :0.106731287082

In [43]:
n_clusters = 3  # number of clusters

# Train the final KMeans model and keep the label of every sample.
km = KMeans(n_clusters=n_clusters, random_state=22).fit(X)
labels_km = km.labels_

# Cluster sizes, one line per cluster id (follows n_clusters).
km_assignments = list(labels_km)
for cluster_id in range(n_clusters):
    print(km_assignments.count(cluster_id))


47
94
21

In [44]:
## Distance of every sample to each KMeans centroid and to its own centroid
# Fit once and keep the labels; the same fitted model yields the distances.
scor = km.fit_predict(X)

# km.transform returns one column per centroid: distance from sample to centroid.
dist_to_centers = km.transform(X)
res_p = pd.DataFrame(dist_to_centers, columns=list(range(n_clusters)))
res_p["cluster"] = scor
# "score" is the distance to the centroid the sample was assigned to.
res_p["score"] = dist_to_centers[np.arange(len(scor)), scor]
res_p.sort_values("score", ascending=False)[:5]


Out[44]:
0 1 2 cluster score
128 2.963334 3.460277 3.383299 0 2.963334
146 3.034284 2.949054 2.797796 2 2.797796
130 2.762682 3.366782 3.355031 0 2.762682
0 2.567072 2.529634 2.631141 1 2.529634
6 2.521075 3.048388 3.209962 0 2.521075

In [45]:
# Work on a copy so columns added to X_df do not end up in the feature matrix X.
X_df = X.copy()

In [46]:
# Store each sample's KMeans cluster id.
X_df['km'] = labels_km

In [47]:
# Distance to the assigned centroid; .values assigns by position because
# res_p carries a fresh integer index while X_df keeps X's index.
X_df['distance_KM'] = res_p.score.values

In [48]:
## Get Anomalies
def get_anomaly(df, label_col, dist_or_likelihood, thres):
    """Return the rows whose score is an outlier within their own cluster.

    For every distinct value of ``df[label_col]`` the mean and the sample
    standard deviation of ``df[dist_or_likelihood]`` are computed; a row is
    kept when ``abs(score - cluster_mean) / cluster_std > thres``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one row per sample.
    label_col : str
        Column with the cluster label of each row.
    dist_or_likelihood : str
        Column with the numeric score compared against the cluster statistics.
    thres : number
        Threshold on the absolute z-score.

    Returns
    -------
    pandas.DataFrame
        The anomalous rows with the columns of ``df``. Rows are grouped by
        cluster in order of first appearance of each label and keep their
        original order within a cluster. Empty when nothing exceeds ``thres``.
    """
    grouped_scores = df.groupby(label_col)[dist_or_likelihood]
    clustermean = grouped_scores.mean()
    clusterstd = grouped_scores.std()

    per_cluster = []
    for label in df[label_col].unique():
        in_cluster = df[label_col] == label
        z_score = (df[dist_or_likelihood] - clustermean[label]).abs() / clusterstd[label]
        # Members of single-member or zero-spread clusters get a NaN z-score,
        # which compares False, so such clusters contribute no rows.
        per_cluster.append(df[in_cluster & (z_score > thres)])

    if not per_cluster:
        # No labels at all: empty frame that still carries df's columns.
        return df.iloc[0:0]
    # Concatenate once so the column dtypes of df are preserved.
    return pd.concat(per_cluster, axis=0)

In [49]:
## Flag samples whose centroid distance is more than 2 standard deviations
## away from the mean distance of their own KMeans cluster
km_anomalies = get_anomaly(X_df, 'km', 'distance_KM', 2)

In [50]:
# Centroid distances of the flagged samples.
km_anomalies['distance_KM']


Out[50]:
GFlbBbpmC830wrIR6k5bIQ    2.529634
Uxm9N6jwmJybOv1NcBbnRw    2.373414
grlL6zmHBOer9rPycsRhBA    2.512756
aYyj9OdH059CoEXadmldXA    2.797796
IhYo9Szx_gPDfNOcz5-8DQ    2.963334
mO786Fl-KJV82FrW00RzBw    2.762682
Name: distance_KM, dtype: float64

In [ ]:


In [ ]:

Gaussian Mixture


In [51]:
##### Gaussian Mixture #########

## Scan candidate component counts for the Gaussian mixture and print the average silhouette score of each
get_silhouette_score(X, 'gaussian_mix')


For n_clusters =2,gaussian_mix - average silhouette_score :0.0903079188504
For n_clusters =3,gaussian_mix - average silhouette_score :0.0823246172313
For n_clusters =4,gaussian_mix - average silhouette_score :0.109228035479
For n_clusters =5,gaussian_mix - average silhouette_score :0.0916121753279
For n_clusters =6,gaussian_mix - average silhouette_score :0.104909665675
For n_clusters =7,gaussian_mix - average silhouette_score :0.0959469346231
For n_clusters =8,gaussian_mix - average silhouette_score :0.0820634365561
For n_clusters =9,gaussian_mix - average silhouette_score :0.0760367905315

In [52]:
# Fit a k-component Gaussian mixture, then assign every sample to a component.
k = 4
GM = GaussianMixture(n_components=k, random_state=22).fit(X)
label_gm = GM.predict(X)

In [53]:
# Number of samples assigned to each Gaussian-mixture label, ids 0 through 4.
gm_assignments = list(label_gm)
for cluster_id in range(5):
    print(gm_assignments.count(cluster_id))


47
16
32
67
0

Isolation Forest


In [54]:
### 3) Isolation Forest
from sklearn.ensemble import IsolationForest

# Build and fit the forest in one step (fit returns the estimator itself).
ISF = IsolationForest(max_samples=100, random_state=22).fit(X)

# Anomaly score of every input sample: the lower, the more abnormal.
score_isf = ISF.decision_function(X)

In [55]:
# Positional index of the lowest (most abnormal) isolation-forest score.
score_isf.argmin()


Out[55]:
112

KNN Distance


In [56]:
def dist2knn(x, nn, k):
    """Sum the ``k + 1`` smallest Euclidean distances from ``x`` to the rows of ``nn``.

    ``nn`` is indexed row by row with ``.iloc``. When ``x`` is itself a row of
    ``nn`` its zero self-distance takes one of the ``k + 1`` slots, so the
    result is the total distance to the ``k`` nearest other rows.
    """
    ordered = sorted(
        distance.euclidean(x, nn.iloc[row, :]) for row in range(len(nn))
    )
    return sum(ordered[:k + 1])  # +1: the slot taken by the 0 self-distance

In [57]:
# For every sample, sum the distances to its nearest rows of X (dist2knn with k=5).
dist_sum_knn = []
for i in range(len(X)):
    # Python 2 print statement: '\r' plus the trailing comma keeps the
    # percentage progress on a single line.
    print '\r{}%'.format(100.0*(i+1)/len(X)),
    dist_sum_knn.append(dist2knn(X.iloc[i,:], X, 5))


100.0%

In [ ]:

Output


In [58]:
# Empty results table that shares X's index (one row per sample).
result = pd.DataFrame(index=X.index)

In [59]:
# Collect the output of every method, one column each.
result['clusters_sp'] = labels_spectural
result['clusters_km'] = labels_km
# km_anomalies is a pandas object indexed like X, so this assignment aligns on
# the index: only the flagged samples get a distance, every other row is NaN.
result['distance_km'] = km_anomalies['distance_KM']
result['clusters_gm'] = label_gm
result['scores_isf'] = score_isf
result['distance_knn'] = dist_sum_knn

In [60]:
# Display the combined per-sample results.
result


Out[60]:
clusters_sp clusters_km distance_km clusters_gm scores_isf distance_knn
GFlbBbpmC830wrIR6k5bIQ 0 1 2.529634 2 0.030464 10.889744
4k3RlMAMd46DZ_JyZU0lMg 0 1 NaN 3 0.129966 5.828427
3GfdCuI0YCc5U3rLLLPHUw 0 1 NaN 3 0.076589 10.000000
HGitzBs7x_fUvdTtrTacXg 1 2 NaN 1 -0.027590 8.382332
6tSvz_21BMo3a4GaItwa0g 0 1 NaN 3 0.123768 5.828427
WSGHEQdcdbBWXDpna99EiQ 0 0 NaN 0 0.087490 7.928203
wpnLPAnkMEb1NpC3xiD_qg 1 0 NaN 0 -0.039625 13.032495
Lq9rYrwvWV5QDBYzq7MGHQ 0 1 NaN 2 0.119516 5.414214
7mMxJjeZqnN_nOhWhyU-Vg 0 0 NaN 0 0.018746 11.607183
TtqvXhjmXXd0nhibRAbpRg 0 1 NaN 3 0.111167 7.071068
YkOCo5ipV2he2WXIAlZb-A 0 0 NaN 0 0.067458 7.706742
EvxTD0ETbbFbFjUMWhHbIw 0 0 NaN 0 0.036402 10.708204
Uxm9N6jwmJybOv1NcBbnRw 0 1 2.373414 3 0.033663 12.034027
R9gRb3T8t-eIKlOtMPShuA 0 1 NaN 3 0.142077 4.000000
REXvfvEMPKbcLBD33y6fjA 0 1 NaN 2 0.111728 6.656854
7ibuDLfx8gEhESAufYIySw 0 1 NaN 3 0.148051 4.000000
W8apgXmOxESpoL_EeogC5w 0 1 NaN 3 0.142077 4.000000
5yZ1XmDcOEsElDeb9PlPDQ 0 1 NaN 3 0.116565 6.242641
sYfHxCqHpIX2HnDhw0JRAA 0 1 NaN 2 0.131637 4.828427
BEtgRzNeXGAf0uQ-HuSyfA 0 0 NaN 0 0.063565 9.732051
agGJSdMTRrkA6VrYMbC-SQ 0 1 NaN 3 0.132777 6.242641
M-SS_Kr0zlC7v8kBSRa74g 0 1 NaN 3 0.123886 6.656854
2PS9kBbuJcmBhcNp-D62uA 0 0 NaN 0 0.065227 9.196152
BcW7Z9lPmOB_8eS2lEuOqQ 0 1 NaN 2 0.111421 7.071068
ZAuAwz1ijuydn5yL3q3rzA 0 1 NaN 3 0.120212 6.656854
w6zW6gIyg1sI5V6Wag_SYg 0 1 NaN 3 0.075405 8.196152
B_WggEKFq-ZFNui8CHPYvA 0 0 NaN 0 0.091803 6.974691
6H8xfhoZ2IGa3eNiY5FqLA 0 1 NaN 2 0.132884 7.071068
RrCgc8eAKbHu-2IpQXg6Rw 0 1 NaN 3 0.130651 6.242641
364hhL5st0LV16UcBHRJ3A 0 1 NaN 2 0.066015 9.968119
... ... ... ... ... ... ...
7vHvQCjPq5pWj7Rio1A--w 0 1 NaN 3 0.159102 2.000000
O3OH5IEFMPtz7mPKakPZ3Q 0 1 NaN 3 0.033584 9.146264
wWuGa3OOQJro_XGe-GKBXA 0 2 NaN 2 -0.016156 10.569419
xM37qm9Wbc-hOAS7-Xse7g 0 1 NaN 3 0.098812 7.071068
-DnaKAs2oK3rXfrjSvn9ew 0 0 NaN 0 0.055171 9.464102
7wHLFohwCw8l6WS-feLjeg 0 1 NaN 2 0.028417 10.944272
xR7t4xCYRYEe1RjpYjBbkw 0 1 NaN 2 0.093311 8.024580
Cy8XYYDrZ5wd3Bq-toXMsg 0 1 NaN 2 0.049642 9.700170
BjrKNWhtQkedHw8hP_0Bjg 1 0 NaN 0 0.020568 9.032614
Jx1q_S8-T4RXuhf7P_ZuIQ 0 2 NaN 1 -0.040684 13.201585
eJPckkV1k7gn0V2awA1pzA 0 0 NaN 0 0.066463 8.342417
lmxA0dJM0XsPCIHPXhEQ-g 1 2 NaN 1 -0.024451 12.213128
ugGqbJFuVs4pEyrvDS1TYQ 0 1 NaN 2 0.058785 9.464102
burnDlH36FSphvf4vQf10w 0 1 NaN 3 0.087831 6.974691
aYyj9OdH059CoEXadmldXA 0 2 2.797796 3 -0.039172 14.142136
yU34swtpTS8-SMjeLIEFcQ 0 0 NaN 0 0.094301 7.706742
1nF8Me63cpxIkFK8Ysx-DA 0 1 NaN 3 0.094009 7.388905
FPAOrUsP7GuIyqmK9UoPUQ 0 0 NaN 0 0.056114 9.414214
XKo3y41cF6euqS7I6AX_1Q 0 0 NaN 0 0.041340 10.440255
NTtLuuBKpywlyIz6oyd9kg 0 1 NaN 3 0.081663 8.928203
0xvdC8F0HmFpAFkJk6nXLg 0 1 NaN 3 0.086183 8.660254
D6K3VT6S8FwFm4u5uN8T8g 0 1 NaN 3 0.121443 6.242641
H1NqpoM2hT_A6M98PSrZvw 0 0 NaN 0 0.051091 9.464102
Vs7gc9EE3k9wARuUcN9piA 0 1 NaN 3 0.128016 5.656854
gx2yPrOJSwF1ApJYdGBWIw 0 1 NaN 3 0.120752 6.242641
NIOwzgujIXKVBEVNTQBXpg 0 1 NaN 2 0.038692 9.414214
z_46RY-L3vcMrH3-wxCp9w 0 0 NaN 0 -0.001332 11.820605
3xykzfVY2PbdjKCRDLdzTQ 0 1 NaN 2 0.131637 4.828427
nrahyQyopCtajDqUtVVVfA 0 1 NaN 3 0.159102 2.000000
DIxgItsSI9QwX9H8lVptYg 0 0 NaN 0 0.032490 10.204187

162 rows × 6 columns


In [61]:
# Persist the results; index_label=False leaves the index column without a header field.
result.to_csv('LasVegas_jap_results.csv', index_label=False, encoding='utf-8' )

In [63]:
# Put the selected business columns and the clustering/anomaly results side by side, aligned on the index.
anomaly_merged = pd.concat([df_select, result], axis=1)

In [64]:
# KMeans centroid distances in ascending order; rows that were not flagged hold NaN.
anomaly_merged['distance_km'].sort_values()


Out[64]:
Uxm9N6jwmJybOv1NcBbnRw    2.373414
grlL6zmHBOer9rPycsRhBA    2.512756
GFlbBbpmC830wrIR6k5bIQ    2.529634
mO786Fl-KJV82FrW00RzBw    2.762682
aYyj9OdH059CoEXadmldXA    2.797796
IhYo9Szx_gPDfNOcz5-8DQ    2.963334
4k3RlMAMd46DZ_JyZU0lMg         NaN
3GfdCuI0YCc5U3rLLLPHUw         NaN
HGitzBs7x_fUvdTtrTacXg         NaN
6tSvz_21BMo3a4GaItwa0g         NaN
WSGHEQdcdbBWXDpna99EiQ         NaN
wpnLPAnkMEb1NpC3xiD_qg         NaN
Lq9rYrwvWV5QDBYzq7MGHQ         NaN
7mMxJjeZqnN_nOhWhyU-Vg         NaN
TtqvXhjmXXd0nhibRAbpRg         NaN
YkOCo5ipV2he2WXIAlZb-A         NaN
EvxTD0ETbbFbFjUMWhHbIw         NaN
R9gRb3T8t-eIKlOtMPShuA         NaN
REXvfvEMPKbcLBD33y6fjA         NaN
7ibuDLfx8gEhESAufYIySw         NaN
W8apgXmOxESpoL_EeogC5w         NaN
5yZ1XmDcOEsElDeb9PlPDQ         NaN
sYfHxCqHpIX2HnDhw0JRAA         NaN
BEtgRzNeXGAf0uQ-HuSyfA         NaN
agGJSdMTRrkA6VrYMbC-SQ         NaN
M-SS_Kr0zlC7v8kBSRa74g         NaN
2PS9kBbuJcmBhcNp-D62uA         NaN
BcW7Z9lPmOB_8eS2lEuOqQ         NaN
ZAuAwz1ijuydn5yL3q3rzA         NaN
w6zW6gIyg1sI5V6Wag_SYg         NaN
                            ...   
82YGtjc5KKikNiqBZ33qzw         NaN
7vHvQCjPq5pWj7Rio1A--w         NaN
O3OH5IEFMPtz7mPKakPZ3Q         NaN
wWuGa3OOQJro_XGe-GKBXA         NaN
xM37qm9Wbc-hOAS7-Xse7g         NaN
-DnaKAs2oK3rXfrjSvn9ew         NaN
7wHLFohwCw8l6WS-feLjeg         NaN
xR7t4xCYRYEe1RjpYjBbkw         NaN
Cy8XYYDrZ5wd3Bq-toXMsg         NaN
BjrKNWhtQkedHw8hP_0Bjg         NaN
Jx1q_S8-T4RXuhf7P_ZuIQ         NaN
eJPckkV1k7gn0V2awA1pzA         NaN
lmxA0dJM0XsPCIHPXhEQ-g         NaN
ugGqbJFuVs4pEyrvDS1TYQ         NaN
burnDlH36FSphvf4vQf10w         NaN
yU34swtpTS8-SMjeLIEFcQ         NaN
1nF8Me63cpxIkFK8Ysx-DA         NaN
FPAOrUsP7GuIyqmK9UoPUQ         NaN
XKo3y41cF6euqS7I6AX_1Q         NaN
NTtLuuBKpywlyIz6oyd9kg         NaN
0xvdC8F0HmFpAFkJk6nXLg         NaN
D6K3VT6S8FwFm4u5uN8T8g         NaN
H1NqpoM2hT_A6M98PSrZvw         NaN
Vs7gc9EE3k9wARuUcN9piA         NaN
gx2yPrOJSwF1ApJYdGBWIw         NaN
NIOwzgujIXKVBEVNTQBXpg         NaN
z_46RY-L3vcMrH3-wxCp9w         NaN
3xykzfVY2PbdjKCRDLdzTQ         NaN
nrahyQyopCtajDqUtVVVfA         NaN
DIxgItsSI9QwX9H8lVptYg         NaN
Name: distance_km, dtype: float64

In [65]:
# Summed nearest-neighbour distances in ascending order (largest values at the bottom).
anomaly_merged['distance_knn'].sort_values()


Out[65]:
nrahyQyopCtajDqUtVVVfA     2.000000
5JjGTaZkHbbjBrzBODcsAQ     2.000000
GkRF8rSvh9cOQuuPeDh9bg     2.000000
7vHvQCjPq5pWj7Rio1A--w     2.000000
ugLqbAvBdRDc-gS4hpslXw     4.000000
o6jHRzuoTDCt6xiJA3VwUQ     4.000000
iqQEH6olFhpqwZjFRRFmuA     4.000000
aQ222ydz_GSRZV66xNt4kQ     4.000000
3oajqiPFhYQJsHHiVCchEQ     4.000000
W8apgXmOxESpoL_EeogC5w     4.000000
OK0ba4X5seibH8oMWh4bhw     4.000000
7ibuDLfx8gEhESAufYIySw     4.000000
Bm8nRUsZ-dK6g2eJLxMTOw     4.000000
R9gRb3T8t-eIKlOtMPShuA     4.000000
stK7zVTmMar1ThpzT0Wjvg     4.242641
yHYjDPuRtLvhvMpOn5fXOQ     4.242641
dEAk-gE-5Q95a7p91gNn8A     4.242641
sYfHxCqHpIX2HnDhw0JRAA     4.828427
3xykzfVY2PbdjKCRDLdzTQ     4.828427
dZB5VuI4mCVRz8qQUwUgCg     5.000000
iSQwKRrLYYi-dI2PpJmJJg     5.000000
HmsCerK_rub0Ulo0aC0f9A     5.000000
PChG1Dm0A6AXIXkXGVK8Fw     5.000000
Lq9rYrwvWV5QDBYzq7MGHQ     5.414214
78XbdLi5e42XDxgnxuYacg     5.414214
arv3T_av9OoVo3SvxV4TXw     5.656854
Vs7gc9EE3k9wARuUcN9piA     5.656854
4k3RlMAMd46DZ_JyZU0lMg     5.828427
6tSvz_21BMo3a4GaItwa0g     5.828427
qpub9UXk8oBJr95oq-MbIg     5.828427
                            ...    
wWuGa3OOQJro_XGe-GKBXA    10.569419
rbQOcy2n8dBPpm5ua3VU_Q    10.676323
jX9DocoiY4Bo9EUkaTSqvg    10.676323
EvxTD0ETbbFbFjUMWhHbIw    10.708204
Fv4EXwV30rwGD2NzN1ekgA    10.708204
YYzHvjD16a8Vn6AeaWBKJg    10.708204
GFlbBbpmC830wrIR6k5bIQ    10.889744
7wHLFohwCw8l6WS-feLjeg    10.944272
YTCCJ3ShO-zg0dlx1nk6dw    11.117887
vOMDU31gdylrzBhAKC9QbA    11.180340
xEH00X_VoRepWKXQrragmw    11.316588
gBfPyzPRmeOaj3SdcIj0Rw    11.607183
7mMxJjeZqnN_nOhWhyU-Vg    11.607183
z_46RY-L3vcMrH3-wxCp9w    11.820605
Uxm9N6jwmJybOv1NcBbnRw    12.034027
Er5XAY2UFGGqBHdqpeCFkA    12.034027
ARohEoxqg8TzA5gQfdGAZw    12.034027
lmxA0dJM0XsPCIHPXhEQ-g    12.213128
RSOinkOUpxm0mGw1IWr4Xw    12.230289
aNGcHb8rjFFI9nY_fUgImg    12.622812
grlL6zmHBOer9rPycsRhBA    12.622812
VeiL_tgw7dsl-7IcnOsh0g    12.639972
T419Y3fiJW9EuFDkFougVA    12.836233
9cw870n3gsGikVoOT5YOYg    12.836233
wpnLPAnkMEb1NpC3xiD_qg    13.032495
Jx1q_S8-T4RXuhf7P_ZuIQ    13.201585
9P23-V64kYz3trn9ecaJJA    13.594108
mO786Fl-KJV82FrW00RzBw    13.765681
aYyj9OdH059CoEXadmldXA    14.142136
IhYo9Szx_gPDfNOcz5-8DQ    14.798734
Name: distance_knn, dtype: float64

In [66]:
# Isolation-forest scores in ascending order (lowest scores first).
anomaly_merged['scores_isf'].sort_values()


Out[66]:
YTCCJ3ShO-zg0dlx1nk6dw   -0.065252
IhYo9Szx_gPDfNOcz5-8DQ   -0.065241
mO786Fl-KJV82FrW00RzBw   -0.058650
9P23-V64kYz3trn9ecaJJA   -0.045980
9cw870n3gsGikVoOT5YOYg   -0.044326
Jx1q_S8-T4RXuhf7P_ZuIQ   -0.040684
gBfPyzPRmeOaj3SdcIj0Rw   -0.040620
wpnLPAnkMEb1NpC3xiD_qg   -0.039625
aYyj9OdH059CoEXadmldXA   -0.039172
EZQESVzkOPs7LwCHVC3Szg   -0.028573
HGitzBs7x_fUvdTtrTacXg   -0.027590
lmxA0dJM0XsPCIHPXhEQ-g   -0.024451
T419Y3fiJW9EuFDkFougVA   -0.019428
rbQOcy2n8dBPpm5ua3VU_Q   -0.018711
wWuGa3OOQJro_XGe-GKBXA   -0.016156
aNGcHb8rjFFI9nY_fUgImg   -0.011770
mS_dBSMFiDJn4UVc0iv6Hg   -0.009724
ARohEoxqg8TzA5gQfdGAZw   -0.009628
xEH00X_VoRepWKXQrragmw   -0.005506
RSOinkOUpxm0mGw1IWr4Xw   -0.005236
VeiL_tgw7dsl-7IcnOsh0g   -0.003752
grlL6zmHBOer9rPycsRhBA   -0.002677
z_46RY-L3vcMrH3-wxCp9w   -0.001332
pyqnGlIfP9Zw8LLxBjyAiA    0.003168
TCWMgOiV0PxQkWE1SyBIWQ    0.007371
vRFYqRz5F41ici5IOO2_pg    0.008905
7mMxJjeZqnN_nOhWhyU-Vg    0.018746
j5nPiTwWEFr-VsePew7Sjg    0.020271
Fv4EXwV30rwGD2NzN1ekgA    0.020502
BjrKNWhtQkedHw8hP_0Bjg    0.020568
                            ...   
sYfHxCqHpIX2HnDhw0JRAA    0.131637
3xykzfVY2PbdjKCRDLdzTQ    0.131637
2SbyRgHWuWNlq18eHAx95Q    0.132010
A-uZAD4zP3rRxb44WUGV5w    0.132201
agGJSdMTRrkA6VrYMbC-SQ    0.132777
6H8xfhoZ2IGa3eNiY5FqLA    0.132884
stK7zVTmMar1ThpzT0Wjvg    0.133970
dEAk-gE-5Q95a7p91gNn8A    0.133970
yHYjDPuRtLvhvMpOn5fXOQ    0.133970
HmsCerK_rub0Ulo0aC0f9A    0.135130
PChG1Dm0A6AXIXkXGVK8Fw    0.139258
MXC9pwIxovWUc9yu1F8OxA    0.140717
R9gRb3T8t-eIKlOtMPShuA    0.142077
W8apgXmOxESpoL_EeogC5w    0.142077
tJ9-u9MfpVbX4X2miIJ71w    0.143017
dZB5VuI4mCVRz8qQUwUgCg    0.144487
vdOf_NWsct8jr4qYKoE_iw    0.145402
OK0ba4X5seibH8oMWh4bhw    0.146611
aQ222ydz_GSRZV66xNt4kQ    0.146611
7ibuDLfx8gEhESAufYIySw    0.148051
ugLqbAvBdRDc-gS4hpslXw    0.148051
iSQwKRrLYYi-dI2PpJmJJg    0.150844
3oajqiPFhYQJsHHiVCchEQ    0.151847
Bm8nRUsZ-dK6g2eJLxMTOw    0.151847
o6jHRzuoTDCt6xiJA3VwUQ    0.152652
iqQEH6olFhpqwZjFRRFmuA    0.152652
7vHvQCjPq5pWj7Rio1A--w    0.159102
5JjGTaZkHbbjBrzBODcsAQ    0.159102
nrahyQyopCtajDqUtVVVfA    0.159102
GkRF8rSvh9cOQuuPeDh9bg    0.159102
Name: scores_isf, dtype: float64

In [73]:
# Inspect every column of a single row, selected by its index label.
anomaly_merged.loc['IhYo9Szx_gPDfNOcz5-8DQ', :]


Out[73]:
categories                        [Home Services, Occupational Therapy, Massage,...
city                                                                      Las Vegas
hours                             [Monday 9:0-21:0, Tuesday 9:0-21:0, Wednesday ...
is_open                                                                           1
latitude                                                                    36.1335
longitude                                                                  -115.279
name                                               Shiatsu'ssage 365 Sports Massage
neighborhood                                                               Westside
postal_code                                                                   89117
review_count                                                                      9
stars                                                                           4.5
state                                                                            NV
AgesAllowed                                                                     NaN
Ambience_casual                                                               False
Ambience_classy                                                                 NaN
Ambience_divey                                                                  NaN
Ambience_hipster                                                                NaN
Ambience_intimate                                                               NaN
Ambience_romantic                                                               NaN
Ambience_touristy                                                               NaN
Ambience_trendy                                                                 NaN
Ambience_upscale                                                              False
BYOB                                                                            NaN
BYOBCorkage                                                                     NaN
BestNights_friday                                                               NaN
BestNights_monday                                                               NaN
BestNights_saturday                                                             NaN
BestNights_sunday                                                               NaN
BestNights_thursday                                                             NaN
BestNights_tuesday                                                            False
                                                        ...                        
DietaryRestrictions_soy-free                                                    NaN
DietaryRestrictions_vegan                                                       NaN
DietaryRestrictions_vegetarian                                                  NaN
GoodForMeal_breakfast                                                           NaN
GoodForMeal_brunch                                                              NaN
GoodForMeal_dessert                                                             NaN
GoodForMeal_dinner                                                              NaN
GoodForMeal_latenight                                                           NaN
GoodForMeal_lunch                                                              True
HappyHour                                                                       NaN
Music_background_music                                                          NaN
Music_dj                                                                       True
Music_jukebox                                                                   NaN
Music_karaoke                                                                   NaN
Music_live                                                                      NaN
Music_no_music                                                                 True
Music_video                                                                   False
OutdoorSeating                                                                  NaN
RestaurantsDelivery                                                             NaN
RestaurantsGoodForGroups                                                      False
Smoking                                                                         NaN
review_count_greater_median                                                   False
cuisine_Japanese                                                                  2
spatial_label                                                                     7
clusters_sp                                                                       0
clusters_km                                                                       0
distance_km                                                                 2.96333
clusters_gm                                                                       0
scores_isf                                                               -0.0652408
distance_knn                                                                14.7987
Name: IhYo9Szx_gPDfNOcz5-8DQ, dtype: object

In [ ]: