In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture
%matplotlib inline

In [2]:
# Load the main table from a pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle('df_1518.pkl')

In [3]:
# Load the precomputed spatial labels (presumably one label per row of `df`,
# sharing its index -- TODO confirm). Same pickle trust caveat as any read_pickle call.
spatial_label = pd.read_pickle('../data_processeing/spatial_labels.pkl')

In [41]:
# Non-null value counts of every column, per spatial cluster.
# Built directly from `df` and `spatial_label` so that this cell does not depend on
# `df_new`, which is only created in a later cell (the original raised NameError on a
# fresh kernel / Restart & Run All).
pd.concat([df, spatial_label], axis=1).groupby('spatial_label').count()


Out[41]:
categories city hours is_open latitude longitude name neighborhood postal_code review_count ... Music_karaoke Music_live Music_no_music Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median cuisine_Chinese
spatial_label
0 892 892 892 892 892 892 892 892 892 892 ... 0 175 807 807 796 792 807 792 892 892
1 1094 1094 1094 1094 1094 1094 1094 1094 1094 1094 ... 0 83 932 932 687 562 932 562 1094 1094
2 5274 5274 5274 5274 5274 5274 5274 5274 5274 5274 ... 1 677 4835 4835 4916 4856 4835 4856 5274 5274
3 6464 6464 6464 6464 6464 6464 6464 6464 6464 6464 ... 2 595 5388 5388 5474 5366 5388 5366 6464 6464
4 952 952 952 952 952 952 952 952 952 952 ... 0 125 683 683 718 639 683 639 952 952
5 2071 2071 2071 2071 2071 2071 2071 2071 2071 2071 ... 0 279 1790 1790 1872 1850 1790 1850 2071 2071
6 2317 2317 2317 2317 2317 2317 2317 2317 2317 2317 ... 0 161 1764 1764 1863 1808 1764 1808 2317 2317
7 3564 3564 3564 3564 3564 3564 3564 3564 3564 3564 ... 3 461 3234 3234 3278 3279 3234 3279 3564 3564
8 2502 2502 2502 2502 2502 2502 2502 2502 2502 2502 ... 0 404 2210 2210 2187 2171 2210 2171 2502 2502
9 1887 1887 1887 1887 1887 1887 1887 1887 1887 1887 ... 0 287 1677 1677 1631 1630 1677 1630 1887 1887
10 297 297 297 297 297 297 297 297 297 297 ... 0 38 270 270 266 264 270 264 297 297

11 rows × 66 columns


In [4]:
# Attach the spatial labels to the main table as extra column(s); with axis=1
# pandas aligns the two objects on their row index.
df_new = pd.concat([df,spatial_label], axis=1)

In [50]:
# Keep only rows rated at least 4 stars, located in spatial cluster 2,
# and whose cuisine_Chinese value equals 2.
high_rated = df_new['stars'] >= 4
in_spatial_cluster_2 = df_new['spatial_label'] == 2
cuisine_match = df_new['cuisine_Chinese'] == 2
df_select = df_new[high_rated & in_spatial_cluster_2 & cuisine_match]

Spectral Clustering


In [51]:
# Feature matrix: the attribute columns plus the star rating as a categorical feature.
attribute_block = df_select.loc[:, 'AgesAllowed': 'review_count_greater_median']
star_block = df_select[['stars']]
X = pd.concat([attribute_block, star_block], axis=1)
# Stringify the rating so that get_dummies one-hot encodes it like the other categoricals.
X['stars'] = X['stars'].apply(str)
X = pd.get_dummies(X, drop_first=True, dummy_na=False)
def true_false(x):
    """Return 1 when ``x`` compares equal to True, otherwise 0."""
    return 1 if x == True else 0
# Recode the flag column as integers: 1 where the value equals True, 0 for anything else.
X['review_count_greater_median'] = X['review_count_greater_median'].apply(true_false)

In [52]:
for n_clusters in range(2,5):  
    spectural_clustering = SpectralClustering(n_clusters=n_clusters, random_state=100, affinity='sigmoid').fit(X)
    labels = spectural_clustering.labels_
    print silhouette_score(X, labels, metric='cityblock')
    print list(labels).count(0)
    print list(labels).count(1)
    print list(labels).count(2)
    print list(labels).count(3)
    print list(labels).count(4)


0.405114162492
172
4
0
0
0
0.2347455604
170
5
1
0
0
0.227827609194
169
3
3
1
0

In [53]:
# Final spectral model with two clusters; keep its labels for the result table.
spectural_clustering = SpectralClustering(n_clusters=2, random_state=999, affinity='sigmoid')
spectural_clustering.fit(X)
labels_spectural = spectural_clustering.labels_

In [54]:
# Size of the filtered table as (rows, columns).
df_select.shape


Out[54]:
(176, 67)

KMeans


In [55]:
## function that gets silhouette scores for a clustering method
def get_silhouette_score(X, cluster_method, method='complete', range_n_clusters=None):
    """Print the average silhouette score for a range of cluster counts.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix handed unchanged to the clustering routine and to
        ``silhouette_score``.
    cluster_method : {'kmeans', 'gaussian_mix', 'hierarchical'}
        Which clustering routine to evaluate.
    method : str, default 'complete'
        Linkage method; only used when ``cluster_method == 'hierarchical'``.
    range_n_clusters : iterable of int, optional
        Cluster counts to try. Defaults to ``range(2, 10)``.

    Returns
    -------
    None. One line is printed per cluster count.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported names (previously the
        function silently did nothing).
    """
    if range_n_clusters is None:
        range_n_clusters = range(2, 10)

    # Pick the routine that turns a cluster count into one label per row of X.
    if cluster_method == 'kmeans':
        def fit_labels(n_clusters):
            return KMeans(n_clusters=n_clusters, random_state=22).fit(X).labels_

    elif cluster_method == 'gaussian_mix':
        def fit_labels(n_clusters):
            gm = GaussianMixture(n_components=n_clusters, random_state=22).fit(X)
            return gm.predict(X)

    elif cluster_method == 'hierarchical':
        # The linkage tree is built once and then cut at every requested size.
        Z = linkage(X, method)

        def fit_labels(n_clusters):
            return fcluster(Z, n_clusters, criterion='maxclust')

    else:
        raise ValueError(
            "unknown cluster_method {!r}; expected 'kmeans', 'gaussian_mix' "
            "or 'hierarchical'".format(cluster_method))

    for n_clusters in range_n_clusters:
        cluster_labels = fit_labels(n_clusters)
        silhouette_avg = silhouette_score(X, cluster_labels, random_state=22)
        print("For n_clusters ={},{} - average silhouette_score :{}".format(
            n_clusters, cluster_method, silhouette_avg))

In [56]:
## choose the number of clusters using silhouette method
# Prints the average silhouette score of KMeans for 2 through 9 clusters.
get_silhouette_score(X, 'kmeans')


For n_clusters =2,kmeans - average silhouette_score :0.127166501834
For n_clusters =3,kmeans - average silhouette_score :0.125831355634
For n_clusters =4,kmeans - average silhouette_score :0.109131456769
For n_clusters =5,kmeans - average silhouette_score :0.0808942656566
For n_clusters =6,kmeans - average silhouette_score :0.0830437671919
For n_clusters =7,kmeans - average silhouette_score :0.0738487530425
For n_clusters =8,kmeans - average silhouette_score :0.084144752272
For n_clusters =9,kmeans - average silhouette_score :0.0840328521262

In [57]:
n_clusters = 2  # number of clusters
#XX= X.ix[:, ] # hour of day data

#train the model.
km=KMeans(n_clusters=n_clusters, random_state=22).fit(X)
labels_km = km.labels_

print list(labels_km).count(0)
print list(labels_km).count(1)


78
98

In [19]:
## Distance of every row to each KMeans centre and to its own (assigned) centre.
# Refit once on the current X and reuse that single fit for both the labels and the
# distances (the original refitted the model a second time just to get the labels).
scor = km.fit_predict(X)                 # assigned cluster label per row
dist_to_centers = km.transform(X)        # distances of each data point to each cluster center

# res_p keeps a positional 0..n-1 index (row order of X), not X's own index.
res_p = pd.DataFrame(dist_to_centers, columns=list(range(n_clusters)))
res_p["cluster"] = scor
# "score" = distance to the centre of the cluster the row was assigned to.
res_p["score"] = dist_to_centers[np.arange(len(scor)), scor]
res_p.sort_values("score",ascending=False)[:5]


Out[19]:
0 1 cluster score
35 3.348413 3.529255 0 3.348413
136 3.130752 3.558051 0 3.130752
64 2.945065 3.514769 0 2.945065
60 2.852182 3.348245 0 2.852182
124 3.534674 2.851160 1 2.851160

In [20]:
# Work on a copy so the extra columns added below do not end up in the feature matrix X.
X_df = X.copy()

In [21]:
# KMeans cluster label of each row (assigned by position, in X's row order).
X_df['km'] = labels_km

In [22]:
# Distance of each row to its assigned centre. `.values` drops res_p's positional
# index so the assignment is by row order rather than by index label.
X_df['distance_KM'] = res_p.score.values

In [23]:
## Get Anomalies
def get_anomaly(df, label_col, dist_or_likelihood, thres):
    """Return the rows whose score deviates strongly from their cluster's mean.

    A row is kept when ``abs(score - cluster_mean) / cluster_std > thres``,
    with mean and (sample) standard deviation computed per value of
    ``label_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the cluster label and the score column.
    label_col : str
        Name of the column with the cluster label.
    dist_or_likelihood : str
        Name of the numeric score column (e.g. a distance or likelihood).
    thres : float
        Number of within-cluster standard deviations above which a row is
        reported.

    Returns
    -------
    pandas.DataFrame
        The anomalous rows with the columns and dtypes of ``df``, grouped by
        label in order of first appearance and in original row order within
        each label. Empty (but with the same columns) when nothing qualifies.
    """
    grouped = df.groupby(label_col)[dist_or_likelihood]
    clustermean = grouped.mean()
    clusterstd = grouped.std()

    # Collect the per-cluster slices and concatenate once at the end; this avoids
    # repeated concat calls and keeps the original dtypes (no empty object-dtype seed frame).
    parts = []
    for label in df[label_col].unique():
        if pd.isnull(label):
            # Rows without a label belong to no cluster and have no mean/std.
            continue
        in_cluster = df[label_col] == label
        deviation = (df[dist_or_likelihood] - clustermean[label]).abs() / clusterstd[label]
        # A NaN deviation (single-member or zero-variance cluster) compares False.
        parts.append(df[in_cluster & (deviation > thres)])

    if not parts:
        return df.iloc[0:0].copy()
    return pd.concat(parts, axis=0)

In [24]:
## get anomalies based on kmeans distance
# Threshold 2: keep rows whose distance lies more than 2 within-cluster standard
# deviations away from their cluster's mean distance.
km_anomalies = get_anomaly(X_df, 'km', 'distance_KM', 2)

In [25]:
# Distances of the rows flagged as KMeans anomalies.
km_anomalies['distance_KM']


Out[25]:
Gz9SlLqxS6wnxPvgdOQNrA    3.348413
ZCzey5aPhd7jYIoHsUfjmQ    2.852182
e6d50rwRTU-fONeSBJmOHQ    2.945065
OtGSQQzV7uF8XPAsNIh7Bw    3.130752
I_a74zmgR-X03LsKISWPcg    2.592479
t4P-8drZzj3TMwA_WU-4Zg    2.584595
7SemopLjhDc3IQkI-JUYdQ    2.730502
baIT89GubjGJV1mpn82Eeg    2.851160
1o03kXj4zA-_SvN1HfqNyg    2.774985
Name: distance_KM, dtype: float64

In [ ]:


In [ ]:

Gaussian Mixture


In [26]:
##### Gaussian Mixture #########

## choose the number of clusters using silhouette method
# Prints the average silhouette score of a Gaussian mixture for 2 through 9 components.
get_silhouette_score(X, 'gaussian_mix')


For n_clusters =2,gaussian_mix - average silhouette_score :0.12887906314
For n_clusters =3,gaussian_mix - average silhouette_score :0.119042473774
For n_clusters =4,gaussian_mix - average silhouette_score :0.0681504944473
For n_clusters =5,gaussian_mix - average silhouette_score :0.0553045067472
For n_clusters =6,gaussian_mix - average silhouette_score :0.061464526033
For n_clusters =7,gaussian_mix - average silhouette_score :0.0713501158681
For n_clusters =8,gaussian_mix - average silhouette_score :0.0512525024615
For n_clusters =9,gaussian_mix - average silhouette_score :0.0632702854815

In [27]:
# Final Gaussian mixture with two components; hard-assign every row to a component.
k = 2
GM = GaussianMixture(n_components=k, random_state=22).fit(X)
label_gm = GM.predict(X)

In [28]:
print list(label_gm).count(0)
print list(label_gm).count(1)
print list(label_gm).count(2)
print list(label_gm).count(3)
print list(label_gm).count(4)


54
122
0
0
0

Isolation Forest


In [29]:
### 3) Isolation Forest
# IsolationForest is imported with the other dependencies in the first cell.

# fit the model
ISF = IsolationForest(max_samples=100, random_state=22)
ISF.fit(X)

## compute anomaly score of the input. The lower, the more abnormal.
score_isf = ISF.decision_function(X)

In [30]:
# Positional index (row number in X) of the lowest, i.e. most anomalous, isolation-forest score.
score_isf.argmin()


Out[30]:
35

KNN Distance


In [31]:
def dist2knn(x, nn, k):
    """Sum of the ``k + 1`` smallest Euclidean distances from ``x`` to rows of ``nn``.

    When ``x`` is itself one of the rows of ``nn`` its own distance (0) is among
    the ``k + 1`` smallest, so the result is the summed distance to its ``k``
    nearest other rows. If ``x`` is not in ``nn`` the sum covers ``k + 1`` rows.

    Parameters
    ----------
    x : 1-D array-like or pandas.Series
        The query point.
    nn : 2-D array-like or pandas.DataFrame
        Candidate points, one per row, with the same number of columns as ``x``.
    k : int
        Number of neighbours (not counting ``x`` itself).

    Returns
    -------
    float (0 when ``nn`` has no rows).
    """
    # Cast to float first: differencing uint8/bool dummy columns directly would wrap around.
    point = np.asarray(x, dtype=float)
    candidates = np.asarray(nn, dtype=float)
    if len(candidates) == 0:
        return 0
    # All distances in one vectorised pass instead of a Python loop over rows.
    dist_ = np.sqrt(((candidates - point) ** 2).sum(axis=1))
    dist_.sort()
    return dist_[:k+1].sum() # +1: remove 0 self

In [32]:
# For every row of X, sum the distances to its nearest rows in X (k=5, see dist2knn).
dist_sum_knn = []
for i in range(len(X)):
    # '\r' plus the trailing comma (Python 2 print) keeps rewriting one progress line
    # instead of emitting a new line per row.
    print '\r{}%'.format(100.0*(i+1)/len(X)),
    dist_sum_knn.append(dist2knn(X.iloc[i,:], X, 5))


100.0%

In [ ]:

Output


In [33]:
# Empty result table sharing X's row index; one column per method is added next.
result = pd.DataFrame(index=X.index)

In [34]:
# Arrays/lists are assigned by position, so they must be in X's row order.
result['clusters_sp'] = labels_spectural
result['clusters_km'] = labels_km
# km_anomalies is a Series-bearing frame, so this assignment aligns on the index:
# only rows flagged as anomalies receive a distance, every other row becomes NaN.
# NOTE(review): if the distance of every row is wanted, X_df['distance_KM'] holds it -- confirm intent.
result['distance_km'] = km_anomalies['distance_KM']
result['clusters_gm'] = label_gm
result['scores_isf'] = score_isf
result['distance_knn'] = dist_sum_knn

In [35]:
# Display the combined per-method results (one row per entry of X's index).
result


Out[35]:
clusters_sp clusters_km distance_km clusters_gm scores_isf distance_knn
2px99IppAcnxR238eq_8_w 0 0 NaN 1 0.121704 7.706742
cXdQ3H0GqcIzRgrvBEaBxw 0 1 NaN 1 0.073304 8.660254
qMJCVx6-ZYvZQHlfrZ_cug 0 1 NaN 1 0.061064 8.928203
fBmyj1okdO1p-GUzfQQohg 0 1 NaN 1 0.141037 3.000000
umrDQGRNied77aVg29_fVw 0 1 NaN 1 0.017409 12.016867
Y1WIH4jstH846oWcDnoZLw 0 0 NaN 1 0.141601 6.242641
fS9a8AGrcwWPf_9vfn5wPQ 0 0 NaN 0 0.157274 2.000000
uWYzS46xqA_YA1D1INgbEQ 0 0 NaN 0 0.133695 5.828427
aVKBmuvdDmEOgOfqaSF_xA 0 1 NaN 1 0.046627 9.700170
n8KZA9G4FvGqDBWIscpz9Q 0 0 NaN 1 0.152312 5.000000
kkEqZmVvVkgmCaOqE13mDg 0 1 NaN 1 0.101646 8.660254
RtnTbhWYw4QjKhoTBV37OA 0 0 NaN 0 0.032846 11.803445
ePt0qmZgU-fjqMoM1M2U3Q 0 0 NaN 1 0.129436 7.610366
2RsLaIEnNUIzjQyCvPCNWA 0 0 NaN 0 0.083720 10.000000
U3TUTWRqbW9WeLeWMxi2tw 0 1 NaN 1 0.075404 10.000000
-oOKqZbYDt08zaWWyLZNIw 0 1 NaN 1 0.118648 5.414214
46LhKfz6MPaLYoS0jegsdw 0 0 NaN 0 0.133699 3.000000
g3w01guDiTLszguA3vUrSg 1 0 NaN 0 0.031306 12.034027
J7rkQISD48jXgdM6UxNAbw 0 1 NaN 1 0.127690 7.071068
aY-lZvoTHAFl9k6dcfcAxQ 0 0 NaN 1 0.148731 6.242641
EAs61Wm1O6tLjCs8t2eP-Q 0 1 NaN 1 0.104349 8.024580
mBFGYVLh588BrvfmGsOhrA 0 1 NaN 1 0.033378 10.944272
2g4ZTD3ePNSbDAvh6qAOKQ 0 1 NaN 1 0.135599 7.071068
JQOmOqfoJgNOdFDQrBRmBw 0 1 NaN 1 0.117779 8.660254
odVKlMYqb4FHJXManv-VRA 0 1 NaN 1 0.097711 8.342417
kzuQsq0KPWLeKFtnLXXyHg 0 1 NaN 1 0.076106 9.732051
bWucOPNoIjd8ECdiDyVq9Q 0 0 NaN 0 0.023342 11.180340
lcSsIw_oC-OT2veksCtRwg 0 1 NaN 1 0.127725 6.242641
8IblwilNlenDVwBbEzQEVg 0 0 NaN 1 0.126073 6.878315
cPEo3-TmIqCwHLFGKZgwGg 0 1 NaN 1 0.152229 6.656854
... ... ... ... ... ... ...
puQ2WGO0L2KvKqNQCJYZRw 0 1 NaN 1 -0.015224 11.607183
YBIhTjmEH3IEZW4Dr5ReRA 0 0 NaN 1 0.139443 6.656854
trsKoo2b9-nMcXCKn8dlfQ 0 1 NaN 1 0.140568 6.656854
4ps8XJFZWi9nNwiWUcycVA 0 0 NaN 1 0.140827 6.656854
atSfDP-SLY4GrvBlBPB31Q 0 1 NaN 1 0.009153 10.944272
LQ-gMAQLbPVxjrBYCJ1ApQ 0 0 NaN 1 0.125072 6.656854
cwWQB8LyFbxxiOzgfZdWCg 0 0 NaN 0 0.123581 5.000000
A2eA3LRbptrexCGw8fu67Q 0 1 NaN 1 0.030261 10.708204
cmsAFAymfVoBitSF8R6GKw 0 0 NaN 1 0.156546 4.000000
ecJri9ozyke4dOCWulZiRQ 0 0 NaN 0 0.141980 7.071068
HI56ArpYEbhEUeRx9Curpw 0 1 NaN 1 0.125528 6.656854
gBEWJ4b2OvUmN4Oh7ju3hw 0 0 NaN 1 0.132683 6.656854
KXLjOJbl9VYXFxKT4YK99Q 0 1 NaN 1 0.114987 7.706742
TdjydrOFUSMUsTKdlXW6aQ 0 1 NaN 1 0.086056 9.464102
K3J4zvnQ_G1d5xfffPf2Wg 0 1 NaN 1 -0.004559 11.977060
1o03kXj4zA-_SvN1HfqNyg 0 1 2.774985 1 -0.028638 13.397847
J5lgLvoKPzNqGCGPbznh-Q 0 1 NaN 1 0.011846 10.889744
lEtpTFWetCf6xnzeImLiHg 0 0 NaN 0 0.103828 7.610366
MBsIf01fcA8Vy7vCMMKf2A 0 0 NaN 0 0.141687 7.071068
6ZIHxvFTHC1pvAzAS0uLDA 0 1 NaN 1 0.052726 9.732051
JCmyrBoNl2qfZ-f8I5RA3A 0 0 NaN 0 0.096450 7.610366
puPY7gVy7RCz8bK1zkzdoQ 0 1 NaN 1 0.134854 6.242641
OBnqMdIsS8vWPrzPtnlb-w 0 1 NaN 1 0.109639 8.660254
BvrG8XMpTtlJYWhOCbnpaQ 0 0 NaN 1 0.144361 6.974691
EnOCjP3wBpoICvEZXil4Cg 0 0 NaN 1 0.128433 8.660254
yqYtY3-Po4OVPafA9Z-Xyw 0 0 NaN 0 0.092281 8.660254
sWeVNiPulXsD8PWhEtn9Ew 0 1 NaN 1 0.019464 10.944272
Xdzd_3-eh0sxCZOXGhEWkw 0 1 NaN 1 0.074462 10.440255
sQ6DZZLLJRfwUr0eDWKDDg 0 1 NaN 1 0.117368 7.292529
9Z_6rRy7Tl_C6HIgm7y6FA 0 0 NaN 1 0.156546 4.000000

176 rows × 6 columns


In [36]:
# Persist the results. index_label=False omits the header field for the index column
# (the index values themselves are still written).
result.to_csv('Phoenix_chi_results.csv', index_label=False, encoding='utf-8' )

In [58]:
# Put the original attributes and the per-method results side by side (aligned on the row index).
anomaly_merged = pd.concat([df_select, result], axis=1)

In [59]:
# Per-feature mean difference: spectral cluster 1 minus spectral cluster 0.
sp_cluster = anomaly_merged['clusters_sp']
X[sp_cluster == 1].mean() - X[sp_cluster == 0].mean()


Out[59]:
review_count_greater_median          0.110465
AgesAllowed_True                    -0.023256
Ambience_classy_full_bar             0.726744
Ambience_classy_none                -0.511628
Ambience_divey_True                 -0.348837
Ambience_hipster_loud               -0.023256
Ambience_hipster_quiet              -0.319767
Ambience_romantic_dressy             0.500000
Ambience_trendy_True                -0.011628
Ambience_upscale_True                0.023256
BYOB_True                            0.604651
BYOBCorkage_True                     0.197674
BestNights_monday_True              -0.005814
BestNights_saturday_True            -0.011628
BestNights_sunday_True               0.046512
BestNights_thursday_yes_corkage     -0.005814
BestNights_thursday_yes_free        -0.040698
BestNights_tuesday_True             -0.005814
BestNights_wednesday_True           -0.011628
BikeParking_True                    -0.023256
BusinessAcceptsCreditCards_True     -0.360465
BusinessParking_garage_True         -0.854651
BusinessParking_lot_free            -0.075581
BusinessParking_lot_paid            -0.005814
BusinessParking_valet_True          -0.029070
BusinessParking_validated_True      -0.005814
Corkage_True                        -0.569767
DietaryRestrictions_halal_True      -0.994186
DietaryRestrictions_kosher_True      0.488372
DietaryRestrictions_soy-free_True   -0.988372
DietaryRestrictions_vegan_True      -0.011628
GoodForMeal_breakfast_True          -0.023256
GoodForMeal_brunch_True             -0.081395
GoodForMeal_dessert_True             0.023256
GoodForMeal_dinner_True             -0.011628
GoodForMeal_latenight_True          -0.040698
GoodForMeal_lunch_True              -0.081395
HappyHour_True                      -0.296512
Music_background_music_True          0.215116
Music_jukebox_True                  -0.011628
Music_live_True                     -0.005814
Music_no_music_True                 -0.267442
Music_video_True                     0.238372
OutdoorSeating_True                  0.000000
RestaurantsDelivery_True             0.488372
RestaurantsGoodForGroups_True        0.220930
Smoking_True                        -0.779070
stars_4.5                           -0.209302
stars_5.0                           -0.017442
dtype: float64

In [60]:
# KMeans distances in ascending order; rows without a stored distance (NaN) sort last.
anomaly_merged['distance_km'].sort_values()


Out[60]:
t4P-8drZzj3TMwA_WU-4Zg    2.584595
I_a74zmgR-X03LsKISWPcg    2.592479
7SemopLjhDc3IQkI-JUYdQ    2.730502
1o03kXj4zA-_SvN1HfqNyg    2.774985
baIT89GubjGJV1mpn82Eeg    2.851160
ZCzey5aPhd7jYIoHsUfjmQ    2.852182
e6d50rwRTU-fONeSBJmOHQ    2.945065
OtGSQQzV7uF8XPAsNIh7Bw    3.130752
Gz9SlLqxS6wnxPvgdOQNrA    3.348413
2px99IppAcnxR238eq_8_w         NaN
cXdQ3H0GqcIzRgrvBEaBxw         NaN
qMJCVx6-ZYvZQHlfrZ_cug         NaN
fBmyj1okdO1p-GUzfQQohg         NaN
umrDQGRNied77aVg29_fVw         NaN
Y1WIH4jstH846oWcDnoZLw         NaN
fS9a8AGrcwWPf_9vfn5wPQ         NaN
uWYzS46xqA_YA1D1INgbEQ         NaN
aVKBmuvdDmEOgOfqaSF_xA         NaN
n8KZA9G4FvGqDBWIscpz9Q         NaN
kkEqZmVvVkgmCaOqE13mDg         NaN
RtnTbhWYw4QjKhoTBV37OA         NaN
ePt0qmZgU-fjqMoM1M2U3Q         NaN
2RsLaIEnNUIzjQyCvPCNWA         NaN
U3TUTWRqbW9WeLeWMxi2tw         NaN
-oOKqZbYDt08zaWWyLZNIw         NaN
46LhKfz6MPaLYoS0jegsdw         NaN
g3w01guDiTLszguA3vUrSg         NaN
J7rkQISD48jXgdM6UxNAbw         NaN
aY-lZvoTHAFl9k6dcfcAxQ         NaN
EAs61Wm1O6tLjCs8t2eP-Q         NaN
                            ...   
Xfod0UWr_9B-TOM6qxVIwQ         NaN
puQ2WGO0L2KvKqNQCJYZRw         NaN
YBIhTjmEH3IEZW4Dr5ReRA         NaN
trsKoo2b9-nMcXCKn8dlfQ         NaN
4ps8XJFZWi9nNwiWUcycVA         NaN
atSfDP-SLY4GrvBlBPB31Q         NaN
LQ-gMAQLbPVxjrBYCJ1ApQ         NaN
cwWQB8LyFbxxiOzgfZdWCg         NaN
A2eA3LRbptrexCGw8fu67Q         NaN
cmsAFAymfVoBitSF8R6GKw         NaN
ecJri9ozyke4dOCWulZiRQ         NaN
HI56ArpYEbhEUeRx9Curpw         NaN
gBEWJ4b2OvUmN4Oh7ju3hw         NaN
KXLjOJbl9VYXFxKT4YK99Q         NaN
TdjydrOFUSMUsTKdlXW6aQ         NaN
K3J4zvnQ_G1d5xfffPf2Wg         NaN
J5lgLvoKPzNqGCGPbznh-Q         NaN
lEtpTFWetCf6xnzeImLiHg         NaN
MBsIf01fcA8Vy7vCMMKf2A         NaN
6ZIHxvFTHC1pvAzAS0uLDA         NaN
JCmyrBoNl2qfZ-f8I5RA3A         NaN
puPY7gVy7RCz8bK1zkzdoQ         NaN
OBnqMdIsS8vWPrzPtnlb-w         NaN
BvrG8XMpTtlJYWhOCbnpaQ         NaN
EnOCjP3wBpoICvEZXil4Cg         NaN
yqYtY3-Po4OVPafA9Z-Xyw         NaN
sWeVNiPulXsD8PWhEtn9Ew         NaN
Xdzd_3-eh0sxCZOXGhEWkw         NaN
sQ6DZZLLJRfwUr0eDWKDDg         NaN
9Z_6rRy7Tl_C6HIgm7y6FA         NaN
Name: distance_km, dtype: float64

In [81]:
# Full record of the entry with the largest KMeans distance in the listing above.
anomaly_merged.loc['Gz9SlLqxS6wnxPvgdOQNrA', :]


Out[81]:
categories                        [Restaurants, Cocktail Bars, Bars, Nightlife, ...
city                                                                        Phoenix
hours                             [Tuesday 16:0-2:0, Wednesday 16:0-2:0, Thursda...
is_open                                                                           1
latitude                                                                     33.447
longitude                                                                  -112.074
name                                              Bitter & Twisted Cocktail Parlour
neighborhood                                                                       
postal_code                                                                   85003
review_count                                                                    513
stars                                                                             4
state                                                                            AZ
AgesAllowed                                                                     NaN
Ambience_casual                                                                 NaN
Ambience_classy                                                            full_bar
Ambience_divey                                                                False
Ambience_hipster                                                            average
Ambience_intimate                                                             False
Ambience_romantic                                                               NaN
Ambience_touristy                                                             False
Ambience_trendy                                                                 NaN
Ambience_upscale                                                               True
BYOB                                                                           True
BYOBCorkage                                                                    True
BestNights_friday                                                             False
BestNights_monday                                                               NaN
BestNights_saturday                                                           False
BestNights_sunday                                                              True
BestNights_thursday                                                             NaN
BestNights_tuesday                                                            False
                                                        ...                        
DietaryRestrictions_soy-free                                                    NaN
DietaryRestrictions_vegan                                                       NaN
DietaryRestrictions_vegetarian                                                  NaN
GoodForMeal_breakfast                                                         False
GoodForMeal_brunch                                                            False
GoodForMeal_dessert                                                             NaN
GoodForMeal_dinner                                                              NaN
GoodForMeal_latenight                                                           NaN
GoodForMeal_lunch                                                             False
HappyHour                                                                       NaN
Music_background_music                                                         True
Music_dj                                                                        NaN
Music_jukebox                                                                 False
Music_karaoke                                                                   NaN
Music_live                                                                    False
Music_no_music                                                                False
Music_video                                                                    True
OutdoorSeating                                                                  NaN
RestaurantsDelivery                                                            True
RestaurantsGoodForGroups                                                      False
Smoking                                                                       False
review_count_greater_median                                                    True
cuisine_Chinese                                                                   2
spatial_label                                                                     2
clusters_sp                                                                       1
clusters_km                                                                       0
distance_km                                                                 3.34841
clusters_gm                                                                       0
scores_isf                                                               -0.0931457
distance_knn                                                                15.5956
Name: Gz9SlLqxS6wnxPvgdOQNrA, dtype: object

In [62]:
# Summed nearest-neighbour distances in ascending order (largest = most isolated, at the bottom).
anomaly_merged['distance_knn'].sort_values()


Out[62]:
I4EmpNOW7zCcx7fUnQ9ixg     2.000000
fS9a8AGrcwWPf_9vfn5wPQ     2.000000
yvwK4MO2qnSZE5ggYGyk1A     2.000000
I7eqTdTmRTWYwF9HNZAHxw     2.000000
dYU5hXVyPsm7C-T-13I_YQ     3.000000
fBmyj1okdO1p-GUzfQQohg     3.000000
zfiSQ1dl3vTJ-og96eqXGA     3.000000
myw0csRck6XG4w5Z-EFyqg     3.000000
SMXmELJ8jFBPDq3VWRlLeQ     3.000000
46LhKfz6MPaLYoS0jegsdw     3.000000
JWsq_5AX1vDi-cG7iMJQCA     3.732051
0NgePUfXd2TG0kWgsQRq0A     3.732051
3dk9XrtorfOJu8yX6BVYRg     3.732051
24YIAXzArY1w2GXzoUbcmw     4.000000
9Z_6rRy7Tl_C6HIgm7y6FA     4.000000
cmsAFAymfVoBitSF8R6GKw     4.000000
ZN8EHmoe4Xjy4q4Y8ROEfw     4.000000
5zk3LxFzijmY2JipVZ7Xdg     4.414214
AwmLDzqJ0aMGZTYYoZnxWg     4.414214
KkmSbkWOWC3DtY-Sxa9PmQ     4.414214
dUOueknc1hk6m78AaiIwuQ     4.414214
twrfUEK_7kTOLOL96oPYyw     4.414214
JhEtJbGjAqKDHCZ11i3qoQ     4.414214
dHTGzM4_3kKEiQvab9mi9A     5.000000
cwWQB8LyFbxxiOzgfZdWCg     5.000000
n8KZA9G4FvGqDBWIscpz9Q     5.000000
3JxKzWquEbPC3yPIfoCiLw     5.000000
-oOKqZbYDt08zaWWyLZNIw     5.414214
ddLH_-W5Gr9Az4M5Xo981w     5.414214
3c8GQB0RiXK5ceRSpo5dow     5.828427
                            ...    
kxAqrCuEqnLI_F3vEfDtRQ    10.944272
mBFGYVLh588BrvfmGsOhrA    10.944272
FsCujpVh9Za2Dl5MIYLCxA    11.157694
vn5Ebwy-IdtJ6PQE6y7vaA    11.180340
0HgLKI1T9WT3hJXskDtUrQ    11.180340
JkzqS4JDOpoHrj534rzvAA    11.180340
bWucOPNoIjd8ECdiDyVq9Q    11.180340
SIBaomIYVvsMU0GTuqX4vQ    11.180340
t4P-8drZzj3TMwA_WU-4Zg    11.180340
uytrX0s6etYPCcMVC07KJw    11.371115
GA_Nx4xA3Z4pn9i5XKF1Wg    11.584537
puQ2WGO0L2KvKqNQCJYZRw    11.607183
RtnTbhWYw4QjKhoTBV37OA    11.803445
_XN-GwzZwAyIqLKJsl2htg    11.820605
TuQ6d7Frx3Gds7uyWB1TQA    11.820605
K3J4zvnQ_G1d5xfffPf2Wg    11.977060
k_SfP93Tmlpmx-5OL1JmTg    11.994221
umrDQGRNied77aVg29_fVw    12.016867
g3w01guDiTLszguA3vUrSg    12.034027
6oKZoCI_0ePyzfRqSFMBig    12.034027
VMr8sgoQOW0fo0qVJENajA    12.213128
baIT89GubjGJV1mpn82Eeg    12.426550
I_a74zmgR-X03LsKISWPcg    12.426550
7SemopLjhDc3IQkI-JUYdQ    13.184425
1o03kXj4zA-_SvN1HfqNyg    13.397847
xhyzmAnZp2snpBklfcr3Sw    13.411432
ZCzey5aPhd7jYIoHsUfjmQ    13.959460
e6d50rwRTU-fONeSBJmOHQ    14.131033
OtGSQQzV7uF8XPAsNIh7Bw    14.419796
Gz9SlLqxS6wnxPvgdOQNrA    15.595626
Name: distance_knn, dtype: float64

In [63]:
# Isolation-forest scores in ascending order (lowest = most abnormal, at the top).
anomaly_merged['scores_isf'].sort_values()


Out[63]:
Gz9SlLqxS6wnxPvgdOQNrA   -0.093146
OtGSQQzV7uF8XPAsNIh7Bw   -0.066612
baIT89GubjGJV1mpn82Eeg   -0.064454
e6d50rwRTU-fONeSBJmOHQ   -0.060331
1o03kXj4zA-_SvN1HfqNyg   -0.028638
ZCzey5aPhd7jYIoHsUfjmQ   -0.020232
I_a74zmgR-X03LsKISWPcg   -0.019128
puQ2WGO0L2KvKqNQCJYZRw   -0.015224
xhyzmAnZp2snpBklfcr3Sw   -0.013029
K3J4zvnQ_G1d5xfffPf2Wg   -0.004559
7SemopLjhDc3IQkI-JUYdQ   -0.004330
VMr8sgoQOW0fo0qVJENajA    0.001784
6oKZoCI_0ePyzfRqSFMBig    0.003718
t4P-8drZzj3TMwA_WU-4Zg    0.006044
atSfDP-SLY4GrvBlBPB31Q    0.009153
J5lgLvoKPzNqGCGPbznh-Q    0.011846
TuQ6d7Frx3Gds7uyWB1TQA    0.013664
nQVPgOqVrNIWhY3F5kS79g    0.014397
k_SfP93Tmlpmx-5OL1JmTg    0.015420
umrDQGRNied77aVg29_fVw    0.017409
sWeVNiPulXsD8PWhEtn9Ew    0.019464
kxAqrCuEqnLI_F3vEfDtRQ    0.022574
_XN-GwzZwAyIqLKJsl2htg    0.023151
bWucOPNoIjd8ECdiDyVq9Q    0.023342
FsCujpVh9Za2Dl5MIYLCxA    0.025138
w3RH9W99ZIKvw0Qgibdn4w    0.030129
A2eA3LRbptrexCGw8fu67Q    0.030261
GA_Nx4xA3Z4pn9i5XKF1Wg    0.030940
g3w01guDiTLszguA3vUrSg    0.031306
RtnTbhWYw4QjKhoTBV37OA    0.032846
                            ...   
4ps8XJFZWi9nNwiWUcycVA    0.140827
myw0csRck6XG4w5Z-EFyqg    0.141037
zfiSQ1dl3vTJ-og96eqXGA    0.141037
fBmyj1okdO1p-GUzfQQohg    0.141037
U4OLUFb9VxMmpMXm1ZF-cQ    0.141312
Y1WIH4jstH846oWcDnoZLw    0.141601
MBsIf01fcA8Vy7vCMMKf2A    0.141687
TulmRC5V0--dnXYd_GOSvA    0.141776
ecJri9ozyke4dOCWulZiRQ    0.141980
kP2wC0luve3mT5fcdrA3nQ    0.142720
9ibMbke1qYbmhJ2Qf3tWfA    0.142972
twrfUEK_7kTOLOL96oPYyw    0.143600
JhEtJbGjAqKDHCZ11i3qoQ    0.143600
BvrG8XMpTtlJYWhOCbnpaQ    0.144361
aY-lZvoTHAFl9k6dcfcAxQ    0.148731
HV3Q2JYwK9AvMHPYXcxzag    0.149998
24YIAXzArY1w2GXzoUbcmw    0.151934
ZN8EHmoe4Xjy4q4Y8ROEfw    0.151934
cPEo3-TmIqCwHLFGKZgwGg    0.152229
n8KZA9G4FvGqDBWIscpz9Q    0.152312
9Z_6rRy7Tl_C6HIgm7y6FA    0.156546
cmsAFAymfVoBitSF8R6GKw    0.156546
fS9a8AGrcwWPf_9vfn5wPQ    0.157274
I7eqTdTmRTWYwF9HNZAHxw    0.157274
yvwK4MO2qnSZE5ggYGyk1A    0.157274
I4EmpNOW7zCcx7fUnQ9ixg    0.157274
HTAAMVDZ4P0VX5F9euJBoA    0.157729
3JxKzWquEbPC3yPIfoCiLw    0.162478
dUOueknc1hk6m78AaiIwuQ    0.164037
KkmSbkWOWC3DtY-Sxa9PmQ    0.164037
Name: scores_isf, dtype: float64

In [ ]: