In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture
%matplotlib inline

In [2]:
# Load the business table once (the same pickle used to be read twice).
# NOTE(review): unpickling can run arbitrary code -- only load trusted files.
d = pd.read_pickle('../data_processeing/Yelp_Cuisine_Japanese.pkl')

# Attribute columns only: everything from 'AcceptsInsurance' through 'cuisine_Japanese'.
df = d.loc[:,'AcceptsInsurance':'cuisine_Japanese']

# Keep the columns that are non-null in more than 70% of the rows.
non_null_counts = df.count().sort_values()
if_col_keep = 1.0 * non_null_counts / len(df) > 0.7
col_keep = non_null_counts[if_col_keep]

# Distinct prefixes (text before the first '_') of the kept columns.
# NOTE(review): 'cuisine_Chinese' is skipped although this is the Japanese
# data set -- possibly a copy-paste leftover; confirm which column was meant.
col_prefix = list(set(
    c.split('_')[0] for c in col_keep.keys().values if c != 'cuisine_Chinese'
))

# Prefixes excluded by hand; list.remove raises ValueError if one is missing.
delete_col = ['HairSpecializesIn']
for c in delete_col:
    col_prefix.remove(c)

# Every column except the last one whose prefix survived the filters above.
col_with_prefix = [c for c in df.columns[:-1] if c.split('_')[0] in col_prefix]

# Flag businesses whose review_count is above the median review_count.
df1 = df.copy()
review_count = df1.join(d.review_count).review_count
md = review_count.median()
df1['review_count_greater_median'] = review_count > md

# Basic business columns joined (on the index) with the selected attributes.
df_basic = d[[u'categories', u'city', u'hours', u'is_open', u'latitude', u'longitude', u'name', u'neighborhood', u'postal_code', u'review_count', u'stars', u'state']]
df_final = df_basic.join(df1[col_with_prefix+['review_count_greater_median','cuisine_Japanese']])

In [3]:
# Load the pickled spatial labels.
# NOTE(review): unpickling can run arbitrary code -- only load trusted files.
spatial_label = pd.read_pickle('../data_processeing/spatial_labels.pkl')

In [4]:
# Append the spatial labels as extra columns (axis=1); rows are aligned on the index.
df_new = pd.concat([df_final,spatial_label], axis=1)

In [5]:
# Keep rows with stars >= 4 whose spatial_label and cuisine_Japanese both equal 2.
# NOTE(review): what the code 2 stands for in either column is not shown here -- confirm.
df_select = df_new[(df_new['stars'] >= 4) & (df_new['spatial_label'] == 2) & (df_new['cuisine_Japanese'] == 2)]

Spectral Clustering


In [6]:
# Feature matrix: the attribute columns from 'AgesAllowed' through
# 'review_count_greater_median', followed by the star rating.
X = pd.concat(
    [df_select.loc[:, 'AgesAllowed': 'review_count_greater_median'],
     df_select[['stars']]],
    axis=1
)

# Turn the rating into text so that get_dummies one-hot encodes it as well.
X['stars'] = X['stars'].map(str)
X = pd.get_dummies(X, dummy_na=False, drop_first=True)


def true_false(x):
    """Return 1 when ``x`` compares equal to True, otherwise 0."""
    return 1 if x == True else 0


# Encode the boolean flag as 0/1.
X['review_count_greater_median'] = X['review_count_greater_median'].map(true_false)

In [7]:
# Try 2, 3 and 4 spectral clusters. For each setting print the silhouette
# score (cityblock metric) followed by the number of rows carrying each
# label from 0 through 4.
for n_clusters in range(2, 5):
    spectural_clustering = SpectralClustering(
        n_clusters=n_clusters, random_state=100, affinity='sigmoid'
    )
    spectural_clustering.fit(X)
    labels = spectural_clustering.labels_
    print(silhouette_score(X, labels, metric='cityblock'))
    label_list = list(labels)
    for cluster_id in range(5):
        print(label_list.count(cluster_id))


0.357625510143
118
1
0
0
0
0.313062733033
2
116
1
0
0
0.100278545237
112
5
1
1
0

In [8]:
# Final spectral clustering with 2 clusters; labels_ holds one cluster id per row of X.
# NOTE(review): random_state is 999 here but 100 in the cluster-count search loop -- confirm this is intended.
spectural_clustering = SpectralClustering(n_clusters=2, random_state=999, affinity='sigmoid').fit(X)
labels_spectural = spectural_clustering.labels_

In [9]:
# (number of rows, number of columns) of the filtered frame.
df_select.shape


Out[9]:
(119, 67)

KMeans


In [10]:
## function that gets silhouette scores for a clustering method
def get_silhouette_score(X, cluster_method, method='complete'):
    """Print the average silhouette score for 2 through 9 clusters.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to the clustering model and to
        ``silhouette_score``.
    cluster_method : str
        One of ``'kmeans'``, ``'gaussian_mix'`` or ``'hierarchical'``.
    method : str, default 'complete'
        Linkage method forwarded to ``linkage``; only used when
        ``cluster_method`` is ``'hierarchical'``.

    Returns
    -------
    None
        One line per cluster count is printed.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported names.
    """
    # Cluster counts to evaluate.
    range_n_clusters = range(2, 10)

    # Pick the routine that produces cluster labels for a given cluster count.
    if cluster_method == 'kmeans':
        def assign_labels(n_clusters):
            return KMeans(n_clusters=n_clusters, random_state=22).fit(X).labels_

    elif cluster_method == 'gaussian_mix':
        def assign_labels(n_clusters):
            gm = GaussianMixture(n_components=n_clusters, random_state=22).fit(X)
            return gm.predict(X)

    elif cluster_method == 'hierarchical':
        # The linkage matrix does not depend on the cluster count: build it once.
        Z = linkage(X, method)

        def assign_labels(n_clusters):
            return fcluster(Z, n_clusters, criterion='maxclust')

    else:
        # Fail loudly instead of silently printing nothing.
        raise ValueError(
            "unknown cluster_method {!r}; expected 'kmeans', 'gaussian_mix' "
            "or 'hierarchical'".format(cluster_method)
        )

    for n_clusters in range_n_clusters:
        cluster_labels = assign_labels(n_clusters)
        silhouette_avg = silhouette_score(X, cluster_labels, random_state=22)
        print("For n_clusters ={},".format(n_clusters)+"{} - average silhouette_score :{}".\
              format(cluster_method, silhouette_avg))

In [11]:
## choose the number of clusters using silhouette method
# Prints the average silhouette score of KMeans for each cluster count the helper tries.
get_silhouette_score(X, 'kmeans')


For n_clusters =2,kmeans - average silhouette_score :0.198111214924
For n_clusters =3,kmeans - average silhouette_score :0.124831239787
For n_clusters =4,kmeans - average silhouette_score :0.13200666947
For n_clusters =5,kmeans - average silhouette_score :0.127508828854
For n_clusters =6,kmeans - average silhouette_score :0.110961452304
For n_clusters =7,kmeans - average silhouette_score :0.110801032654
For n_clusters =8,kmeans - average silhouette_score :0.111323618817
For n_clusters =9,kmeans - average silhouette_score :0.0983671939625

In [12]:
# Final KMeans model with 2 clusters.
n_clusters = 2

km = KMeans(n_clusters=n_clusters, random_state=22)
km.fit(X)
labels_km = km.labels_

# Report how many rows were assigned to label 0 and to label 1.
for cluster_id in (0, 1):
    print(list(labels_km).count(cluster_id))


88
31

In [13]:
## Distance
# `km` is already fitted on X, so reuse its labels instead of refitting the
# model twice through fit_predict (same data and seed, hence same labels).
scor = km.labels_

# km.transform(X): distance from every row to every cluster centre.
res_p = pd.DataFrame(km.transform(X), columns=list(range(n_clusters)))
res_p["cluster"] = scor

# "score" is the distance from each row to the centre of its own cluster.
centre_distances = res_p[list(range(n_clusters))].values
res_p["score"] = centre_distances[np.arange(len(res_p)), scor]

# Five rows farthest from their own cluster centre.
res_p.sort_values("score", ascending=False)[:5]


Out[13]:
0 1 cluster score
39 3.108302 3.416463 0 3.108302
70 2.950762 3.368922 0 2.950762
35 2.911997 3.541636 0 2.911997
30 2.788401 3.344898 0 2.788401
14 2.856842 2.722894 1 2.722894

In [14]:
# Work on a copy so that columns added to X_df do not end up in X.
X_df = X.copy()

In [15]:
# Attach the KMeans cluster label of every row.
X_df['km'] = labels_km

In [16]:
# Distance of every row to its own cluster centre. `.values` assigns by
# position, so this relies on res_p having the same row order as X_df.
X_df['distance_KM'] = res_p.score.values

In [17]:
## Get Anomalies
def get_anomaly(df, label_col, dist_or_likelihood, thres):
    """Return the rows whose value is an outlier within their own cluster.

    A row is kept when the absolute difference between its
    ``dist_or_likelihood`` value and the mean of its cluster, divided by the
    cluster's standard deviation, is greater than ``thres``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the label column and the value column.
    label_col : str
        Name of the column with the cluster label of each row.
    dist_or_likelihood : str
        Name of the numeric column to test.
    thres : float
        Threshold on the absolute standardised deviation.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df``, grouped by label in order of first
        appearance. Empty (with the columns of ``df``) when nothing is
        selected.
    """
    grouped = df.groupby(label_col)[dist_or_likelihood]
    clustermean = grouped.mean()
    clusterstd = grouped.std()

    # Collect the per-cluster selections and concatenate once at the end,
    # instead of growing a frame from an empty seed inside the loop.
    pieces = []
    # Rows with a missing label belong to no group, so skip NaN labels; they
    # would otherwise raise KeyError on the per-cluster lookups.
    for label in df[label_col].dropna().unique():
        deviation = (df[dist_or_likelihood] - clustermean[label]).abs() / clusterstd[label]
        pieces.append(df[(df[label_col] == label) & (deviation > thres)])

    if not pieces:
        return pd.DataFrame(columns=df.columns)
    return pd.concat(pieces, axis=0)

In [18]:
## get anomalies based on kmeans distance
# Threshold 2: keep rows more than 2 standard deviations from their cluster's mean distance.
km_anomalies = get_anomaly(X_df, 'km', 'distance_KM', 2)

In [19]:
# Centre distances of the rows flagged as KMeans anomalies.
km_anomalies['distance_KM']


Out[19]:
O7ZvoH_ONGNKbXvHDSZlwg    2.722894
ZCzey5aPhd7jYIoHsUfjmQ    2.788401
e6d50rwRTU-fONeSBJmOHQ    2.911997
0Buxoc0cRqjpvkezo3bqog    3.108302
fI9ErCUGY8rXRPBbatcxMA    2.950762
Name: distance_KM, dtype: float64

In [ ]:


In [ ]:

Gaussian Mixture


In [20]:
##### Gaussian Mixture #########

## choose the number of clusters using silhouette method
# Prints the average silhouette score of the Gaussian mixture for each component count the helper tries.
get_silhouette_score(X, 'gaussian_mix')


For n_clusters =2,gaussian_mix - average silhouette_score :0.115937006875
For n_clusters =3,gaussian_mix - average silhouette_score :0.119965109891
For n_clusters =4,gaussian_mix - average silhouette_score :0.146853177694
For n_clusters =5,gaussian_mix - average silhouette_score :0.106963683433
For n_clusters =6,gaussian_mix - average silhouette_score :0.108982386698
For n_clusters =7,gaussian_mix - average silhouette_score :0.112175042316
For n_clusters =8,gaussian_mix - average silhouette_score :0.0755344608829
For n_clusters =9,gaussian_mix - average silhouette_score :0.0862290144781

In [22]:
# Final Gaussian mixture with 4 components and a fixed seed.
k = 4
GM = GaussianMixture(n_components=k, random_state=22).fit(X)

# Hard component assignment for every row of X.
label_gm = GM.predict(X)

In [23]:
# Component sizes: number of rows assigned to each label from 0 through 4.
gm_label_list = list(label_gm)
for component in range(5):
    print(gm_label_list.count(component))


24
17
16
62
0

Isolation Forest


In [24]:
### 3) Isolation Forest
# IsolationForest is imported in the notebook's top import cell.

# fit the model
ISF = IsolationForest(max_samples=100, random_state=22)
ISF.fit(X)

## compute anomaly score of the input. The lower, the more abnormal.
score_isf = ISF.decision_function(X)

In [25]:
# Position of the row with the lowest (most abnormal) isolation-forest score.
score_isf.argmin()


Out[25]:
35

KNN Distance


In [26]:
def dist2knn(x, nn, k):
    """Sum the k+1 smallest Euclidean distances from ``x`` to the rows of ``nn``.

    When ``x`` is itself a row of ``nn`` its zero self-distance is one of the
    k+1 terms, so the result is the summed distance to its k nearest other rows.
    """
    ordered = sorted(
        distance.euclidean(x, nn.iloc[row, :]) for row in range(len(nn))
    )
    return sum(ordered[:k + 1])

In [27]:
# For every row of X, the summed distance to its 5 nearest rows (see dist2knn).
# NOTE(review): every row is compared with every row in pure Python, so the cost grows quadratically with len(X).
dist_sum_knn = []
for i in range(len(X)):
    # Python 2 print with a trailing comma: no newline, and '\r' rewrites the progress percentage in place.
    print '\r{}%'.format(100.0*(i+1)/len(X)),
    dist_sum_knn.append(dist2knn(X.iloc[i,:], X, 5))


100.0%

In [ ]:

Output


In [28]:
# Empty results table that shares the row index of X.
result = pd.DataFrame(index=X.index)

In [29]:
# One column per method. Arrays and lists are assigned by position, so they
# must follow the row order of X, which supplied result's index.
result['clusters_sp'] = labels_spectural
result['clusters_km'] = labels_km
# km_anomalies holds only the flagged rows; assignment aligns on the index, so all other rows get NaN.
result['distance_km'] = km_anomalies['distance_KM']
result['clusters_gm'] = label_gm
result['scores_isf'] = score_isf
result['distance_knn'] = dist_sum_knn

In [30]:
# Display the combined results table.
result


Out[30]:
clusters_sp clusters_km distance_km clusters_gm scores_isf distance_knn
Ld2hhA3q3cdkptwS1fsYEg 0 1 NaN 2 0.067136 9.464102
E_f5hWSBXkxvyckUpU31kQ 1 1 NaN 1 -0.050171 12.230289
gA9hCYY7MYl9oZ3aym5dvw 0 0 NaN 3 0.165696 2.000000
fS9a8AGrcwWPf_9vfn5wPQ 0 0 NaN 3 0.166010 5.000000
_IZyyv0dFfpIy_rI_-nwTQ 0 0 NaN 0 0.117533 6.656854
-ITj6Pu8Gdw8MmLf0XBEKQ 0 0 NaN 3 0.165041 0.000000
kkEqZmVvVkgmCaOqE13mDg 0 1 NaN 2 0.084863 8.928203
KGS8NdjkMzrS_BoBTo5nBA 0 0 NaN 3 0.153479 6.242641
ctzH0uqGz6q_o2nuCrAUvA 0 0 NaN 3 0.132629 7.071068
J7rkQISD48jXgdM6UxNAbw 0 1 NaN 2 0.104192 8.292529
RZFryvOkznd4jtpj8PbSzg 0 0 NaN 3 0.155109 5.000000
Ly2ShApiomYZwKehwb7eRA 0 0 NaN 3 0.115044 7.292529
wa8QgXQu1ZxwPgdRl9lYlg 0 0 NaN 3 0.165041 0.000000
P2GBKrx7dJg3xeGtuU2K-A 0 0 NaN 3 0.152744 5.242641
O7ZvoH_ONGNKbXvHDSZlwg 0 1 2.722894 1 -0.015636 13.580522
mRxUCEoUUe-XsEYS1SlrTA 0 1 NaN 2 0.021353 11.371115
5j2ugUALtjsa2nkV1YGq5Q 0 0 NaN 3 0.139997 6.656854
gRMYm-CeEdjXtaQcVo8vsw 0 0 NaN 3 0.167695 0.000000
dYU5hXVyPsm7C-T-13I_YQ 0 0 NaN 3 0.128427 5.242641
oa52t2Nfb7sOCqJE5M5hzg 0 0 NaN 0 0.119636 6.974691
8PaMGALpL7FwwdUy7eK4ZA 0 1 NaN 1 0.048683 8.878315
TkoyGi8J7YFjA6SbaRzrxg 0 0 NaN 3 0.167695 0.000000
4s_cPDS_AkLP1fXMJPqB2w 0 0 NaN 3 0.163634 5.000000
SIBaomIYVvsMU0GTuqX4vQ 0 0 NaN 0 0.075484 10.236068
KH9XgG2nt0G6bk8-NKkQQQ 0 0 NaN 0 0.141131 6.656854
L6ZPG8lO1tMGmHimSjuEMg 0 0 NaN 3 0.031630 12.247449
fT6Uwl6abRQfiNgxre_qbg 0 0 NaN 3 0.015909 10.000000
evdJO0v9rvVixieNEnaeJg 0 0 NaN 3 0.143754 6.656854
zXjA4Jms2jCAP09EO7Ghfg 0 1 NaN 2 0.073777 8.928203
XVS-zdj4Z6TihagZcZZ7pQ 0 0 NaN 3 0.108696 5.000000
... ... ... ... ... ... ...
CUivTcULsu5MJIYYNVm1zw 0 0 NaN 3 0.077025 9.464102
NJ0RzuWd5xDqfJejYQZ65g 0 0 NaN 0 0.009708 11.607183
dA7ORbBrCdxoMlvn3I6JDA 0 0 NaN 0 0.069254 11.180340
W_2SaN0xzmH0WjScED4a4Q 0 1 NaN 2 0.004568 11.820605
GWnhc3MO4XjsKIpyExV--Q 0 0 NaN 3 0.154119 6.656854
DwP10iEz5LGf3fhcVQZm0Q 0 0 NaN 0 0.016535 10.745523
u2q_84hHvKGl5hKnAE7zNw 0 0 NaN 3 0.146813 5.828427
B-pELAaG6vPfa3eQAUk98A 0 1 NaN 1 0.037356 8.878315
S-oLPRdhlyL5HAknBKTUcQ 0 0 NaN 3 0.114535 8.610366
kO1-ZswCgQak2beKUMaYRw 0 1 NaN 2 0.116140 7.706742
SMXmELJ8jFBPDq3VWRlLeQ 0 0 NaN 3 0.128427 5.242641
Xfod0UWr_9B-TOM6qxVIwQ 0 1 NaN 2 0.082645 8.928203
JQwMRgCMZJhIDxec1z7URQ 0 0 NaN 3 0.143050 6.656854
8CRXSMiyTR9DOMOdrTDRUw 0 0 NaN 3 0.165041 0.000000
2r6UD7ExSMrK0LGeglquDA 0 0 NaN 0 0.079122 9.732051
H_yoEB8N8CeABPuLMFWEzQ 0 0 NaN 3 0.141537 7.388905
YrtAT1B8aFp5dKNss1ICbw 0 0 NaN 3 0.165696 2.000000
ecJri9ozyke4dOCWulZiRQ 0 0 NaN 3 0.151234 6.656854
gBEWJ4b2OvUmN4Oh7ju3hw 0 0 NaN 3 0.122476 6.656854
oZcbn7ENit23xbCsyu9xbA 0 0 NaN 0 0.081784 10.000000
BWgXFbRqrPKci7C1__6BXg 0 1 NaN 2 0.093320 9.196152
U5ZlFP1kBFzlmnaIn1f9sQ 0 0 NaN 3 0.167695 0.000000
gcNC4k7TZJVX_1YHdIkDNQ 0 0 NaN 3 0.151806 5.000000
ivAe-BA1y3DOyRUKHdPnQA 0 0 NaN 3 0.118956 7.706742
K3J4zvnQ_G1d5xfffPf2Wg 0 1 NaN 1 -0.018467 13.032495
MBsIf01fcA8Vy7vCMMKf2A 0 0 NaN 3 0.163736 5.000000
yqYtY3-Po4OVPafA9Z-Xyw 0 0 NaN 0 0.085100 8.342417
XC06xxb7FFYsXcI2E38IIw 0 0 NaN 0 0.072360 10.000000
hNLgrdd0QC3qLbD4tOHHVQ 0 0 NaN 3 0.130741 7.071068
35Z23ZztQCsJdASRbU69Vg 0 0 NaN 3 0.128919 5.000000

119 rows × 6 columns


In [31]:
# Write the results table to CSV in the working directory; index_label=False leaves out the header field for the index column.
result.to_csv('Phoenix_jap_results.csv', index_label=False, encoding='utf-8' )

In [32]:
# Put the business columns and the per-method results side by side (axis=1, aligned on the index).
anomaly_merged = pd.concat([df_select, result], axis=1)

In [35]:
# Rows of df_select that spectral clustering assigned to cluster 1 (the boolean mask is taken from anomaly_merged).
df_select[anomaly_merged['clusters_sp'] == 1]


Out[35]:
categories city hours is_open latitude longitude name neighborhood postal_code review_count ... Music_live Music_no_music Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median cuisine_Japanese spatial_label
E_f5hWSBXkxvyckUpU31kQ [Food Trucks, Food, Restaurants, Sushi Bars] Phoenix [Tuesday 11:0-22:0, Wednesday 11:0-22:0, Thurs... 1 33.451025 -112.133684 Sushi Mocorito 85009 3 ... NaN NaN NaN False NaN NaN NaN False 2 2

1 rows × 67 columns


In [33]:
# KMeans centre distances in ascending order; rows without a value (NaN) are listed last.
anomaly_merged['distance_km'].sort_values()


Out[33]:
O7ZvoH_ONGNKbXvHDSZlwg    2.722894
ZCzey5aPhd7jYIoHsUfjmQ    2.788401
e6d50rwRTU-fONeSBJmOHQ    2.911997
fI9ErCUGY8rXRPBbatcxMA    2.950762
0Buxoc0cRqjpvkezo3bqog    3.108302
Ld2hhA3q3cdkptwS1fsYEg         NaN
E_f5hWSBXkxvyckUpU31kQ         NaN
gA9hCYY7MYl9oZ3aym5dvw         NaN
fS9a8AGrcwWPf_9vfn5wPQ         NaN
_IZyyv0dFfpIy_rI_-nwTQ         NaN
-ITj6Pu8Gdw8MmLf0XBEKQ         NaN
kkEqZmVvVkgmCaOqE13mDg         NaN
KGS8NdjkMzrS_BoBTo5nBA         NaN
ctzH0uqGz6q_o2nuCrAUvA         NaN
J7rkQISD48jXgdM6UxNAbw         NaN
RZFryvOkznd4jtpj8PbSzg         NaN
Ly2ShApiomYZwKehwb7eRA         NaN
wa8QgXQu1ZxwPgdRl9lYlg         NaN
P2GBKrx7dJg3xeGtuU2K-A         NaN
mRxUCEoUUe-XsEYS1SlrTA         NaN
5j2ugUALtjsa2nkV1YGq5Q         NaN
gRMYm-CeEdjXtaQcVo8vsw         NaN
dYU5hXVyPsm7C-T-13I_YQ         NaN
oa52t2Nfb7sOCqJE5M5hzg         NaN
8PaMGALpL7FwwdUy7eK4ZA         NaN
TkoyGi8J7YFjA6SbaRzrxg         NaN
4s_cPDS_AkLP1fXMJPqB2w         NaN
SIBaomIYVvsMU0GTuqX4vQ         NaN
KH9XgG2nt0G6bk8-NKkQQQ         NaN
L6ZPG8lO1tMGmHimSjuEMg         NaN
                            ...   
CUivTcULsu5MJIYYNVm1zw         NaN
NJ0RzuWd5xDqfJejYQZ65g         NaN
dA7ORbBrCdxoMlvn3I6JDA         NaN
W_2SaN0xzmH0WjScED4a4Q         NaN
GWnhc3MO4XjsKIpyExV--Q         NaN
DwP10iEz5LGf3fhcVQZm0Q         NaN
u2q_84hHvKGl5hKnAE7zNw         NaN
B-pELAaG6vPfa3eQAUk98A         NaN
S-oLPRdhlyL5HAknBKTUcQ         NaN
kO1-ZswCgQak2beKUMaYRw         NaN
SMXmELJ8jFBPDq3VWRlLeQ         NaN
Xfod0UWr_9B-TOM6qxVIwQ         NaN
JQwMRgCMZJhIDxec1z7URQ         NaN
8CRXSMiyTR9DOMOdrTDRUw         NaN
2r6UD7ExSMrK0LGeglquDA         NaN
H_yoEB8N8CeABPuLMFWEzQ         NaN
YrtAT1B8aFp5dKNss1ICbw         NaN
ecJri9ozyke4dOCWulZiRQ         NaN
gBEWJ4b2OvUmN4Oh7ju3hw         NaN
oZcbn7ENit23xbCsyu9xbA         NaN
BWgXFbRqrPKci7C1__6BXg         NaN
U5ZlFP1kBFzlmnaIn1f9sQ         NaN
gcNC4k7TZJVX_1YHdIkDNQ         NaN
ivAe-BA1y3DOyRUKHdPnQA         NaN
K3J4zvnQ_G1d5xfffPf2Wg         NaN
MBsIf01fcA8Vy7vCMMKf2A         NaN
yqYtY3-Po4OVPafA9Z-Xyw         NaN
XC06xxb7FFYsXcI2E38IIw         NaN
hNLgrdd0QC3qLbD4tOHHVQ         NaN
35Z23ZztQCsJdASRbU69Vg         NaN
Name: distance_km, dtype: float64

In [36]:
# Summed nearest-neighbour distances in ascending order; the largest values are at the end.
anomaly_merged['distance_knn'].sort_values()


Out[36]:
8CRXSMiyTR9DOMOdrTDRUw     0.000000
kImf4ivgHInr7kTTJWaVhg     0.000000
LtNgP4FqXp5nMFOHErK8cw     0.000000
U5ZlFP1kBFzlmnaIn1f9sQ     0.000000
XqRY9T8s0JhdehfSmXhCxw     0.000000
-ITj6Pu8Gdw8MmLf0XBEKQ     0.000000
_reykENmCBh03iczpFAufg     0.000000
pBoWsom3mZFYo9EucIWRDg     0.000000
DEeGnshpgUOIOjhq_dd5Ug     0.000000
wa8QgXQu1ZxwPgdRl9lYlg     0.000000
gRMYm-CeEdjXtaQcVo8vsw     0.000000
TkoyGi8J7YFjA6SbaRzrxg     0.000000
YrtAT1B8aFp5dKNss1ICbw     2.000000
EB0vzUuimM2nRXND5VJkIw     2.000000
gA9hCYY7MYl9oZ3aym5dvw     2.000000
a1KXovXPdeDurOrER4ST6A     2.000000
XVS-zdj4Z6TihagZcZZ7pQ     5.000000
35Z23ZztQCsJdASRbU69Vg     5.000000
R_M4P9XetEM-aLE7eHdthw     5.000000
xlb4QVFWB9CSPT7qNRn5xA     5.000000
R-Jmd1zLAyGsN5uP_hJx3g     5.000000
V6X1T1DdfUVNIk3AmispDg     5.000000
bkZVphD4miecxL2PalLi8w     5.000000
G4LzA_UqrzcZ5gAyWWP2sw     5.000000
gcNC4k7TZJVX_1YHdIkDNQ     5.000000
MBsIf01fcA8Vy7vCMMKf2A     5.000000
4s_cPDS_AkLP1fXMJPqB2w     5.000000
RZFryvOkznd4jtpj8PbSzg     5.000000
fS9a8AGrcwWPf_9vfn5wPQ     5.000000
P2GBKrx7dJg3xeGtuU2K-A     5.242641
                            ...    
GnAvR67XtnBdCuLDuulQwA     9.968119
fT6Uwl6abRQfiNgxre_qbg    10.000000
XC06xxb7FFYsXcI2E38IIw    10.000000
oZcbn7ENit23xbCsyu9xbA    10.000000
SIBaomIYVvsMU0GTuqX4vQ    10.236068
erpMXkhBRNAwfXYfqIaC6Q    10.440255
5O02gK9VpnLZNC-aFnp6lQ    10.708204
DwP10iEz5LGf3fhcVQZm0Q    10.745523
rWmZAJMZ8NBw-_FbOYPnfA    10.785329
FsCujpVh9Za2Dl5MIYLCxA    11.157694
dA7ORbBrCdxoMlvn3I6JDA    11.180340
mRxUCEoUUe-XsEYS1SlrTA    11.371115
1qkKfqhO8z2XMzLLDFE96Q    11.371115
GA_Nx4xA3Z4pn9i5XKF1Wg    11.393762
NJ0RzuWd5xDqfJejYQZ65g    11.607183
GUyrT0FO-YBAmvGNH0653w    11.780799
3HU3V2AYuVUxdd_Q5Nv5Vw    11.820605
W_2SaN0xzmH0WjScED4a4Q    11.820605
yCK1Ok69D7TSWJESZQZHiQ    11.820605
4_XrzTImR7calqcE6Otyaw    12.034027
fI9ErCUGY8rXRPBbatcxMA    12.166309
8QfDxSuHkXA-sv5wcfP4Zg    12.173322
E_f5hWSBXkxvyckUpU31kQ    12.230289
L6ZPG8lO1tMGmHimSjuEMg    12.247449
PDxLOAtYqOz6z7M7h0LRoA    12.443710
K3J4zvnQ_G1d5xfffPf2Wg    13.032495
O7ZvoH_ONGNKbXvHDSZlwg    13.580522
ZCzey5aPhd7jYIoHsUfjmQ    13.776784
e6d50rwRTU-fONeSBJmOHQ    14.313708
0Buxoc0cRqjpvkezo3bqog    14.809837
Name: distance_knn, dtype: float64

In [42]:
# Look up the categories of one business by its index id.
anomaly_merged.loc['e6d50rwRTU-fONeSBJmOHQ', 'categories']


Out[42]:
[u'Nightlife',
 u'Cocktail Bars',
 u'Asian Fusion',
 u'Burgers',
 u'Restaurants',
 u'Bars',
 u'Japanese']

In [38]:
# Isolation-forest scores in ascending order; the lowest (most abnormal) come first.
anomaly_merged['scores_isf'].sort_values()


Out[38]:
e6d50rwRTU-fONeSBJmOHQ   -0.087289
8QfDxSuHkXA-sv5wcfP4Zg   -0.059991
0Buxoc0cRqjpvkezo3bqog   -0.059496
E_f5hWSBXkxvyckUpU31kQ   -0.050171
fI9ErCUGY8rXRPBbatcxMA   -0.037297
PDxLOAtYqOz6z7M7h0LRoA   -0.028772
ZCzey5aPhd7jYIoHsUfjmQ   -0.027958
GUyrT0FO-YBAmvGNH0653w   -0.023175
K3J4zvnQ_G1d5xfffPf2Wg   -0.018467
O7ZvoH_ONGNKbXvHDSZlwg   -0.015636
FsCujpVh9Za2Dl5MIYLCxA   -0.002797
3HU3V2AYuVUxdd_Q5Nv5Vw    0.001810
W_2SaN0xzmH0WjScED4a4Q    0.004568
1qkKfqhO8z2XMzLLDFE96Q    0.008132
rWmZAJMZ8NBw-_FbOYPnfA    0.008444
NJ0RzuWd5xDqfJejYQZ65g    0.009708
yCK1Ok69D7TSWJESZQZHiQ    0.009708
GA_Nx4xA3Z4pn9i5XKF1Wg    0.012851
fT6Uwl6abRQfiNgxre_qbg    0.015909
DwP10iEz5LGf3fhcVQZm0Q    0.016535
mRxUCEoUUe-XsEYS1SlrTA    0.021353
4_XrzTImR7calqcE6Otyaw    0.027445
3vta1BSPwdSulcCFMpBjDw    0.028404
L6ZPG8lO1tMGmHimSjuEMg    0.031630
B-pELAaG6vPfa3eQAUk98A    0.037356
8PaMGALpL7FwwdUy7eK4ZA    0.048683
RWAgNm7kFVhr69XEZsccYw    0.055319
z9KxcVoe6tQGyB_Zy0Umcg    0.057540
L7rjzj2g2EEbHfee9V-X2Q    0.058574
jCg6MSfu3fgXxO2QrpDV7w    0.063499
                            ...   
ecJri9ozyke4dOCWulZiRQ    0.151234
gcNC4k7TZJVX_1YHdIkDNQ    0.151806
xlb4QVFWB9CSPT7qNRn5xA    0.151966
R1oI13c1oNxE91wkjYDeow    0.152744
P2GBKrx7dJg3xeGtuU2K-A    0.152744
KGS8NdjkMzrS_BoBTo5nBA    0.153479
GWnhc3MO4XjsKIpyExV--Q    0.154119
RZFryvOkznd4jtpj8PbSzg    0.155109
G4LzA_UqrzcZ5gAyWWP2sw    0.161855
R-Jmd1zLAyGsN5uP_hJx3g    0.163135
4s_cPDS_AkLP1fXMJPqB2w    0.163634
MBsIf01fcA8Vy7vCMMKf2A    0.163736
R_M4P9XetEM-aLE7eHdthw    0.164743
wa8QgXQu1ZxwPgdRl9lYlg    0.165041
-ITj6Pu8Gdw8MmLf0XBEKQ    0.165041
kImf4ivgHInr7kTTJWaVhg    0.165041
pBoWsom3mZFYo9EucIWRDg    0.165041
LtNgP4FqXp5nMFOHErK8cw    0.165041
8CRXSMiyTR9DOMOdrTDRUw    0.165041
YrtAT1B8aFp5dKNss1ICbw    0.165696
EB0vzUuimM2nRXND5VJkIw    0.165696
gA9hCYY7MYl9oZ3aym5dvw    0.165696
a1KXovXPdeDurOrER4ST6A    0.165696
fS9a8AGrcwWPf_9vfn5wPQ    0.166010
XqRY9T8s0JhdehfSmXhCxw    0.167695
TkoyGi8J7YFjA6SbaRzrxg    0.167695
_reykENmCBh03iczpFAufg    0.167695
gRMYm-CeEdjXtaQcVo8vsw    0.167695
U5ZlFP1kBFzlmnaIn1f9sQ    0.167695
DEeGnshpgUOIOjhq_dd5Ug    0.167695
Name: scores_isf, dtype: float64

In [ ]: