In [14]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster   # flat cluster
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture

%matplotlib inline

In [5]:
# Load the two input tables from pickle files (paths are relative to the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code - only load pickle files that come from a trusted source.
# df_city_label: one `spatial_label` value per row, indexed by an id string (see the head() output below).
df_city_label = pd.read_pickle('../data_processeing/spatial_labels.pkl')
# df_features: the per-row attribute table that the clustering features are selected from.
df_features = pd.read_pickle('./df_1518.pkl')

In [6]:
df_city_label.head(3)


Out[6]:
spatial_label
EDqCEAGXVGCH4FJXgqtjqg 3
GDnbt3isfhd57T1QqU6flg 2
a1Ba6XeIOP48e64YFD0dMw 6

In [12]:
df_features.head(1)


Out[12]:
categories city hours is_open latitude longitude name neighborhood postal_code review_count ... Music_karaoke Music_live Music_no_music Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg [Restaurants, Pizza, Chicken Wings, Italian] Toronto [Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ... 1 43.661054 -79.429089 Pizza Pizza Dufferin Grove M6H 1L5 7 ... NaN NaN False False False False False False False 1

1 rows × 66 columns


In [10]:
## Keep the rows with spatial label 3, cuisine_Chinese == 2 and at least 4 stars
df = df_features[
    (df_city_label['spatial_label'] == 3)
    & df_features['cuisine_Chinese'].eq(2)
    & df_features['stars'].ge(4)
]

In [11]:
df.shape


Out[11]:
(235, 66)

In [13]:
df.head(3)


Out[13]:
categories city hours is_open latitude longitude name neighborhood postal_code review_count ... Music_karaoke Music_live Music_no_music Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median cuisine_Chinese
KeQ1cK564cL5C_hBTFrqnA [Delis, Chinese, Indian, Seafood, Restaurants,... Mississauga [Monday 11:30-0:0, Tuesday 17:30-0:0, Wednesda... 1 43.616083 -79.617576 Desi Bar & Grill Mississauga Valley L4W 4C3 51 ... NaN False True False True False False False True 2
f5xm2RiwLv0gbmXU4BkrGA [Hot Pot, Restaurants, Chinese] Markham [Monday 12:0-23:0, Tuesday 12:0-23:0, Wednesda... 1 43.860726 -79.304713 Lion Pavilion Hot Pot Unionville L3R 0W4 4 ... NaN NaN NaN NaN True NaN NaN NaN False 2
q0oPX1DXW86QytTvvrD9MA [Southern, Specialty Food, Sri Lankan, Indian,... Toronto [Tuesday 11:30-22:0, Wednesday 11:30-22:0, Thu... 1 43.836384 -79.251047 Ceylon Flavor Scarborough M1X 0A5 12 ... NaN NaN True False True False False True False 2

3 rows × 66 columns


In [ ]:
## Create feature "review_count_greater_median_local" using local scores
# df['review_count_greater_median_local'] =

In [35]:
### Define the feature table by picking the attribute columns
# Select the column range and replace NaN with False in a single chained step.
# The non-inplace fillna returns a new, independent frame, so this cell no
# longer mutates a slice of `df` (SettingWithCopy risk) and can be re-run
# safely; later cells can add columns to df_res without touching `df`.
df_res = df.loc[:, 'AgesAllowed':'review_count_greater_median'].fillna(False)

In [38]:
df_res.head(2)


Out[38]:
AgesAllowed Ambience_casual Ambience_classy Ambience_divey Ambience_hipster Ambience_intimate Ambience_romantic Ambience_touristy Ambience_trendy Ambience_upscale ... Music_jukebox Music_karaoke Music_live Music_no_music Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median
KeQ1cK564cL5C_hBTFrqnA False False full_bar True average False casual True False True ... False False False True False True False False False True
f5xm2RiwLv0gbmXU4BkrGA False False beer_and_wine True average False casual False False False ... False False False False False True False False False False

2 rows × 53 columns


In [37]:
## One-hot encode the string-valued attributes; the first level of each is dropped
# NOTE(review): the generated dummy names shown below (e.g. Ambience_hipster_loud,
# BusinessParking_lot_free) suggest these columns may hold values that belong to
# other attributes - confirm the column labels produced upstream.
X = pd.get_dummies(
    df_res,
    columns=['Ambience_classy', 'Ambience_hipster', 'Ambience_romantic', 'BusinessParking_lot'],
    drop_first=True,
)

In [40]:
X.head(2)


Out[40]:
AgesAllowed Ambience_casual Ambience_divey Ambience_intimate Ambience_touristy Ambience_trendy Ambience_upscale BYOB BYOBCorkage BestNights_friday ... Ambience_classy_none Ambience_hipster_average Ambience_hipster_loud Ambience_hipster_quiet Ambience_hipster_very_loud Ambience_romantic_casual Ambience_romantic_dressy Ambience_romantic_formal BusinessParking_lot_free BusinessParking_lot_paid
KeQ1cK564cL5C_hBTFrqnA False False True False True False True True True False ... 0 1 0 0 0 1 0 0 1 0
f5xm2RiwLv0gbmXU4BkrGA False False True False False False False True False False ... 0 1 0 0 0 1 0 0 1 0

2 rows × 61 columns


In [131]:
## Container that collects the anomaly results of the methods below
dic_anomaly = dict()

In [41]:
## function that gets silhouette scores for a clustering method
def get_silhouette_score(X, cluster_method, method='complete'):
    """Print the average silhouette score for 2 to 9 clusters.

    Parameters
    ----------
    X : array-like
        Feature matrix handed unchanged to the clustering routine and to
        ``silhouette_score``.
    cluster_method : str
        One of ``'kmeans'``, ``'gaussian_mix'`` or ``'hierarchical'``.
    method : str, default 'complete'
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``; only
        used when ``cluster_method == 'hierarchical'``.

    Returns
    -------
    None
        The scores are printed, one line per candidate number of clusters.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported names (previously
        the function silently did nothing in that case).
    """
    # Pick the routine that turns a requested cluster count into labels.
    # The scoring loop below is shared by all three methods.
    if cluster_method == 'kmeans':
        def labels_for(n_clusters):
            return KMeans(n_clusters=n_clusters, random_state=22).fit(X).labels_
    elif cluster_method == 'gaussian_mix':
        def labels_for(n_clusters):
            gm = GaussianMixture(n_components=n_clusters, random_state=22).fit(X)
            return gm.predict(X)
    elif cluster_method == 'hierarchical':
        # The linkage matrix does not depend on the cut, so build it once.
        Z = linkage(X, method)

        def labels_for(n_clusters):
            # 'maxclust' asks for at most n_clusters flat clusters.
            return fcluster(Z, n_clusters, criterion='maxclust')
    else:
        raise ValueError(
            "Unknown cluster_method {!r}; expected 'kmeans', 'gaussian_mix' "
            "or 'hierarchical'".format(cluster_method))

    # Range of cluster counts to try.
    for n_clusters in range(2, 10):
        cluster_labels = labels_for(n_clusters)
        silhouette_avg = silhouette_score(X, cluster_labels, random_state=22)
        print("For n_clusters ={},{} - average silhouette_score :{}".format(
            n_clusters, cluster_method, silhouette_avg))

Kmeans Clustering


In [42]:
##### Kmeans #########

## choose the number of clusters using silhouette method
get_silhouette_score(X, 'kmeans')


For n_clusters =2,kmeans - average silhouette_score :0.146589726757
For n_clusters =3,kmeans - average silhouette_score :0.122521468185
For n_clusters =4,kmeans - average silhouette_score :0.106126776448
For n_clusters =5,kmeans - average silhouette_score :0.115471525042
For n_clusters =6,kmeans - average silhouette_score :0.099969820901
For n_clusters =7,kmeans - average silhouette_score :0.111450339514
For n_clusters =8,kmeans - average silhouette_score :0.112652640613
For n_clusters =9,kmeans - average silhouette_score :0.09483628879

In [44]:
##### Kmeans clustering -- k=2 (the cluster count with the highest average silhouette score above)

k = 2  # number of clusters

#train the model.
km=KMeans(n_clusters=k, random_state=22)
# fit() returns the fitted estimator, so `res` and `km` refer to the same model
res=km.fit(X)
#result: one cluster label per row of X
print(res.labels_)


[1 0 1 0 1 1 1 0 1 0 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 1
 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 1 1
 1 1 1 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 1 0 0 1 0 1 0 0
 0 1 1 0 1 0 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 0 0 0 1 0 1
 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 1
 0 0 1 1 1 0 1 0 0 0 1 0 0]

In [56]:
## Attach the k-means label of every row to df_res
df_res['km'] = res.labels_

## Number of rows in each k-means cluster
df_res['AgesAllowed'].groupby(df_res['km']).count()


Out[56]:
km
0    116
1    119
Name: AgesAllowed, dtype: int64

In [75]:
## Distance of every row to the centre of its own k-means cluster
# Fit once and reuse the labels; the original cell refitted the model a second
# time only to obtain the same labels again.
scor = km.fit_predict(X)  # cluster label per row

# km.transform(X) gives the distance of each row to each cluster centre
dist_to_centers = km.transform(X)

res_p = pd.DataFrame(dist_to_centers, columns=list(range(k)))
res_p["cluster"] = scor
# "score" = distance to the centre of the assigned cluster (vectorised lookup
# instead of a row-wise apply)
res_p["score"] = dist_to_centers[np.arange(len(scor)), scor]

# the five rows that lie farthest from their own cluster centre
res_p.sort_values("score", ascending=False)[:5]


Out[75]:
0 1 cluster score
223 3.145760 3.311927 0 3.145760
120 2.959371 3.452309 0 2.959371
188 3.096040 2.932424 1 2.932424
179 2.825246 3.198337 0 2.825246
129 3.112702 2.822917 1 2.822917

In [69]:
len(res_p)


Out[69]:
235

In [79]:
# Positional copy of the own-cluster distance (res_p has a fresh 0..n-1 index, so use the raw values)
df_res['distance_KM'] = res_p['score'].values

In [80]:
df_res.head(2)


Out[80]:
AgesAllowed Ambience_casual Ambience_classy Ambience_divey Ambience_hipster Ambience_intimate Ambience_romantic Ambience_touristy Ambience_trendy Ambience_upscale ... Music_live Music_no_music Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median km distance_KM
KeQ1cK564cL5C_hBTFrqnA False False full_bar True average False casual True False True ... False True False True False False False True 1 2.575407
f5xm2RiwLv0gbmXU4BkrGA False False beer_and_wine True average False casual False False False ... False False False True False False False False 0 2.413885

2 rows × 55 columns


In [167]:
# Mean own-cluster distance for each k-means cluster
clustermean = df_res['distance_KM'].groupby(df_res['km']).mean()
clustermean


Out[167]:
km
0    2.018185
1    2.093505
Name: distance_KM, dtype: float64

In [139]:
## Get Anomalies
def get_anomaly(df, label_col, dist_or_likelihood, thres):
    """Return the rows whose score deviates strongly from their cluster mean.

    For every cluster in ``label_col`` the column ``dist_or_likelihood`` is
    standardised with that cluster's mean and (sample) standard deviation;
    rows whose absolute standardised value exceeds ``thres`` are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the cluster label and the score column.
    label_col : str
        Name of the column with the cluster labels.
    dist_or_likelihood : str
        Name of the numeric column to test (e.g. a distance or likelihood).
    thres : float
        Threshold on the absolute deviation, in cluster standard deviations.

    Returns
    -------
    pandas.DataFrame
        The anomalous rows, grouped by cluster in order of first appearance
        of each label, with the same columns and dtypes as ``df``. Empty if
        nothing exceeds the threshold.
    """
    per_cluster = df.groupby(label_col)[dist_or_likelihood]
    clustermean = per_cluster.mean()
    clusterstd = per_cluster.std()

    # Collect the per-cluster slices and concatenate once at the end. Seeding
    # the result with an empty frame and concatenating inside the loop is
    # quadratic and degrades column dtypes (integer labels turned into floats).
    parts = []
    for label in df[label_col].dropna().unique():
        deviation = (df[dist_or_likelihood] - clustermean[label]).abs() / clusterstd[label]
        # A cluster with a single row has std NaN, so the comparison is False
        # and that row is never flagged.
        parts.append(df[(df[label_col] == label) & (deviation > thres)])

    if not parts:
        # No clusters at all: return an empty frame with the input's schema.
        return df.iloc[0:0].copy()
    return pd.concat(parts, axis=0)

In [143]:
## Rows lying more than 2 cluster standard deviations from their cluster's mean k-means distance
km_anomalies = get_anomaly(
    df=df_res,
    label_col='km',
    dist_or_likelihood='distance_KM',
    thres=2,
)

In [144]:
km_anomalies


Out[144]:
AgesAllowed Ambience_casual Ambience_classy Ambience_divey Ambience_hipster Ambience_intimate Ambience_romantic Ambience_touristy Ambience_trendy Ambience_upscale ... Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median km distance_KM loglihood_GM gm
dy0O7DaSUd49NZWSEtvz_g False False full_bar False average False casual False False True ... False True False False False True 1.0 2.744428 156.595322 1.0
td9FZybutwNG7DgocHCiXA True False full_bar False quiet False casual False False True ... False True True False False True 1.0 2.776392 107.297061 1.0
bGuxRBRKv7i1BKKfGbjxEw False False full_bar True average False casual False False True ... True True False False False True 1.0 2.822917 107.308758 1.0
GkY6UWWn0Fz2ehcuBp66pg False False none False average False casual False False True ... False True False False True True 1.0 1.397158 178.857458 1.0
Y3zHWwGWjkdbCPGR7pF6tQ True False full_bar True average False casual False False True ... False True False False False False 1.0 2.747488 152.775213 1.0
D9bf5U0y_0MnL9wMFc4bKQ False False full_bar False average False casual False False True ... False True False False False True 1.0 2.771848 139.124438 1.0
B5JCODx4728Ce2Qca1NFHw False False full_bar False average False casual False False True ... False True False False False True 1.0 2.932424 131.212457 1.0
snw9iNNLpFYZeHotW00uVA False False full_bar False average False dressy False False True ... False True False False False True 1.0 2.733690 107.291214 1.0
E0iHvHraTa-t6ka9rYL8uQ False False full_bar True loud False casual False False True ... False False False False False False 0.0 2.959371 117.787904 0.0
1pttL4MkpxOL6Mj2azOjVQ False False full_bar False quiet False dressy False False True ... False False True False False True 0.0 2.825246 122.358286 0.0
ZDDzKXN_JXe7694zePRuTQ False False none False loud False casual False False True ... True False False False True True 0.0 2.806879 122.365379 0.0
gtcsOodbmk4E0TulYHnlHA False False full_bar False loud False casual False False True ... False False False False False True 0.0 3.145760 117.789582 0.0

12 rows × 57 columns


In [ ]:
# Store the k-means anomalies in the result dictionary under the key 'km'
# (the previous `km_anomalies[]` was a syntax error and also rebound the dictionary name)
dic_anomaly['km'] = km_anomalies

In [130]:
len(km_anomalies)


Out[130]:
11

Gaussian Mixture


In [132]:
##### Gaussian Mixture #########

## choose the number of clusters using silhouette method
get_silhouette_score(X, 'gaussian_mix')


For n_clusters =2,gaussian_mix - average silhouette_score :0.113560796581
For n_clusters =3,gaussian_mix - average silhouette_score :0.108171627662
For n_clusters =4,gaussian_mix - average silhouette_score :0.0982821936292
For n_clusters =5,gaussian_mix - average silhouette_score :0.0994682061542
For n_clusters =6,gaussian_mix - average silhouette_score :0.0901306843123
For n_clusters =7,gaussian_mix - average silhouette_score :0.0912426962608
For n_clusters =8,gaussian_mix - average silhouette_score :0.0866085380016
For n_clusters =9,gaussian_mix - average silhouette_score :0.0935505965569

In [170]:
# Gaussian mixture with k components; fit() returns the fitted estimator
GM = GaussianMixture(n_components=k, random_state=22).fit(X)

# Component assignment of every row, stored next to the k-means labels
label_gm = GM.predict(X)
df_res['gm'] = label_gm

In [174]:
#df_res.drop('score_IsoForest', inplace=True, axis=1)

In [175]:
df_res.head(2)


Out[175]:
AgesAllowed Ambience_casual Ambience_classy Ambience_divey Ambience_hipster Ambience_intimate Ambience_romantic Ambience_touristy Ambience_trendy Ambience_upscale ... Music_no_music Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median km distance_KM gm
KeQ1cK564cL5C_hBTFrqnA False False full_bar True average False casual True False True ... True False True False False False True 1 2.575407 1
f5xm2RiwLv0gbmXU4BkrGA False False beer_and_wine True average False casual False False False ... False False True False False False False 0 2.413885 1

2 rows × 56 columns


In [176]:
## counts of each label
df_res.groupby('gm')['AgesAllowed'].count()


Out[176]:
gm
0     82
1    153
Name: AgesAllowed, dtype: int64

Hierarchical Clustering


In [177]:
## choose the number of clusters using silhouette method
get_silhouette_score(X, 'hierarchical')


For n_clusters =2,hierarchical - average silhouette_score :0.110174011918
For n_clusters =3,hierarchical - average silhouette_score :0.0689252429131
For n_clusters =4,hierarchical - average silhouette_score :0.0838649952624
For n_clusters =5,hierarchical - average silhouette_score :0.0790014552213
For n_clusters =6,hierarchical - average silhouette_score :0.0790014552213
For n_clusters =7,hierarchical - average silhouette_score :0.0650105650991
For n_clusters =8,hierarchical - average silhouette_score :0.0672239802247
For n_clusters =9,hierarchical - average silhouette_score :0.0672239802247

In [178]:
# Complete-linkage hierarchy on X, cut into at most k flat clusters
# (2 clusters gave the highest average silhouette score above)
Zc = linkage(X, method='complete')
hir_comp_labels = fcluster(Zc, t=k, criterion='maxclust')

# keep the flat cluster ids next to the other labels
df_res['hir'] = hir_comp_labels

In [179]:
df_res.head(3)


Out[179]:
AgesAllowed Ambience_casual Ambience_classy Ambience_divey Ambience_hipster Ambience_intimate Ambience_romantic Ambience_touristy Ambience_trendy Ambience_upscale ... Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median km distance_KM gm hir
KeQ1cK564cL5C_hBTFrqnA False False full_bar True average False casual True False True ... False True False False False True 1 2.575407 1 1
f5xm2RiwLv0gbmXU4BkrGA False False beer_and_wine True average False casual False False False ... False True False False False False 0 2.413885 1 2
q0oPX1DXW86QytTvvrD9MA False False full_bar True quiet False casual False False True ... False True False False True False 1 2.484057 1 2

3 rows × 57 columns


In [180]:
## counts of each label
df_res.groupby('hir')['AgesAllowed'].count()


Out[180]:
hir
1    105
2    130
Name: AgesAllowed, dtype: int64

In [182]:
### Output: keep only the three cluster-label columns
df_cluster_labels = df_res[['km', 'gm', 'hir']]

In [183]:
df_cluster_labels.head(3)


Out[183]:
km gm hir
KeQ1cK564cL5C_hBTFrqnA 1 1 1
f5xm2RiwLv0gbmXU4BkrGA 0 1 2
q0oPX1DXW86QytTvvrD9MA 1 1 2

In [185]:
## Save the cluster labels (km, gm, hir) as a pickle file; the relative path resolves against the current working directory
df_cluster_labels.to_pickle('Clustering_km_gm_hir_Toronto_Chinese.pkl')

Anomaly Detection

Isolation Forest


In [1]:
### 3) Isolation Forest
# IsolationForest is imported in the notebook's top import cell.
# This cell needs the feature matrix X and df_res from the cells above; the
# recorded NameError came from running it on a fresh kernel before they existed.

# fit the model
ISF = IsolationForest(max_samples=100, random_state=22)
ISF.fit(X)

## compute anomaly score of the input. The lower, the more abnormal.
score_isf = ISF.decision_function(X)

# Keep the score next to the cluster labels so that the `score_IsoForest`
# column displayed below is produced by the notebook itself.
df_res['score_IsoForest'] = score_isf



NameErrorTraceback (most recent call last)
<ipython-input-1-7d9b1cfc7d9d> in <module>()
      4 # fit the model
      5 ISF = IsolationForest(max_samples=100, random_state=22)
----> 6 ISF.fit(X)
      7 
      8 ## compute anomaly score of the input. The lower, the more abnormal.

NameError: name 'X' is not defined

In [169]:
df_res.head(3)


Out[169]:
AgesAllowed Ambience_casual Ambience_classy Ambience_divey Ambience_hipster Ambience_intimate Ambience_romantic Ambience_touristy Ambience_trendy Ambience_upscale ... OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median km distance_KM loglihood_GM gm score_IsoForest
KeQ1cK564cL5C_hBTFrqnA False False full_bar True average False casual True False True ... True False False False True 1 2.575407 131.006356 1 -0.028755
f5xm2RiwLv0gbmXU4BkrGA False False beer_and_wine True average False casual False False False ... True False False False False 0 2.413885 162.879369 1 0.008594
q0oPX1DXW86QytTvvrD9MA False False full_bar True quiet False casual False False True ... True False False True False 1 2.484057 164.455935 1 0.016512

3 rows × 58 columns

Interpretation

`decision_function` returns an anomaly score for each input sample: the lower the score, the more abnormal the sample.


In [ ]: