In [14]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster # flat cluster
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture
%matplotlib inline
In [5]:
# Load the spatial-label table and the feature table from pickle files.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
# NOTE(review): 'data_processeing' looks misspelled -- confirm it matches the directory name on disk.
df_city_label = pd.read_pickle('../data_processeing/spatial_labels.pkl')
df_features = pd.read_pickle('./df_1518.pkl')
In [6]:
# Preview the first 3 rows of the spatial-label table.
df_city_label.head(3)
Out[6]:
In [12]:
# Preview the first row of the feature table (shows the available columns).
df_features.head(1)
Out[12]:
In [10]:
## Label: 3, cuisine_Chinese: 2, Star:>= 4
# Each condition is built as its own boolean mask and the three are AND-ed.
# NOTE(review): the first mask comes from df_city_label while the other two
# come from df_features -- this assumes both frames share the same index;
# TODO confirm.
in_spatial_label_3 = df_city_label.spatial_label == 3
is_cuisine_chinese_2 = df_features['cuisine_Chinese'] == 2
has_min_4_stars = df_features['stars'] >= 4
df = df_features[in_spatial_label_3 & is_cuisine_chinese_2 & has_min_4_stars]
In [11]:
# (rows, columns) of the filtered subset.
df.shape
Out[11]:
In [13]:
# Preview the first 3 rows of the filtered subset.
df.head(3)
Out[13]:
In [ ]:
## Create feature "review_count_greater_median_local" using local scores
# df['review_count_greater_median_local'] =
In [35]:
### define X by picking Features
# Label-based column slice: every column from 'AgesAllowed' through
# 'review_count_greater_median' (both endpoints included).
# NaN is replaced with False via a non-inplace fillna, so df_res is a new,
# independent frame. Mutating a slice of the filtered `df` in place would
# raise SettingWithCopyWarning here and again when columns are added later.
# NOTE(review): fillna(False) is applied to every selected column -- confirm
# that no numeric column in this range should keep its NaN values.
df_res = df.loc[:, 'AgesAllowed' : 'review_count_greater_median'].fillna(False)
In [38]:
# Preview the selected feature columns after NaN replacement.
df_res.head(2)
Out[38]:
In [37]:
## One-hot encode the categorical attributes.
# drop_first=True drops the first level of each encoded column.
dummy_columns = [
    'Ambience_classy',
    'Ambience_hipster',
    'Ambience_romantic',
    'BusinessParking_lot',
]
X = pd.get_dummies(df_res, columns=dummy_columns, drop_first=True)
In [40]:
# Preview the encoded feature matrix used by all clustering models below.
X.head(2)
Out[40]:
In [131]:
## Container that collects the anomaly results
dic_anomaly = dict()
In [41]:
## function that gets silhouette scores for a clustering method
def get_silhouette_score(X, cluster_method, method='complete', range_n_clusters=range(2, 10)):
    """Print the average silhouette score for each candidate cluster count.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to the clustering model and to silhouette_score.
    cluster_method : str
        Which clustering to evaluate: 'kmeans', 'gaussian_mix' or
        'hierarchical'.
    method : str, default 'complete'
        Linkage method passed to scipy's ``linkage``; only used when
        ``cluster_method == 'hierarchical'``.
    range_n_clusters : iterable of int, default range(2, 10)
        Cluster counts to try.

    Returns
    -------
    None
        Results are printed, one line per cluster count.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported names (previously
        such a call silently did nothing).
    """
    # Pick the labelling strategy once; the scoring/printing loop is shared.
    if cluster_method == 'kmeans':
        def labels_for(n_clusters):
            return KMeans(n_clusters=n_clusters, random_state=22).fit(X).labels_
    elif cluster_method == 'gaussian_mix':
        def labels_for(n_clusters):
            gm = GaussianMixture(n_components=n_clusters, random_state=22).fit(X)
            return gm.predict(X)
    elif cluster_method == 'hierarchical':
        # The linkage matrix does not depend on the cluster count, so it is
        # computed once and cut at different levels.
        Z = linkage(X, method)

        def labels_for(n_clusters):
            return fcluster(Z, n_clusters, criterion='maxclust')
    else:
        raise ValueError(
            "Unknown cluster_method {!r}; expected 'kmeans', 'gaussian_mix' "
            "or 'hierarchical'".format(cluster_method)
        )

    for n_clusters in range_n_clusters:
        cluster_labels = labels_for(n_clusters)
        silhouette_avg = silhouette_score(X, cluster_labels, random_state=22)
        print("For n_clusters ={},".format(n_clusters)+"{} - average silhouette_score :{}".\
              format(cluster_method, silhouette_avg))
In [42]:
##### Kmeans #########
## choose the number of clusters using silhouette method
# Prints one average silhouette score per candidate cluster count.
get_silhouette_score(X, 'kmeans')
In [44]:
##### Kmeans clustering -- k=2
# `k` is reused by the Gaussian-mixture and hierarchical cells further down.
k = 2 # number of clusters
#XX= X.ix[:, ] # hour of day data
#train the model.
km=KMeans(n_clusters=k, random_state=22)
# `res` holds whatever km.fit returns; its labels_ are the per-row cluster ids.
res=km.fit(X)
#result.
print(res.labels_)
In [56]:
## Append to df_res
# Positional assignment: assumes res.labels_ is in the same row order as df_res.
df_res['km'] = res.labels_
## counts of each label
# Counts the 'AgesAllowed' entries per k-means label (i.e. the cluster sizes,
# as long as 'AgesAllowed' has no missing values).
df_res.groupby('km')['AgesAllowed'].count()
Out[56]:
In [75]:
## Distance
# Refit once and reuse the resulting labels, so the labels and the distances
# below come from the same fitted model (the previous version called
# fit_predict twice, refitting the model each time).
scor = km.fit_predict(X)
distances = km.transform(X)  ## distances of each data point to each cluster center!!!
res_p = pd.DataFrame(distances, columns=list(range(k)))
res_p["cluster"] = scor
# "score" = distance of each row to the centre of its own assigned cluster,
# picked with a vectorised (row, assigned-cluster) lookup instead of a
# row-wise apply.
res_p["score"] = distances[np.arange(len(scor)), scor]
# Show the 5 rows that lie farthest from their own cluster centre.
res_p.sort_values("score", ascending=False)[:5]
Out[75]:
In [69]:
# Row count of the distance table (should match the number of rows in df_res).
len(res_p)
Out[69]:
In [79]:
# Copy the per-row distance-to-own-centre into df_res.
# .values drops res_p's index, so the assignment is positional: it assumes
# res_p and df_res are in the same row order.
df_res['distance_KM'] = res_p.score.values
In [80]:
# Check that the 'km' and 'distance_KM' columns were added.
df_res.head(2)
Out[80]:
In [167]:
# Mean distance-to-centre per k-means cluster.
clustermean = df_res.groupby('km')['distance_KM'].mean()
clustermean
Out[167]:
In [139]:
## Get Anomalies
def get_anomaly(df, label_col, dist_or_likelihood, thres):
    """Return the rows whose score is an outlier within their own cluster.

    A row is flagged when the absolute difference between its score and the
    mean score of its cluster, divided by that cluster's standard deviation,
    is greater than ``thres``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the cluster labels and the score column.
    label_col : str
        Name of the column with the cluster labels.
    dist_or_likelihood : str
        Name of the numeric score column (e.g. a distance or a likelihood).
    thres : float
        Threshold on the per-cluster z-score.

    Returns
    -------
    pandas.DataFrame
        The flagged rows with the same columns and dtypes as ``df``, grouped
        by label in order of first appearance. Clusters whose standard
        deviation is zero or undefined contribute no rows.
    """
    if df.empty:
        return df.iloc[0:0]

    per_cluster = df.groupby(label_col)[dist_or_likelihood]
    # transform() broadcasts each cluster's statistic back onto its rows, so
    # the z-score is computed for the whole frame at once.
    z_score = (df[dist_or_likelihood] - per_cluster.transform('mean')).abs() \
        / per_cluster.transform('std')
    is_anomaly = z_score > thres

    # One concat over per-label pieces keeps the label-grouped row order and,
    # unlike growing an initially empty frame, keeps the column dtypes.
    pieces = [df[(df[label_col] == label) & is_anomaly]
              for label in df[label_col].unique()]
    return pd.concat(pieces, axis=0)
In [143]:
## get anomalies based on kmeans distance
# thres=2: flag rows whose distance is more than 2 standard deviations away
# from the mean distance of their own k-means cluster.
km_anomalies = get_anomaly(df_res, 'km', 'distance_KM', 2)
In [144]:
# Display the rows flagged as k-means anomalies.
km_anomalies
Out[144]:
In [ ]:
# Store to dictionary
# The previous `km_anomalies[]` was a syntax error and would have replaced the
# dictionary itself; store the k-means anomalies under their own key instead.
# NOTE(review): the key name 'km' mirrors the label column -- adjust if a
# different key (or only the row index) was intended.
dic_anomaly['km'] = km_anomalies
In [130]:
# Number of rows flagged as k-means anomalies.
len(km_anomalies)
Out[130]:
In [132]:
##### Gaussian Mixture #########
## choose the number of clusters using silhouette method
# Prints one average silhouette score per candidate number of components.
get_silhouette_score(X, 'gaussian_mix')
In [170]:
# Fit a Gaussian mixture with k components (k is set in the k-means cell),
# then record the predicted component of every row in df_res.
GM = GaussianMixture(n_components=k, random_state=22).fit(X)
label_gm = GM.predict(X)
df_res['gm'] = label_gm
In [174]:
#df_res.drop('score_IsoForest', inplace=True, axis=1)
In [175]:
# Check that the 'gm' label column was added.
df_res.head(2)
Out[175]:
In [176]:
## counts of each label
# Counts the 'AgesAllowed' entries per Gaussian-mixture label.
df_res.groupby('gm')['AgesAllowed'].count()
Out[176]:
In [177]:
## choose the number of clusters using silhouette method
# Hierarchical clustering; the linkage method is left at the function's
# default ('complete').
get_silhouette_score(X, 'hierarchical')
In [178]:
# selected number of cluster is 2 (with the highest average silhouette_score)
# Build the complete-linkage hierarchy over X.
Zc = linkage(X, 'complete')
# Fcluster
# Cut the hierarchy into at most k flat clusters (k comes from the k-means cell).
hir_comp_labels = fcluster(Zc, k, criterion='maxclust')
#print hir_comp_labels
# append
df_res['hir'] = hir_comp_labels
In [179]:
# Check that the 'hir' label column was added.
df_res.head(3)
Out[179]:
In [180]:
## counts of each label
# Counts the 'AgesAllowed' entries per hierarchical-cluster label.
df_res.groupby('hir')['AgesAllowed'].count()
Out[180]:
In [182]:
### Output
# Keep only the three cluster-label columns (k-means, Gaussian mixture,
# hierarchical) for export.
df_cluster_labels = df_res[['km', 'gm', 'hir']]
In [183]:
# Preview the label table that is written to disk below.
df_cluster_labels.head(3)
Out[183]:
In [185]:
## save as pickle
# Written to the current working directory; an existing file with the same
# name is overwritten.
df_cluster_labels.to_pickle('Clustering_km_gm_hir_Toronto_Chinese.pkl')
In [1]:
### 3) Isolation Forest
# IsolationForest is imported in the notebook's import cell at the top, so
# this cell only fits and scores.
# fit the model
ISF = IsolationForest(max_samples=100, random_state=22)
ISF.fit(X)
## compute anomaly score of the input. The lower, the more abnormal.
score_isf = ISF.decision_function(X)
In [169]:
# Preview df_res with the columns added so far.
df_res.head(3)
Out[169]:
In [ ]: