In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.cluster.hierarchy import linkage, fcluster  # used by get_silhouette_score below
import matplotlib.pyplot as plt
%matplotlib inline
In [31]:
df = pd.read_pickle('../data_processeing/Yelp_Cuisine_Japanese.pkl')
d = df.copy()  # keep the full frame; df below is sliced down to the attribute columns
df = df.loc[:,'AcceptsInsurance':'cuisine_Japanese']
if_col_keep = 1.0 * df.count().sort_values() / len(df) > 0.7  # keep columns that are >= 70% non-null
col_keep = df.count().sort_values()[if_col_keep]
col_prefix = []
for c in col_keep.keys().values:
    if c != 'cuisine_Chinese':  # exclude this column from prefix extraction
        col_prefix.append(c.split('_')[0])
col_prefix = list(set(col_prefix))
delete_col = ['HairSpecializesIn']  # drop prefixes irrelevant to restaurants
for c in delete_col:
    col_prefix.remove(c)
len(col_prefix)
col_with_prefix = []
for c in df.columns[:-1]:
    if c.split('_')[0] in col_prefix:
        col_with_prefix.append(c)
df1 = df.copy()
review_count = df1.join(d.review_count).review_count
md = review_count.median()
df1['review_count_greater_median'] = review_count > md
df_basic = d[[u'categories', u'city', u'hours', u'is_open', u'latitude', u'longitude', u'name', u'neighborhood', u'postal_code', u'review_count', u'stars', u'state']]
df_final = df_basic.join(df1[col_with_prefix+['review_count_greater_median','cuisine_Japanese']])
In [32]:
spatial_label = pd.read_pickle('../data_processeing/spatial_labels.pkl')
In [33]:
df_new = pd.concat([df_final,spatial_label], axis=1)
In [35]:
## restrict to highly rated (4+ stars) Japanese restaurants in spatial cluster 7
df_select = df_new[(df_new['stars'] >= 4) & (df_new['spatial_label'] == 7) & (df_new['cuisine_Japanese'] == 2)]
In [36]:
X = df_select.loc[:, 'AgesAllowed': 'review_count_greater_median']
X = pd.concat([X, df_select[['stars']]], axis=1)
X['stars'] = X['stars'].astype(str)  # treat star rating as categorical before one-hot encoding
X = pd.get_dummies(X, dummy_na=False, drop_first=True)
# convert the boolean column to 0/1 so every feature is numeric
X['review_count_greater_median'] = X['review_count_greater_median'].astype(int)
In [37]:
for n_clusters in range(2, 5):
    spectral_clustering = SpectralClustering(n_clusters=n_clusters, random_state=100,
                                             affinity='sigmoid').fit(X)
    labels = spectral_clustering.labels_
    print(silhouette_score(X, labels, metric='cityblock'))
    print(np.bincount(labels))  # cluster sizes
In [38]:
spectral_clustering = SpectralClustering(n_clusters=2, random_state=999, affinity='sigmoid').fit(X)
labels_spectral = spectral_clustering.labels_
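In [ ]:
## a quick supplementary check of the chosen 2-cluster solution
## (same metric as the sweep above; not in the original notebook):
print(silhouette_score(X, labels_spectral, metric='cityblock'))
print(np.bincount(labels_spectral))  # cluster sizes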
In [39]:
df_select.shape
Out[39]:
In [40]:
## function that prints average silhouette scores for a clustering method
def get_silhouette_score(X, cluster_method, method='complete'):
    # range of cluster counts to try
    range_n_clusters = range(2, 10)
    if cluster_method == 'kmeans':
        for n_clusters in range_n_clusters:
            km_result = KMeans(n_clusters=n_clusters, random_state=22).fit(X)
            cluster_labels = km_result.labels_
            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters = {}, {} - average silhouette_score: {}".format(
                n_clusters, cluster_method, silhouette_avg))
    elif cluster_method == 'gaussian_mix':
        for n_clusters in range_n_clusters:
            gm = GaussianMixture(n_components=n_clusters, random_state=22).fit(X)
            cluster_labels = gm.predict(X)
            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters = {}, {} - average silhouette_score: {}".format(
                n_clusters, cluster_method, silhouette_avg))
    elif cluster_method == 'hierarchical':
        Z = linkage(X, method)  # scipy hierarchical linkage
        for n_clusters in range_n_clusters:
            cluster_labels = fcluster(Z, n_clusters, criterion='maxclust')
            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters = {}, {} - average silhouette_score: {}".format(
                n_clusters, cluster_method, silhouette_avg))
In [41]:
## choose the number of clusters using silhouette method
get_silhouette_score(X, 'kmeans')
In [43]:
n_clusters = 3  # chosen from the silhouette scores above
km = KMeans(n_clusters=n_clusters, random_state=22).fit(X)
labels_km = km.labels_
print(np.bincount(labels_km))  # cluster sizes
In [44]:
## Distance of each data point to its own cluster center
res_p = pd.DataFrame(km.transform(X), columns=range(n_clusters))  # distances to every center
res_p['cluster'] = labels_km                                      # assigned cluster
res_p['score'] = res_p.apply(lambda x: x[int(x['cluster'])], axis=1)
res_p.sort_values('score', ascending=False)[:5]  # the 5 points farthest from their own center
Out[44]:
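In [ ]:
## The same per-point distance can also be computed without apply();
## a minimal vectorized sketch (assumes numpy >= 1.15 for take_along_axis):
dist_own = np.take_along_axis(km.transform(X), labels_km[:, None], axis=1).ravel()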
In [45]:
X_df = X.copy()
In [46]:
X_df['km'] = labels_km
In [47]:
X_df['distance_KM'] = res_p.score.values
In [48]:
## Get anomalies: within each cluster, flag points whose distance (or likelihood)
## lies more than `thres` standard deviations from the cluster mean
def get_anomaly(df, label_col, dist_or_likelihood, thres):
    df_anomalies = pd.DataFrame(columns=df.columns)
    clustermean = df.groupby(label_col)[dist_or_likelihood].mean()
    clusterstd = df.groupby(label_col)[dist_or_likelihood].std()
    for label in df[label_col].unique():
        anomaly = df[(df[label_col] == label)
                     & (abs(df[dist_or_likelihood] - clustermean[label]) / clusterstd[label] > thres)]
        df_anomalies = pd.concat([df_anomalies, anomaly], axis=0)
    return df_anomalies
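In [ ]:
## A tiny self-contained check of the z-score rule above (toy numbers, not Yelp data):
toy = pd.DataFrame({'km': [0] * 10,
                    'distance_KM': [0.9, 1.0, 1.1, 1.0, 0.9, 1.1, 1.0, 1.0, 1.0, 5.0]})
get_anomaly(toy, 'km', 'distance_KM', 2)  # flags only the 5.0 row (z ~ 2.8)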
In [49]:
## get anomalies based on kmeans distance
km_anomalies = get_anomaly(X_df, 'km', 'distance_KM', 2)
In [50]:
km_anomalies['distance_KM']
Out[50]:
In [51]:
##### Gaussian Mixture #########
## choose the number of clusters using silhouette method
get_silhouette_score(X, 'gaussian_mix')
In [52]:
k = 4  # chosen from the silhouette scores above
GM = GaussianMixture(n_components=k, random_state=22)
GM.fit(X)
label_gm = GM.predict(X)
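In [ ]:
## For a likelihood-based anomaly score analogous to the KMeans distance, the fitted
## mixture exposes per-point log-likelihood; a minimal sketch (not used further below):
loglik_gm = GM.score_samples(X)  # the lower, the less likely under the mixture
pd.Series(loglik_gm, index=X.index).sort_values()[:5]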
In [53]:
print(np.bincount(label_gm))  # cluster sizes
In [54]:
### 3) Isolation Forest
from sklearn.ensemble import IsolationForest
# fit the model
ISF = IsolationForest(max_samples=100, random_state=22)
ISF.fit(X)
## compute the anomaly score of each input point; the lower, the more abnormal
score_isf = ISF.decision_function(X)
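In [ ]:
## IsolationForest can also label points directly using its built-in threshold;
## a quick supplementary check (predict returns -1 for anomalies, 1 for normal points):
labels_isf = ISF.predict(X)
print(list(labels_isf).count(-1))  # number of points flagged as anomalous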
In [55]:
score_isf.argmin()
Out[55]:
In [56]:
## sum of Euclidean distances from x to its k nearest neighbors in nn
def dist2knn(x, nn, k):
    dist_ = []
    for i in range(len(nn)):
        dist_.append(distance.euclidean(x, nn.iloc[i, :]))
    dist_.sort()
    return sum(dist_[:k+1])  # the smallest distance is x to itself (0), hence k+1
In [57]:
dist_sum_knn = []
for i in range(len(X)):
    print('\r{:.0f}%'.format(100.0 * (i + 1) / len(X)), end='')  # progress indicator
    dist_sum_knn.append(dist2knn(X.iloc[i, :], X, 5))
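In [ ]:
## The loop above is quadratic per point; an equivalent, faster sketch using
## sklearn's NearestNeighbors (assumption: same Euclidean metric, k = 5):
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=6).fit(X)  # 6 = the point itself + 5 neighbors
dists, _ = nbrs.kneighbors(X)
dist_sum_knn_fast = dists.sum(axis=1)          # self-distance is 0, so this sums the 5 NN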
In [58]:
result = pd.DataFrame(index=X.index)
In [59]:
result['clusters_sp'] = labels_spectral
result['clusters_km'] = labels_km
result['distance_km'] = km_anomalies['distance_KM']  # NaN for rows not flagged as KMeans anomalies
result['clusters_gm'] = label_gm
result['scores_isf'] = score_isf
result['distance_knn'] = dist_sum_knn
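In [ ]:
## A hypothetical cross-method sanity check (not part of the original analysis): average the
## anomaly ranks of the three continuous scores; a low mean rank means anomalous under all of them
ranks = pd.DataFrame(index=result.index)
ranks['km'] = result['distance_km'].rank(ascending=False)   # large distance = more anomalous
ranks['knn'] = result['distance_knn'].rank(ascending=False)
ranks['isf'] = result['scores_isf'].rank(ascending=True)    # low score = more anomalous
ranks.mean(axis=1).sort_values()[:5]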
In [60]:
result
Out[60]:
In [61]:
result.to_csv('LasVegas_jap_results.csv', index_label=False, encoding='utf-8' )
In [63]:
anomaly_merged = pd.concat([df_select, result], axis=1)
In [64]:
anomaly_merged['distance_km'].sort_values()
Out[64]:
In [65]:
anomaly_merged['distance_knn'].sort_values()
Out[65]:
In [66]:
anomaly_merged['scores_isf'].sort_values()
Out[66]:
In [73]:
anomaly_merged.loc['IhYo9Szx_gPDfNOcz5-8DQ', :]
Out[73]: