In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture
%matplotlib inline
In [3]:
# Load the main table from a pickle stored next to the notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle('df_1518.pkl')
In [4]:
# Load the spatial labels from a sibling directory (same pickle trust caveat applies).
# NOTE(review): the directory is spelled 'data_processeing' -- confirm it matches the folder on disk.
spatial_label = pd.read_pickle('../data_processeing/spatial_labels.pkl')
In [5]:
# Append the spatial labels as extra column(s); axis=1 concatenation aligns rows on the index.
df_new = pd.concat([df,spatial_label], axis=1)
In [6]:
# Keep only rows with stars >= 4, spatial_label equal to 7 and cuisine_Chinese equal to 2.
# NOTE(review): `cuisine_Chinese == 2` is unusual for an indicator-style column -- confirm the encoding.
high_rated = df_new['stars'] >= 4
in_spatial_cluster = df_new['spatial_label'] == 7
is_chinese = df_new['cuisine_Chinese'] == 2
df_select = df_new[high_rated & in_spatial_cluster & is_chinese]
In [7]:
# Feature matrix: the label-sliced column range 'AgesAllowed' .. 'review_count_greater_median'
# plus the star rating, which is stringified so that get_dummies one-hot encodes it.
attribute_cols = df_select.loc[:, 'AgesAllowed': 'review_count_greater_median']
star_col = df_select[['stars']]
X = pd.concat([attribute_cols, star_col], axis=1)
X['stars'] = X['stars'].map(str)
# drop_first=True drops one level per encoded column; missing values get no dummy column.
X = pd.get_dummies(X, drop_first=True, dummy_na=False)
def true_false(x):
    """Return 1 when ``x == True`` holds, otherwise 0.

    Because the comparison is an equality test (not a truthiness test),
    values such as ``1`` map to 1 while other truthy values such as ``2``
    or non-empty strings map to 0.
    """
    return 1 if x == True else 0
# Recode the review-count flag column to 1/0 through true_false.
review_flag = X['review_count_greater_median']
X['review_count_greater_median'] = review_flag.map(true_false)
In [8]:
# Sweep 2-4 spectral clusters (sigmoid affinity): for each setting print the
# cityblock silhouette score followed by the member counts of labels 0..4.
for n_clusters in range(2, 5):
    spectural_clustering = SpectralClustering(
        n_clusters=n_clusters, random_state=100, affinity='sigmoid'
    ).fit(X)
    labels = spectural_clustering.labels_
    print(silhouette_score(X, labels, metric='cityblock'))
    label_list = list(labels)
    # Labels that do not exist for the current n_clusters simply print 0.
    for cluster_id in range(5):
        print(label_list.count(cluster_id))
In [9]:
# Final spectral clustering with two clusters.
# NOTE(review): this uses random_state=999 while the sweep above uses 100 -- confirm that is intended.
spectural_clustering = SpectralClustering(
    n_clusters=2, random_state=999, affinity='sigmoid'
)
spectural_clustering.fit(X)
labels_spectural = spectural_clustering.labels_
In [10]:
# Display (rows, columns) of the filtered subset.
df_select.shape
Out[10]:
In [11]:
## function that gets silhouette scores for a clustering method
def get_silhouette_score(X, cluster_method, method='complete'):
    """Print the average silhouette score for 2 to 9 clusters.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to the clustering model and to
        ``silhouette_score``.
    cluster_method : str
        One of ``'kmeans'``, ``'gaussian_mix'`` or ``'hierarchical'``.
    method : str, default 'complete'
        Linkage method; only used when ``cluster_method == 'hierarchical'``.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported names (previously
        the function returned silently without printing anything).

    Notes
    -----
    ``linkage`` and ``fcluster`` come from ``scipy.cluster.hierarchy``; the
    original code referenced them without importing them, so the
    hierarchical branch failed with a NameError.
    """
    # Choose a range of cluster counts to try.
    range_n_clusters = range(2, 10)

    # Pick the labelling strategy once; the scoring/printing loop is shared.
    if cluster_method == 'kmeans':
        def labels_for(n_clusters):
            return KMeans(n_clusters=n_clusters, random_state=22).fit(X).labels_
    elif cluster_method == 'gaussian_mix':
        def labels_for(n_clusters):
            gm = GaussianMixture(n_components=n_clusters, random_state=22).fit(X)
            return gm.predict(X)
    elif cluster_method == 'hierarchical':
        # The linkage matrix does not depend on the cluster count: build it once.
        Z = linkage(X, method)

        def labels_for(n_clusters):
            return fcluster(Z, n_clusters, criterion='maxclust')
    else:
        raise ValueError(
            "Unknown cluster_method {!r}; expected 'kmeans', 'gaussian_mix' "
            "or 'hierarchical'".format(cluster_method))

    for n_clusters in range_n_clusters:
        cluster_labels = labels_for(n_clusters)
        silhouette_avg = silhouette_score(X, cluster_labels, random_state=22)
        print("For n_clusters ={},".format(n_clusters)+"{} - average silhouette_score :{}".\
              format(cluster_method, silhouette_avg))
In [12]:
## choose the number of clusters using silhouette method
# Prints the average silhouette score for each candidate K-Means cluster count.
get_silhouette_score(X, 'kmeans')
In [13]:
# Fit the final K-Means model with three clusters.
n_clusters = 3
km = KMeans(n_clusters=n_clusters, random_state=22)
km = km.fit(X)
labels_km = km.labels_

# Print the number of samples assigned to clusters 0, 1 and 2.
km_label_list = list(labels_km)
for cluster_id in range(3):
    print(km_label_list.count(cluster_id))
In [14]:
## Distance
# Distance of every sample to each K-Means centre, plus the distance to the
# centre of the cluster it is assigned to ("score").
# `km` is already fitted, so it is only queried here; the earlier version
# called fit_predict twice, which re-trained the model for no benefit.
scor = km.predict(X)                      # assigned cluster index per row
center_dist = km.transform(X)             # shape (n_samples, n_clusters)
res_p = pd.DataFrame(center_dist, columns=list(range(n_clusters)))
res_p["cluster"] = scor
# Pick, for each row, the column that matches its own cluster.
res_p["score"] = center_dist[np.arange(len(scor)), scor]
# Show the five samples that lie farthest from their own centre.
res_p.sort_values("score",ascending=False)[:5]
Out[14]:
In [15]:
# Work on a copy so that columns added to X_df do not end up in the feature matrix X.
X_df = X.copy()
In [16]:
# Attach the K-Means cluster label of each row.
X_df['km'] = labels_km
In [17]:
# Attach each row's distance to its own cluster centre; `.values` drops res_p's
# index so the assignment is positional rather than index-aligned.
X_df['distance_KM'] = res_p.score.values
In [18]:
## Get Anomalies
def get_anomaly(df, label_col, dist_or_likelihood, thres):
    """Return the rows whose score deviates strongly from their cluster's mean.

    For every cluster in ``label_col`` the absolute z-score of
    ``dist_or_likelihood`` is computed against that cluster's mean and
    (sample) standard deviation; rows with a z-score above ``thres`` are
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least ``label_col`` and ``dist_or_likelihood``.
    label_col : str
        Column with the cluster label of each row.
    dist_or_likelihood : str
        Column with the per-row score (e.g. distance to the cluster centre).
    thres : float
        Z-score threshold above which a row counts as an anomaly.

    Returns
    -------
    pandas.DataFrame
        The anomalous rows, grouped by cluster in order of first appearance,
        keeping the original index, columns and dtypes. Empty (same columns)
        when nothing exceeds the threshold.

    Notes
    -----
    Clusters whose standard deviation is NaN or 0 (e.g. a single member)
    yield NaN z-scores and therefore contribute no rows.
    """
    grouped = df.groupby(label_col)[dist_or_likelihood]
    z_score = (df[dist_or_likelihood] - grouped.transform('mean')).abs() / grouped.transform('std')
    is_anomaly = z_score > thres

    # Collect the per-cluster slices and concatenate once: seeding the result
    # with an empty frame and growing it in a loop is quadratic and degrades
    # every column to object dtype.
    flagged = []
    for label in df[label_col].unique():
        anomaly = df[(df[label_col] == label) & is_anomaly]
        if not anomaly.empty:
            flagged.append(anomaly)

    if not flagged:
        return df.iloc[0:0].copy()
    return pd.concat(flagged, axis=0)
In [19]:
## get anomalies based on kmeans distance
# Threshold 2: keep rows whose distance lies more than two cluster standard
# deviations away from their cluster's mean distance.
km_anomalies = get_anomaly(X_df, 'km', 'distance_KM', 2)
In [20]:
# Display the centre distances of the flagged rows.
km_anomalies['distance_KM']
Out[20]:
In [ ]:
In [ ]:
In [21]:
##### Gaussian Mixture #########
## choose the number of clusters using silhouette method
# Prints the average silhouette score for each candidate number of mixture components.
get_silhouette_score(X, 'gaussian_mix')
In [24]:
# Fit a two-component Gaussian mixture and assign each row to a component.
k = 2
GM = GaussianMixture(random_state=22, n_components=k).fit(X)
label_gm = GM.predict(X)
In [25]:
# Print how many rows carry each mixture label 0..4 (labels that do not occur print 0).
gm_label_list = list(label_gm)
for component_id in range(5):
    print(gm_label_list.count(component_id))
In [26]:
### 3) Isolation Forest
from sklearn.ensemble import IsolationForest

# Fit the forest on the feature matrix (100 samples drawn per tree).
ISF = IsolationForest(random_state=22, max_samples=100).fit(X)

# Anomaly score of every input row. The lower, the more abnormal.
score_isf = ISF.decision_function(X)
In [27]:
# Position of the lowest Isolation Forest score.
score_isf.argmin()
Out[27]:
In [28]:
def dist2knn(x, nn, k):
    """Sum of Euclidean distances from ``x`` to its ``k + 1`` closest rows of ``nn``.

    ``k + 1`` rows are summed because ``x`` is expected to be one of the rows
    of ``nn``: its zero self-distance takes one slot, leaving the ``k``
    nearest neighbours.

    Parameters
    ----------
    x : array-like, shape (n_features,)
        Query point.
    nn : pandas.DataFrame or array-like, shape (n_points, n_features)
        Candidate points.
    k : int
        Number of neighbours (excluding ``x`` itself).

    Returns
    -------
    float
        Sum of the ``k + 1`` smallest distances (fewer if ``nn`` has fewer rows).
    """
    # Cast to float first so integer/boolean columns cannot wrap around when
    # subtracted, then compute all distances in one vectorised pass instead of
    # one Python-level call per row.
    points = np.asarray(nn, dtype=float)
    if len(points) == 0:
        return 0.0
    query = np.asarray(x, dtype=float)
    dist_ = np.sqrt(((points - query) ** 2).sum(axis=1))
    dist_.sort()
    return float(dist_[:k + 1].sum())  # +1: remove 0 self
In [29]:
dist_sum_knn = []
for i in range(len(X)):
print '\r{}%'.format(100.0*(i+1)/len(X)),
dist_sum_knn.append(dist2knn(X.iloc[i,:], X, 5))
In [ ]:
In [30]:
# Empty results table that shares X's index, so per-row outputs can be attached to it.
result = pd.DataFrame(index=X.index)
In [31]:
# Attach every method's output as a column, in this fixed order.
# Array-like values are assigned positionally; `km_anomalies['distance_KM']`
# is a Series and is therefore aligned on the index, so rows that were not
# flagged as K-Means anomalies end up as NaN in 'distance_km'.
# NOTE(review): confirm that only anomaly distances are wanted in 'distance_km'.
result_columns = [
    ('clusters_sp', labels_spectural),
    ('clusters_km', labels_km),
    ('distance_km', km_anomalies['distance_KM']),
    ('clusters_gm', label_gm),
    ('scores_isf', score_isf),
    ('distance_knn', dist_sum_knn),
]
for column_name, column_values in result_columns:
    result[column_name] = column_values
In [32]:
# Display the combined results table.
result
Out[32]:
In [33]:
# Persist the results as UTF-8 CSV in the working directory;
# index_label=False writes no header field for the index column.
result.to_csv('LasVegas_chi_results.csv', index_label=False, encoding='utf-8' )
In [34]:
# Join the results back onto the filtered rows (column-wise, aligned on the index).
anomaly_merged = pd.concat([df_select, result], axis=1)
In [35]:
# K-Means centre distances in ascending order (missing values sort last by default).
anomaly_merged['distance_km'].sort_values()
Out[35]:
In [43]:
# Summed nearest-neighbour distances in ascending order.
anomaly_merged['distance_knn'].sort_values()
Out[43]:
In [42]:
# Isolation Forest scores in ascending order, i.e. lowest scores first.
anomaly_merged['scores_isf'].sort_values()
Out[42]:
In [46]:
# Inspect every column of one row, selected by its index label.
# NOTE(review): the label is presumably a business id -- confirm against the source table.
anomaly_merged.loc['zjvnqTjBp56NhMp1GrlO5g', :]
Out[46]:
In [ ]: