In [1]:
import pandas as pd
In [2]:
# Load the two pickled inputs: the main table `df` and the `spatial_label`
# table that is joined onto it in later cells.
# NOTE(review): unpickling can execute arbitrary code — only load these files
# from a trusted source.
df = pd.read_pickle("../yelp-challenge/Anomaly/df_1518.pkl")
spatial_label = pd.read_pickle("../yelp-challenge/data_processeing/spatial_labels.pkl")
In [11]:
# Count, per spatial_label value, the rows whose cuisine_Chinese equals 2.
# NOTE(review): what the code 2 stands for is not visible here — confirm.
tmp = df.join(spatial_label)[['city', 'spatial_label']]
chinese_rows = tmp[df.cuisine_Chinese == 2]
chinese_rows.groupby(tmp.spatial_label).size()
Out[11]:
In [40]:
# Attach the spatial labels to every row of df (index-aligned join).
# DataFrame.join already returns a new frame, so the former df.copy() that was
# immediately overwritten has been removed.
df1 = df.join(spatial_label)
In [41]:
# Shape (rows, columns) of the full, unfiltered table.
df.shape
Out[41]:
In [42]:
# Keep only the rows whose spatial_label is 3, then show the remaining shape.
in_label_3 = df1.spatial_label == 3
df1 = df1[in_label_3]
df1.shape
Out[42]:
In [43]:
# Narrow further to the rows with cuisine_Chinese == 2 and show the shape.
# NOTE(review): the meaning of the value 2 is not visible here — confirm.
is_chinese = df1.cuisine_Chinese == 2
df1 = df1[is_chinese]
df1.shape
Out[43]:
In [44]:
# Keep only the rows rated 4 stars or higher and show the shape.
# Rows with a missing stars value fail the comparison and are dropped too.
has_high_stars = df1.stars >= 4
df1 = df1[has_high_stars]
df1.shape
Out[44]:
In [46]:
# Independent copy of the filtered rows; df1 itself is read again later for
# its stars column.
df2 = df1.copy()
In [47]:
# Show every column name as an array.
df2.columns.values
Out[47]:
In [48]:
# Label-based column slice: keep every column from 'AgesAllowed' through
# 'review_count_greater_median' (both endpoints are included by .loc).
df2 = df2.loc[:,'AgesAllowed':'review_count_greater_median']
In [49]:
# Re-attach the stars column from df1 (aligned on the index); join appends it
# as the last column.
df2 = df2.join(df1.stars)
In [58]:
# Flag the columns that are non-null in more than 70% of the rows, then show
# how many columns pass.
non_null_share = 1.0 * df2.count() / len(df2)
is_keep = non_null_share > 0.7
sum(is_keep)
Out[58]:
In [65]:
# Keep only the columns flagged True in is_keep (boolean column mask).
df2 = df2.loc[:,is_keep]
In [66]:
# Shape after the sparse columns have been removed.
df2.shape
Out[66]:
In [68]:
# Peek at the first rows of the reduced table.
df2.head()
Out[68]:
In [118]:
# Fresh copy of df2 that the dummy-encoding cells operate on.
df3 = df2.copy()
# Disabled experiments: replacing None with False, and listing the distinct
# values of Ambience_hipster.
#df3 = df3.replace([None], [False])
# df3.Ambience_hipster.unique()
In [119]:
# Preview only (nothing is assigned): shape of the column index that
# get_dummies produces for all columns except the last one, with the first
# level of each encoded column dropped and no NaN indicator columns.
pd.get_dummies(df3.iloc[:,:-1],drop_first=True, dummy_na=False).columns.shape
Out[119]:
In [120]:
# One-hot encode the feature columns (first level of each encoded column
# dropped, no NaN indicator columns).
# The trailing 'stars' column (joined on last, earlier in the notebook) is
# excluded by name instead of by position: slicing with iloc[:, :-1] on the
# reassigned df3 silently removed one more column every time this cell was
# re-run. errors='ignore' makes a re-run leave the columns unchanged.
df3 = pd.get_dummies(df3.drop('stars', axis=1, errors='ignore'),
                     drop_first=True, dummy_na=False)
In [122]:
from sklearn.cluster import KMeans
In [184]:
# Fit k-means with a fixed seed on the encoded features, then print how many
# rows were assigned to each cluster.
k = 2
mod = KMeans(n_clusters=k, random_state=1).fit(df3)
for cluster_id in range(k):
    print(sum(mod.labels_ == cluster_id))
In [188]:
from scipy.spatial import distance
In [189]:
def dist2knn(x, nn, k):
    """Return the summed Euclidean distance from ``x`` to its nearest rows of ``nn``.

    Parameters
    ----------
    x : 1-D array-like (e.g. one DataFrame row)
        The query point.
    nn : 2-D array-like (e.g. a DataFrame)
        Candidate points, one per row, with the same number of columns as
        ``x`` has entries.
    k : int
        Number of neighbours to count.

    Returns
    -------
    float
        Sum of the ``k + 1`` smallest distances (or of all distances when
        ``nn`` has fewer rows than that).

    The extra ``+ 1`` is there because callers pass an ``x`` that is itself a
    row of ``nn``: its zero self-distance takes one slot and adds nothing.
    """
    import numpy as np  # local import: the notebook does not import numpy itself

    point = np.asarray(x, dtype=float)
    others = np.asarray(nn, dtype=float)
    # All distances in one vectorised step instead of a Python loop with
    # per-row .iloc access and one scipy call per row.
    dists = np.sqrt(((others - point) ** 2).sum(axis=1))
    dists.sort()
    return float(dists[:k + 1].sum())  # +1: the 0 self-distance fills one slot
In [190]:
dist_sum_knn = []
for i in range(len(df3)):
print '\r{}%'.format(100.0*(i+1)/len(df3)),
dist_sum_knn.append(dist2knn(df3.iloc[i,:], df3, 5))
In [221]:
# New frame: the encoded features plus each row's kNN distance sum in 'dist'.
# assign returns a copy, so df3 is left untouched.
df4 = df3.assign(dist=dist_sum_knn)
In [222]:
# Show the five largest 'dist' values (only that column), largest first.
ranked_by_dist = df4.sort_values(by='dist', ascending=False)
ranked_by_dist[['dist']].head(5)
Out[222]:
In [223]:
# Index labels of the five rows with the largest 'dist', as an array.
# NOTE(review): presumably these labels are business ids — confirm.
ranked_by_dist = df4.sort_values(by='dist', ascending=False)
bid_top5 = ranked_by_dist.index.values[:5]
In [224]:
# One boolean per row of the full table df: True when that row's index label
# is among the five labels in bid_top5.
is_top5 = []
for row_id in df.index.values:
    is_top5.append(row_id in bid_top5)
In [232]:
import geopandas as gpd
import pyproj
from shapely.geometry import Point
In [233]:
# Projection object for EPSG:4326 and a GeoDataFrame built from the full
# table df.
# NOTE(review): the init='epsg:...' form is reported as deprecated in newer
# pyproj releases — confirm against the installed version.
# NOTE(review): gdf is built from df without an explicit copy — confirm
# whether later changes to gdf are meant to be visible on df.
proj = pyproj.Proj(init='epsg:4326', preserve_units=True)
gdf = gpd.GeoDataFrame(df)
In [235]:
# Build one shapely Point per row of gdf: longitude and latitude are cast to
# float, passed through proj, and the resulting pair becomes the point.
geometry = [
    Point(proj(float(lon), float(lat)))
    for lon, lat in zip(gdf['longitude'], gdf['latitude'])
]
In [236]:
# Store the points as the 'geometry' column of gdf (one entry per row, in
# row order).
gdf['geometry'] = geometry
In [240]:
# gdf.plot(c=is_top5)