In [1]:
    
import sys

import pandas as pd
    
In [2]:
    
# Load the main DataFrame and the separately stored spatial labels.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("../yelp-challenge/Anomaly/df_1518.pkl")
spatial_label = pd.read_pickle("../yelp-challenge/data_processeing/spatial_labels.pkl")
    
In [11]:
    
# Row counts per spatial label among rows whose cuisine_Chinese value is 2.
# NOTE(review): the meaning of the code 2 is not visible in this notebook.
tmp = df.join(spatial_label).loc[:, ['city', 'spatial_label']]
tmp[df['cuisine_Chinese'] == 2].groupby(tmp['spatial_label']).size()
    
    Out[11]:
In [40]:
    
# Index-aligned join of the columns in `spatial_label` onto `df`.
# DataFrame.join returns a new frame, so `df` is left untouched and no
# defensive copy is needed beforehand.
df1 = df.join(spatial_label)
    
In [41]:
    
# Dimensions (rows, columns) of the full, unfiltered DataFrame.
df.shape
    
    Out[41]:
In [42]:
    
# Keep only the rows carrying spatial label 3, then show the remaining size.
df1 = df1.loc[df1['spatial_label'] == 3]
df1.shape
    
    Out[42]:
In [43]:
    
# Narrow further to rows whose cuisine_Chinese value is 2, then show the size.
df1 = df1.loc[df1['cuisine_Chinese'] == 2]
df1.shape
    
    Out[43]:
In [44]:
    
# Keep only rows rated 4 stars or higher, then show the size.
df1 = df1.loc[df1['stars'] >= 4]
df1.shape
    
    Out[44]:
In [46]:
    
# Independent copy of the filtered rows so later column selection does not alter df1.
df2 = df1.copy()
    
In [47]:
    
# List every column name as an array.
df2.columns.values
    
    Out[47]:
In [48]:
    
# Keep the contiguous column range from 'AgesAllowed' through
# 'review_count_greater_median'; label slices with .loc include both endpoints.
df2 = df2.loc[:,'AgesAllowed':'review_count_greater_median']
    
In [49]:
    
# Re-attach the star rating; join appends it as the last column.
# NOTE: re-running this cell raises because 'stars' would already be present.
df2 = df2.join(df1['stars'])
    
In [58]:
    
# Flag the columns that are non-null in more than 70% of rows, then count them.
is_keep = df2.notnull().mean() > 0.7
is_keep.sum()
    
    Out[58]:
In [65]:
    
# Drop the columns not flagged in the boolean mask `is_keep` (indexed by column name).
df2 = df2.loc[:,is_keep]
    
In [66]:
    
# Dimensions after removing the sparsely populated columns.
df2.shape
    
    Out[66]:
In [68]:
    
# Preview the first rows of the reduced frame.
df2.head()
    
    Out[68]:
In [118]:
    
# Work on a copy so df2 stays available unchanged.
df3 = df2.copy()
# Disabled experiments, kept for reference (not executed):
#df3 = df3.replace([None], [False])
# df3.Ambience_hipster.unique()
    
In [119]:
    
# Preview how many columns the one-hot encoding yields; the last column of
# df3 is excluded from the encoding.
pd.get_dummies(
    df3.iloc[:, :-1],
    dummy_na=False,
    drop_first=True,
).columns.shape
    
    Out[119]:
In [120]:
    
# One-hot encode every column except the last; the first level of each
# encoded column is dropped and missing values get no indicator column.
# NOTE: df3 is rebuilt from itself, so re-running this cell drops another
# trailing column.
df3 = pd.get_dummies(
    df3.iloc[:, :-1],
    dummy_na=False,
    drop_first=True,
)
    
In [122]:
    
from sklearn.cluster import KMeans
    
In [184]:
    
# Fit k-means with a fixed seed on the encoded rows and report how many rows
# land in each cluster.
k = 2
mod = KMeans(n_clusters=k, random_state=1).fit(df3)
for i in range(k):
    # A parenthesised single-argument print behaves identically on Python 2
    # and Python 3; the bare print statement is a SyntaxError on Python 3.
    print((mod.labels_ == i).sum())
    
    
In [188]:
    
from scipy.spatial import distance
    
In [189]:
    
def dist2knn(x, nn, k):
    """Sum the k+1 smallest Euclidean distances from ``x`` to the rows of ``nn``.

    x  -- 1-D vector accepted by ``distance.euclidean``.
    nn -- DataFrame whose rows are each compared against ``x``.
    k  -- number of neighbours to accumulate. One extra distance is taken
          because, when ``x`` is itself a row of ``nn``, its zero
          self-distance sorts first and adds nothing to the total.

    Returns the sum as a float (0 when ``nn`` has no rows).
    """
    ordered = sorted(
        distance.euclidean(x, nn.iloc[row, :]) for row in range(len(nn))
    )
    return sum(ordered[:k + 1])
    
In [190]:
    
# Score every row of df3 by the summed distance to its 5 nearest rows
# (see dist2knn); a larger score means the row is more isolated.
dist_sum_knn = []
for i in range(len(df3)):
    # The leading carriage return rewrites the same console line as a progress
    # indicator. sys.stdout.write runs on both Python 2 and 3, whereas the
    # trailing-comma print statement is Python-2-only syntax.
    sys.stdout.write('\r{}%'.format(100.0 * (i + 1) / len(df3)))
    sys.stdout.flush()
    dist_sum_knn.append(dist2knn(df3.iloc[i, :], df3, 5))
    
    
In [221]:
    
# New frame holding the encoded features plus each row's kNN distance score
# in a `dist` column; df3 itself is not modified.
df4 = df3.assign(dist=dist_sum_knn)
    
In [222]:
    
# Show the five rows with the largest kNN distance score.
df4[['dist']].sort_values(by='dist', ascending=False).head(5)
    
    Out[222]:
In [223]:
    
# Index labels of the five rows with the largest kNN distance score.
bid_top5 = (
    df4[['dist']]
    .sort_values(by='dist', ascending=False)
    .head(5)
    .index.values
)
    
In [224]:
    
# One boolean per row of the full DataFrame: True where the row's index label
# is among the five highest-scoring labels.
is_top5 = df.index.isin(bid_top5).tolist()
    
In [232]:
    
import geopandas as gpd
import pyproj
from shapely.geometry import Point
    
In [233]:
    
# Projection object for EPSG:4326; it is called on each (longitude, latitude)
# pair when the point geometries are built.
# NOTE(review): the `init=` keyword is deprecated in pyproj 2+ — confirm the
# installed version before upgrading.
proj = pyproj.Proj(init='epsg:4326', preserve_units=True)
# Wrap the full, unfiltered DataFrame as a GeoDataFrame.
gdf = gpd.GeoDataFrame(df)
    
In [235]:
    
# Build one shapely Point per row by passing its longitude/latitude (as
# floats) through `proj`.
geometry = [
    Point(proj(float(lon), float(lat)))
    for lon, lat in zip(gdf['longitude'], gdf['latitude'])
]
    
In [236]:
    
# Store the list of Point objects as the GeoDataFrame's 'geometry' column.
gdf['geometry'] = geometry
    
In [240]:
    
# gdf.plot(c=is_top5)