In [1]:

    
import pandas as pd



In [2]:

    
# Load the main feature table (`df`) and the spatial labels that later cells
# join onto it by index.
# NOTE(review): unpickling can execute arbitrary code, so only point these
# paths at files you produced or otherwise trust.
df = pd.read_pickle("../yelp-challenge/Anomaly/df_1518.pkl")
spatial_label = pd.read_pickle("../yelp-challenge/data_processeing/spatial_labels.pkl")

3 - the greater T



In [11]:

    
# Number of rows with cuisine_Chinese == 2 inside each spatial label.
tmp = df.join(spatial_label).loc[:, ['city', 'spatial_label']]
tmp.loc[df['cuisine_Chinese'] == 2].groupby('spatial_label').size()









    Out[11]:





spatial_label
0      80
1      84
2     482
3     853
4      63
5     182
6     194
7     410
8     163
9     139
10     43
dtype: int64



In [40]:

    
# Attach `spatial_label` to `df` by index. `join` already returns a new frame,
# so the earlier `df.copy()` was dead code (its result was overwritten at once)
# and `df` itself is left untouched.
df1 = df.join(spatial_label)



In [41]:

    
# (rows, columns) of the unfiltered input frame `df` (not of `df1`).
df.shape









    Out[41]:





(27314, 66)

Filter

Spatial_label == 3



In [42]:

    
# Restrict to the rows whose spatial label is 3, then show (rows, columns).
df1 = df1.loc[df1['spatial_label'] == 3]
df1.shape









    Out[42]:





(6464, 67)

Chinese == 2



In [43]:

    
# Restrict further to rows with cuisine_Chinese == 2, then show (rows, columns).
df1 = df1.loc[df1['cuisine_Chinese'] == 2]
df1.shape









    Out[43]:





(853, 67)

Stars >= 4



In [44]:

    
# Restrict further to rows rated 4 stars or more, then show (rows, columns).
df1 = df1.loc[df1['stars'] >= 4]
df1.shape









    Out[44]:





(235, 67)

Anomaly Detection



In [46]:

    
# Independent copy of the filtered frame; the feature selection below works on
# df2 so that df1 keeps every original column.
df2 = df1.copy()



In [47]:

    
# List every column name, to pick the label range used for the attribute slice.
df2.columns.values









    Out[47]:





array([u'categories', u'city', u'hours', u'is_open', u'latitude',
       u'longitude', u'name', u'neighborhood', u'postal_code',
       u'review_count', u'stars', u'state', 'AgesAllowed',
       'Ambience_casual', 'Ambience_classy', 'Ambience_divey',
       'Ambience_hipster', 'Ambience_intimate', 'Ambience_romantic',
       'Ambience_touristy', 'Ambience_trendy', 'Ambience_upscale', 'BYOB',
       'BYOBCorkage', 'BestNights_friday', 'BestNights_monday',
       'BestNights_saturday', 'BestNights_sunday', 'BestNights_thursday',
       'BestNights_tuesday', 'BestNights_wednesday', 'BikeParking',
       'BusinessAcceptsCreditCards', 'BusinessParking_garage',
       'BusinessParking_lot', 'BusinessParking_street',
       'BusinessParking_valet', 'BusinessParking_validated', 'Corkage',
       'DietaryRestrictions_dairy-free', 'DietaryRestrictions_gluten-free',
       'DietaryRestrictions_halal', 'DietaryRestrictions_kosher',
       'DietaryRestrictions_soy-free', 'DietaryRestrictions_vegan',
       'DietaryRestrictions_vegetarian', 'GoodForMeal_breakfast',
       'GoodForMeal_brunch', 'GoodForMeal_dessert', 'GoodForMeal_dinner',
       'GoodForMeal_latenight', 'GoodForMeal_lunch', 'HappyHour',
       'Music_background_music', 'Music_dj', 'Music_jukebox',
       'Music_karaoke', 'Music_live', 'Music_no_music', 'Music_video',
       'OutdoorSeating', 'RestaurantsDelivery', 'RestaurantsGoodForGroups',
       'Smoking', 'review_count_greater_median', 'cuisine_Chinese',
       'spatial_label'], dtype=object)



In [48]:

    
# Label-based column slice (both end labels are included): keeps the attribute
# columns from 'AgesAllowed' through 'review_count_greater_median' and drops
# the descriptive columns before them as well as 'cuisine_Chinese' and
# 'spatial_label' after them.
df2 = df2.loc[:,'AgesAllowed':'review_count_greater_median']



In [49]:

    
# Re-attach the star rating (removed by the preceding column slice) as the
# last column, matched on the shared index.
# NOTE(review): this cell is presumably not re-runnable on its own, since
# `stars` would already be present in df2 — verify before re-executing.
df2 = df2.join(df1.stars)



In [58]:

    
# Flag the columns whose share of non-null values exceeds 70%; float() keeps
# the division a true division on Python 2. The last line shows how many pass.
is_keep = (df2.count() / float(len(df2))) > 0.7
is_keep.sum()









    Out[58]:





34



In [65]:

    
# Keep only the columns flagged True in `is_keep` (more than 70% non-null).
df2 = df2.loc[:,is_keep]



In [66]:

    
# (rows, columns) after dropping the sparsely populated columns.
df2.shape









    Out[66]:





(235, 34)



In [68]:

    
# Preview the first five rows of the retained feature columns plus `stars`.
df2.head()









    Out[68]:






  
    
      
      AgesAllowed
      Ambience_classy
      Ambience_divey
      Ambience_hipster
      Ambience_romantic
      Ambience_upscale
      BYOB
      BYOBCorkage
      BestNights_sunday
      BestNights_tuesday
      ...
      GoodForMeal_lunch
      HappyHour
      Music_no_music
      Music_video
      OutdoorSeating
      RestaurantsDelivery
      RestaurantsGoodForGroups
      Smoking
      review_count_greater_median
      stars
    
  
  
    
      KeQ1cK564cL5C_hBTFrqnA
      False
      full_bar
      True
      average
      casual
      True
      True
      True
      True
      False
      ...
      False
      True
      True
      False
      True
      False
      False
      False
      True
      4.0
    
    
      f5xm2RiwLv0gbmXU4BkrGA
      False
      beer_and_wine
      True
      average
      casual
      NaN
      True
      NaN
      True
      NaN
      ...
      NaN
      False
      NaN
      NaN
      True
      NaN
      NaN
      NaN
      False
      4.0
    
    
      q0oPX1DXW86QytTvvrD9MA
      False
      full_bar
      True
      quiet
      casual
      True
      True
      False
      True
      False
      ...
      False
      True
      True
      False
      True
      False
      False
      True
      False
      4.0
    
    
      0C6uuZBD7xKS-uucgwoyQw
      False
      none
      False
      quiet
      casual
      True
      False
      False
      True
      False
      ...
      False
      False
      False
      True
      False
      False
      False
      False
      True
      4.0
    
    
      mevOvXMnlTLbSl4w8fcG2A
      False
      full_bar
      False
      average
      casual
      True
      True
      False
      True
      False
      ...
      False
      True
      True
      False
      False
      False
      False
      False
      False
      4.0
    
  

5 rows × 34 columns



In [118]:

    
# Work on a copy so the encoding steps that follow leave df2 intact.
df3 = df2.copy()
# Disabled experiments, kept for reference:
#df3 = df3.replace([None], [False])
# df3.Ambience_hipster.unique()



In [119]:

    
# Dry run: how many columns one-hot encoding yields for every column except
# the last one (`stars`). Nothing is assigned here.
pd.get_dummies(df3.iloc[:,:-1],drop_first=True, dummy_na=False).columns.shape









    Out[119]:





(36,)



In [120]:

    
# One-hot encode every feature except the last column of df2 (`stars`).
# The source is df2 (of which df3 was a plain copy) rather than df3 itself:
# the original `df3 = get_dummies(df3.iloc[:, :-1])` sliced its own output, so
# each re-run of the cell silently dropped one more column.
# drop_first=True drops one level per categorical column; dummy_na=False means
# a missing value gets no indicator column of its own.
df3 = pd.get_dummies(df2.iloc[:, :-1], drop_first=True, dummy_na=False)



In [122]:

    
from sklearn.cluster import KMeans



In [184]:

    
# Fit k-means with k clusters on the encoded features and print the number of
# rows assigned to each cluster. random_state is fixed for reproducibility.
k = 2
mod = KMeans(n_clusters=k, random_state=1).fit(df3)
for i in range(k):
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare `print expr` statement is a SyntaxError on Python 3.
    print((mod.labels_ == i).sum())



In [188]:

    
from scipy.spatial import distance



In [189]:

    
def dist2knn(x, nn, k):
    """Sum the Euclidean distances from ``x`` to its nearest rows of ``nn``.

    The ``k + 1`` smallest distances are summed. When ``x`` is itself a row of
    ``nn`` the smallest one is the zero distance to itself, so the result is
    the total distance to the ``k`` nearest *other* rows.

    Parameters
    ----------
    x : sequence of numbers
        The query point (for example one row of ``nn``).
    nn : DataFrame or 2-D array-like
        Candidate points, one per row, with as many columns as ``x`` has
        entries.
    k : int
        Number of neighbours to accumulate (not counting ``x`` itself).

    Returns
    -------
    float
        Sum of the ``k + 1`` smallest distances; ``0`` when ``nn`` is empty.
    """
    if len(nn) == 0:
        # Nothing to compare against; matches summing an empty list.
        return 0
    # One vectorised call instead of a Python-level loop over rows.
    dist_ = sorted(distance.cdist([list(x)], nn, metric='euclidean')[0])
    return sum(dist_[:k+1]) # +1: the leading 0 is x's distance to itself



In [190]:

    
import sys  # local import: progress output that works on Python 2 and 3

# Score every row of df3 by the summed distance to its 5 nearest neighbours
# (see dist2knn); a larger score means the row sits further from its peers.
dist_sum_knn = []
for i in range(len(df3)):
    # '\r' rewrites the same console line. The original Python 2
    # `print ...,` statement is a SyntaxError on Python 3.
    sys.stdout.write('\r{}%'.format(100.0*(i+1)/len(df3)))
    sys.stdout.flush()
    dist_sum_knn.append(dist2knn(df3.iloc[i,:], df3, 5))



In [221]:

    
# New frame: the encoded features plus each row's kNN distance score in `dist`.
df4 = df3.assign(dist=dist_sum_knn)

Top 5 Possible Anomalies



In [222]:

    
# Show the five largest kNN distance scores, highest first.
df4[['dist']].sort_values(by='dist', ascending=False).head(5)









    Out[222]:






  
    
      
      dist
    
  
  
    
      E0iHvHraTa-t6ka9rYL8uQ
      12.836233
    
    
      8xPmlVJy2o6x0J04CBpEMQ
      12.836233
    
    
      td9FZybutwNG7DgocHCiXA
      12.836233
    
    
      bGuxRBRKv7i1BKKfGbjxEw
      12.822648
    
    
      1pttL4MkpxOL6Mj2azOjVQ
      12.622812



In [223]:

    
# Index labels of the five rows with the largest kNN distance score.
bid_top5 = df4[['dist']].sort_values(by='dist', ascending=False).head(5).index.values



In [224]:

    
# One boolean per row of the full `df`: True where its index label is among
# the top-5 ids.
is_top5 = df.index.isin(bid_top5).tolist()



In [232]:

    
import geopandas as gpd
import pyproj
from shapely.geometry import Point



In [233]:

    
# Projection object for EPSG:4326, later applied to each (longitude, latitude).
# NOTE(review): the `init=` keyword is deprecated in newer pyproj releases —
# confirm against the installed version before upgrading.
proj = pyproj.Proj(init='epsg:4326', preserve_units=True)
# Wrap the full, unfiltered `df` so a geometry column can be attached to it.
# NOTE(review): presumably this shares data with `df` rather than copying —
# verify if `df` must stay unmodified.
gdf = gpd.GeoDataFrame(df)



In [235]:

    
# One shapely Point per row of gdf, built from its longitude/latitude after
# passing the pair through `proj`.
geometry = [
    Point(proj(float(lon), float(lat)))
    for lon, lat in zip(gdf['longitude'], gdf['latitude'])
]



In [236]:

    
# Store the Points in the 'geometry' column of gdf.
gdf['geometry'] = geometry



In [240]:

    
# gdf.plot(c=is_top5)

	AgesAllowed	Ambience_classy	Ambience_divey	Ambience_hipster	Ambience_romantic	Ambience_upscale	BYOB	BYOBCorkage	BestNights_sunday	BestNights_tuesday	...	GoodForMeal_lunch	HappyHour	Music_no_music	Music_video	OutdoorSeating	RestaurantsDelivery	RestaurantsGoodForGroups	Smoking	review_count_greater_median	stars
KeQ1cK564cL5C_hBTFrqnA	False	full_bar	True	average	casual	True	True	True	True	False	...	False	True	True	False	True	False	False	False	True	4.0
f5xm2RiwLv0gbmXU4BkrGA	False	beer_and_wine	True	average	casual	NaN	True	NaN	True	NaN	...	NaN	False	NaN	NaN	True	NaN	NaN	NaN	False	4.0
q0oPX1DXW86QytTvvrD9MA	False	full_bar	True	quiet	casual	True	True	False	True	False	...	False	True	True	False	True	False	False	True	False	4.0
0C6uuZBD7xKS-uucgwoyQw	False	none	False	quiet	casual	True	False	False	True	False	...	False	False	False	True	False	False	False	False	True	4.0
mevOvXMnlTLbSl4w8fcG2A	False	full_bar	False	average	casual	True	True	False	True	False	...	False	True	True	False	False	False	False	False	False	4.0

	dist
E0iHvHraTa-t6ka9rYL8uQ	12.836233
8xPmlVJy2o6x0J04CBpEMQ	12.836233
td9FZybutwNG7DgocHCiXA	12.836233
bGuxRBRKv7i1BKKfGbjxEw	12.822648
1pttL4MkpxOL6Mj2azOjVQ	12.622812