In [1]:
import sys

import pandas as pd

In [2]:
# Input pickles. Unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
BUSINESS_PICKLE = "../yelp-challenge/Anomaly/df_1518.pkl"
SPATIAL_LABEL_PICKLE = "../yelp-challenge/data_processeing/spatial_labels.pkl"

# Business table, and the spatial label that is joined onto it further down.
df = pd.read_pickle(BUSINESS_PICKLE)
spatial_label = pd.read_pickle(SPATIAL_LABEL_PICKLE)

3 - the greater T


In [11]:
# City and spatial label for every business, then the number of businesses
# with cuisine_Chinese == 2 in each spatial label.
tmp = df.join(spatial_label).loc[:, ['city', 'spatial_label']]
chinese_rows = tmp.loc[df['cuisine_Chinese'] == 2]
chinese_rows.groupby('spatial_label').size()


Out[11]:
spatial_label
0      80
1      84
2     482
3     853
4      63
5     182
6     194
7     410
8     163
9     139
10     43
dtype: int64

In [40]:
# Attach the spatial label to every business row. `join` already returns a
# new frame and leaves `df` untouched, so no prior copy of `df` is needed.
df1 = df.join(spatial_label)

In [41]:
# Shape of the original frame; the join above produced a separate frame (df1).
df.shape


Out[41]:
(27314, 66)

Filter

Spatial_label == 3


In [42]:
# Keep only the businesses assigned to spatial label 3.
in_label_3 = df1['spatial_label'] == 3
df1 = df1.loc[in_label_3]
df1.shape


Out[42]:
(6464, 67)

Chinese == 2


In [43]:
# Keep only rows whose cuisine_Chinese flag equals 2.
# NOTE(review): the meaning of the value 2 is not shown here - confirm the encoding.
is_chinese = df1['cuisine_Chinese'] == 2
df1 = df1.loc[is_chinese]
df1.shape


Out[43]:
(853, 67)

Stars >= 4


In [44]:
# Keep only businesses rated 4 stars or higher.
well_rated = df1['stars'] >= 4
df1 = df1.loc[well_rated]
df1.shape


Out[44]:
(235, 67)

Anomaly Detection


In [46]:
# Work on a copy so the filtered frame df1 stays available (its `stars`
# column is joined back in a later cell).
df2 = df1.copy()

In [47]:
# List the column names to find the attribute range used by the slice below.
df2.columns.values


Out[47]:
array([u'categories', u'city', u'hours', u'is_open', u'latitude',
       u'longitude', u'name', u'neighborhood', u'postal_code',
       u'review_count', u'stars', u'state', 'AgesAllowed',
       'Ambience_casual', 'Ambience_classy', 'Ambience_divey',
       'Ambience_hipster', 'Ambience_intimate', 'Ambience_romantic',
       'Ambience_touristy', 'Ambience_trendy', 'Ambience_upscale', 'BYOB',
       'BYOBCorkage', 'BestNights_friday', 'BestNights_monday',
       'BestNights_saturday', 'BestNights_sunday', 'BestNights_thursday',
       'BestNights_tuesday', 'BestNights_wednesday', 'BikeParking',
       'BusinessAcceptsCreditCards', 'BusinessParking_garage',
       'BusinessParking_lot', 'BusinessParking_street',
       'BusinessParking_valet', 'BusinessParking_validated', 'Corkage',
       'DietaryRestrictions_dairy-free', 'DietaryRestrictions_gluten-free',
       'DietaryRestrictions_halal', 'DietaryRestrictions_kosher',
       'DietaryRestrictions_soy-free', 'DietaryRestrictions_vegan',
       'DietaryRestrictions_vegetarian', 'GoodForMeal_breakfast',
       'GoodForMeal_brunch', 'GoodForMeal_dessert', 'GoodForMeal_dinner',
       'GoodForMeal_latenight', 'GoodForMeal_lunch', 'HappyHour',
       'Music_background_music', 'Music_dj', 'Music_jukebox',
       'Music_karaoke', 'Music_live', 'Music_no_music', 'Music_video',
       'OutdoorSeating', 'RestaurantsDelivery', 'RestaurantsGoodForGroups',
       'Smoking', 'review_count_greater_median', 'cuisine_Chinese',
       'spatial_label'], dtype=object)

In [48]:
# Keep only the attribute columns. A label slice with .loc includes both
# endpoints, so this drops the business metadata before 'AgesAllowed' and
# the 'cuisine_Chinese' / 'spatial_label' columns after
# 'review_count_greater_median'.
df2 = df2.loc[:,'AgesAllowed':'review_count_greater_median']

In [49]:
# Attribute columns plus the star rating, rebuilt from df1 so this cell can be
# re-run safely: joining `stars` onto a frame that already contains it raises
# an overlapping-columns error.
df2 = df1.loc[:, 'AgesAllowed':'review_count_greater_median'].join(df1.stars)

In [58]:
# Share of non-null values per column; float() keeps true division on Python 2.
non_null_share = df2.count() / float(len(df2))
# Columns that are more than 70% populated are kept.
is_keep = non_null_share > 0.7
is_keep.sum()


Out[58]:
34

In [65]:
# Select, by name, the columns flagged in is_keep (original column order kept).
kept_columns = is_keep[is_keep].index
df2 = df2[kept_columns]

In [66]:
# Check how many rows and columns remain after the completeness filter.
df2.shape


Out[66]:
(235, 34)

In [68]:
# Preview the retained attribute columns (note the remaining NaN values).
df2.head()


Out[68]:
AgesAllowed Ambience_classy Ambience_divey Ambience_hipster Ambience_romantic Ambience_upscale BYOB BYOBCorkage BestNights_sunday BestNights_tuesday ... GoodForMeal_lunch HappyHour Music_no_music Music_video OutdoorSeating RestaurantsDelivery RestaurantsGoodForGroups Smoking review_count_greater_median stars
KeQ1cK564cL5C_hBTFrqnA False full_bar True average casual True True True True False ... False True True False True False False False True 4.0
f5xm2RiwLv0gbmXU4BkrGA False beer_and_wine True average casual NaN True NaN True NaN ... NaN False NaN NaN True NaN NaN NaN False 4.0
q0oPX1DXW86QytTvvrD9MA False full_bar True quiet casual True True False True False ... False True True False True False False True False 4.0
0C6uuZBD7xKS-uucgwoyQw False none False quiet casual True False False True False ... False False False True False False False False True 4.0
mevOvXMnlTLbSl4w8fcG2A False full_bar False average casual True True False True False ... False True True False False False False False False 4.0

5 rows × 34 columns


In [118]:
# Copy df2 as the input for one-hot encoding. Missing values are left as they
# are: an earlier experiment that replaced None with False is not applied.
df3 = df2.copy()

In [119]:
# Dry run: number of columns produced by one-hot encoding every column except
# the last one (selected by position with iloc[:, :-1]).
pd.get_dummies(df3.iloc[:,:-1],drop_first=True, dummy_na=False).columns.shape


Out[119]:
(36,)

In [120]:
# One-hot encode the attribute columns, excluding the star rating by name.
# Encoding from df2 (not from df3 itself) keeps this cell idempotent; the
# positional `df3.iloc[:, :-1]` form dropped one more column on every re-run.
# NOTE: with dummy_na=False a missing value becomes an all-zero row, which
# drop_first=True makes indistinguishable from the dropped first category.
df3 = pd.get_dummies(df2.drop('stars', axis=1), drop_first=True, dummy_na=False)

In [122]:
from sklearn.cluster import KMeans

In [184]:
# Fit k-means on the encoded attributes and print the size of each cluster.
# random_state is fixed so the split is reproducible.
k = 2
mod = KMeans(n_clusters=k, random_state=1).fit(df3)
for i in range(k):
    # Parenthesised single-argument print runs on both Python 2 and Python 3.
    print(sum(mod.labels_ == i))


115
120

In [188]:
from scipy.spatial import distance

In [189]:
def dist2knn(x, nn, k):
    """Sum of Euclidean distances from ``x`` to its nearest rows in ``nn``.

    The ``k + 1`` smallest distances are summed. When ``x`` is itself a row
    of ``nn`` one of those terms is its zero self-distance, so the result is
    the total distance to the ``k`` nearest *other* rows. If ``x`` is not in
    ``nn``, ``k + 1`` genuine neighbours are summed.

    Parameters
    ----------
    x : 1-D numeric sequence (for example one DataFrame row)
        The query point.
    nn : 2-D numeric table (for example a DataFrame)
        Candidate neighbours, one per row, with the same width as ``x``.
    k : int
        Number of neighbours, not counting ``x`` itself.

    Returns
    -------
    float
        Sum of the ``k + 1`` smallest distances (all of them when ``nn`` has
        fewer rows); ``0.0`` when ``nn`` is empty.
    """
    if len(nn) == 0:
        return 0.0
    # A single vectorised call replaces a Python loop that fetched every row
    # with ``.iloc`` and measured it separately.
    dists = distance.cdist([x], nn, metric='euclidean')[0]
    dists.sort()
    return float(sum(dists[:k + 1]))  # +1: the smallest entry is x itself (0)

In [190]:
# Anomaly score per business: summed distance to its nearest neighbours in
# the encoded attribute space (larger = more isolated).
N_NEIGHBORS = 5
n_rows = len(df3)
dist_sum_knn = []
for i in range(n_rows):
    # Single-line progress indicator. sys.stdout.write replaces the
    # Python-2-only `print ...,` statement and works on Python 2 and 3.
    sys.stdout.write('\r{}%'.format(100.0 * (i + 1) / n_rows))
    dist_sum_knn.append(dist2knn(df3.iloc[i, :], df3, N_NEIGHBORS))
sys.stdout.flush()


100.0%

In [221]:
# New frame: the encoded attributes plus each row's kNN distance score.
df4 = df3.assign(dist=dist_sum_knn)

Top 5 Possible Anomalies


In [222]:
# Five businesses with the largest distance score, most isolated first.
ranked_by_dist = df4.sort_values(by='dist', ascending=False)
ranked_by_dist.loc[:, ['dist']].head(5)


Out[222]:
dist
E0iHvHraTa-t6ka9rYL8uQ 12.836233
8xPmlVJy2o6x0J04CBpEMQ 12.836233
td9FZybutwNG7DgocHCiXA 12.836233
bGuxRBRKv7i1BKKfGbjxEw 12.822648
1pttL4MkpxOL6Mj2azOjVQ 12.622812

In [223]:
# Index labels (business ids) of the five highest-scoring rows.
top5_rows = df4.sort_values(by='dist', ascending=False).head(5)
bid_top5 = top5_rows.index.values

In [224]:
# One boolean per row of the full frame `df`: True where the row's index
# label is among the top-5 anomaly candidates.
is_top5 = df.index.isin(bid_top5).tolist()

In [232]:
import geopandas as gpd
import pyproj
from shapely.geometry import Point

In [233]:
# Projection object for EPSG:4326, used below to convert each
# (longitude, latitude) pair before building a Point.
# NOTE(review): the `init='epsg:...'` form is deprecated in pyproj 2+ -
# confirm the installed pyproj version before changing it.
proj = pyproj.Proj(init='epsg:4326', preserve_units=True)
# Wrap the full (unfiltered) business table as a GeoDataFrame; the geometry
# column is attached in a later cell.
gdf = gpd.GeoDataFrame(df)

In [235]:
# One shapely Point per business, built from its projected coordinates.
# Iterating over the two coordinate columns avoids materialising a full row
# Series with `gdf.iloc[i, :]` for each of the ~27k businesses.
geometry = [
    Point(proj(float(lon), float(lat)))
    for lon, lat in zip(gdf['longitude'], gdf['latitude'])
]

In [236]:
# Attach the points as the 'geometry' column (same row order as gdf).
gdf['geometry'] = geometry

In [240]:
# gdf.plot(c=is_top5)