notebook.community

Edit and run



In [41]:

    
import pandas as pd
# Load the pickled frame that the rest of the notebook filters and reshapes.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../yelp-challenge/data_processeing/Yelp_Cuisine_Chinese.pkl')



In [112]:

    
# Second, untouched load of the same pickle. `df` is narrowed to a column
# slice further down, so `d` keeps every column available for the later
# review_count join and the basic business columns.
d = pd.read_pickle('../yelp-challenge/data_processeing/Yelp_Cuisine_Chinese.pkl')



In [39]:

    
#df = df[df.cuisine_Chinese == 2]



In [42]:

    
# Show the (rows, columns) dimensions of the loaded frame.
df.shape









    Out[42]:





(27314, 98)



In [43]:

    
# Row counts per city, largest first, limited to the top five.
(
    df.groupby(df['city'])
      .size()
      .sort_values(ascending=False)
      .head(5)
)









    Out[43]:





city
Toronto      3429
Las Vegas    2943
Phoenix      1772
Montréal     1644
Charlotte    1306
dtype: int64



In [37]:

    
#df = df[df.city == 'Toronto']



In [44]:

    
# Keep only the contiguous run of columns from 'AcceptsInsurance' through
# 'cuisine_Chinese' (label slices in .loc include both endpoints).
df = df.loc[:, 'AcceptsInsurance':'cuisine_Chinese']



In [67]:

    
# Column count of the sliced frame, not counting one column
# (the slice ends with 'cuisine_Chinese').
len(df.columns) - 1









    Out[67]:





81

0.8



In [102]:

    
# Boolean mask over the columns of `df`: True where more than 70% of the rows
# hold a non-null value. `df.count()` gives the non-null count per column;
# dividing by a float keeps this a true division on Python 2 as well.
# NOTE(review): a nearby notes cell reads 0.8 while this cell applies 0.7 --
# confirm the intended cutoff.
if_col_keep = df.count().sort_values() / float(len(df)) > 0.7

# Number of columns that pass the cutoff (vectorised Series.sum rather than
# the builtin sum, which would iterate element by element in Python).
if_col_keep.sum()









    Out[102]:





36



In [103]:

    
# Non-null counts (ascending) restricted to the columns flagged in if_col_keep.
col_keep = df.count().sort_values().loc[if_col_keep]



In [104]:

    
# Distinct prefixes (the text before the first underscore) of the retained
# column names; the 'cuisine_Chinese' column itself is left out.
col_prefix = list({
    name.split('_')[0]
    for name in col_keep.index
    if name != 'cuisine_Chinese'
})
col_prefix









    Out[104]:





['BYOB',
 'GoodForMeal',
 'BusinessParking',
 'RestaurantsDelivery',
 'Smoking',
 'OutdoorSeating',
 'HairSpecializesIn',
 'BusinessAcceptsCreditCards',
 'AgesAllowed',
 'Music',
 'BikeParking',
 'Corkage',
 'Ambience',
 'BestNights',
 'DietaryRestrictions',
 'RestaurantsGoodForGroups',
 'HappyHour',
 'BYOBCorkage']



In [105]:

    
# How many distinct prefixes are currently held in col_prefix.
len(col_prefix)









    Out[105]:





18



In [106]:

    
# Prefixes picked by hand for removal from col_prefix.
delete_col = [
    'HairSpecializesIn',
]



In [107]:

    
# Drop the hand-picked prefixes from col_prefix.
# The list is rebuilt by filtering rather than calling list.remove() so the
# cell can be re-run safely: list.remove() raises ValueError once the prefix
# has already been removed by a previous execution.
col_prefix = [prefix for prefix in col_prefix if prefix not in delete_col]
len(col_prefix)









    Out[107]:





17



In [110]:

    
# Every column of df except the last one whose prefix (the text before the
# first underscore) is one of the prefixes kept in col_prefix.
col_with_prefix = [
    column
    for column in df.columns[:-1]
    if column.split('_')[0] in col_prefix
]
len(col_with_prefix)









    Out[110]:





52

Any Other Attribute?



In [113]:

    
# Work on an independent deep copy so columns added to df1 do not touch df.
df1 = df.copy(deep=True)



In [137]:

    
# Median review_count over the rows of df1; review_count is taken from the
# full frame `d` and joined in on the index first.
md = df1.join(d['review_count'])['review_count'].median()



In [138]:

    
# Flag rows whose review_count (joined in from `d`) is strictly above the
# median stored in `md`.
df1['review_count_greater_median'] = (
    df1.join(d['review_count'])['review_count'].gt(md)
)

Join basic columns



In [144]:

    
# Show the (rows, columns) dimensions of df1 after adding the median flag.
df1.shape









    Out[144]:





(27314, 83)



In [149]:

    
# General business columns, selected from the full frame `d`.
df_basic = d.loc[:, [
    u'categories',
    u'city',
    u'hours',
    u'is_open',
    u'latitude',
    u'longitude',
    u'name',
    u'neighborhood',
    u'postal_code',
    u'review_count',
    u'stars',
    u'state',
]]



In [155]:

    
# Attach the selected attribute columns, the median flag and the
# cuisine_Chinese column to the basic business columns (index-aligned join).
df_final = df_basic.join(
    df1[col_with_prefix + ['review_count_greater_median', 'cuisine_Chinese']]
)



In [156]:

    
# Show the (rows, columns) dimensions of the joined frame.
df_final.shape









    Out[156]:





(27314, 66)



In [157]:

    
# Persist the joined frame as a pickle; the relative path resolves against
# the kernel's current working directory.
df_final.to_pickle("df_1518.pkl")

0.8

Delete unrelated prefixes by hand

Any Other Attribute?

Join basic columns