In [41]:
import pandas as pd
# Load the pickled frame that the rest of the notebook works on.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle('../yelp-challenge/data_processeing/Yelp_Cuisine_Chinese.pkl')

In [112]:
# Second, independent load of the same pickle: `df` is later narrowed to a
# column slice, while `d` keeps every original column (review_count, city, ...)
# for the joins further down.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
d = pd.read_pickle('../yelp-challenge/data_processeing/Yelp_Cuisine_Chinese.pkl')

In [39]:
# Disabled filter: would keep only the rows where cuisine_Chinese == 2.
#df = df[df.cuisine_Chinese == 2]

In [42]:
# (rows, columns) of the frame as loaded.
df.shape


Out[42]:
(27314, 98)

In [43]:
# Row count per city, largest first, limited to the first five entries.
(
    df.groupby('city')
      .size()
      .sort_values(ascending=False)
      .head(5)
)


Out[43]:
city
Toronto      3429
Las Vegas    2943
Phoenix      1772
Montréal     1644
Charlotte    1306
dtype: int64

In [37]:
# Disabled filter: would keep only the rows where city == 'Toronto'.
#df = df[df.city == 'Toronto']

In [44]:
# Keep only the contiguous run of columns from 'AcceptsInsurance' through
# 'cuisine_Chinese'; label slices in .loc include both endpoints.
df = df.loc[:, slice('AcceptsInsurance', 'cuisine_Chinese')]

In [67]:
# Column count minus one (presumably discounting the trailing
# 'cuisine_Chinese' column -- confirm).
len(df.columns) - 1


Out[67]:
81

0.8


In [102]:
# Boolean mask over the columns (ordered by ascending non-null count):
# True where more than 70% of the rows hold a non-null value.
if_col_keep = (df.count().sort_values() * 1.0 / len(df)) > 0.7
# Number of columns that pass the threshold.
if_col_keep.sum()


Out[102]:
36

In [103]:
# Non-null counts of just the columns that passed the coverage mask,
# in the same ascending-count order the mask was built in.
col_keep = df.count().sort_values().loc[if_col_keep]

In [104]:
# Distinct prefixes (the text before the first '_') of every column in
# col_keep, leaving out the 'cuisine_Chinese' column itself.
# The result is sorted so the displayed list is reproducible: the iteration
# order of a set of strings is arbitrary and may differ between sessions.
col_prefix = sorted({
    c.split('_')[0]
    for c in col_keep.index
    if c != 'cuisine_Chinese'
})
col_prefix


Out[104]:
['BYOB',
 'GoodForMeal',
 'BusinessParking',
 'RestaurantsDelivery',
 'Smoking',
 'OutdoorSeating',
 'HairSpecializesIn',
 'BusinessAcceptsCreditCards',
 'AgesAllowed',
 'Music',
 'BikeParking',
 'Corkage',
 'Ambience',
 'BestNights',
 'DietaryRestrictions',
 'RestaurantsGoodForGroups',
 'HappyHour',
 'BYOBCorkage']

In [105]:
# Number of distinct prefixes collected so far.
len(col_prefix)


Out[105]:
18

In [106]:
# Prefixes to be removed from col_prefix in the next cell.
delete_col = ['HairSpecializesIn']

In [107]:
# Drop the unwanted prefixes by rebuilding the list instead of calling
# list.remove(): remove() raises ValueError once the entry is already gone,
# so re-running this cell would fail on its second execution.
col_prefix = [p for p in col_prefix if p not in delete_col]
len(col_prefix)


Out[107]:
17

In [110]:
# Every column except the last one whose prefix (the text before the
# first '_') is one of the prefixes held in col_prefix.
col_with_prefix = [
    name
    for name in df.columns[:-1]
    if name.split('_')[0] in col_prefix
]
len(col_with_prefix)


Out[110]:
52

Any other attributes?


In [113]:
# Work on an independent (deep) copy so the new column added below does not
# touch df.
df1 = df.copy(deep=True)

In [137]:
# Median of review_count after index-aligning d's review_count onto df1.
md = df1.join(d['review_count'])['review_count'].median()

In [138]:
# Flag rows whose review_count (joined in from d by index) is strictly above
# the median computed earlier.
df1['review_count_greater_median'] = (
    df1.join(d['review_count'])['review_count'].gt(md)
)

Join basic columns


In [144]:
# (rows, columns) of df1 after the review_count_greater_median column was added.
df1.shape


Out[144]:
(27314, 83)

In [149]:
# Descriptive business columns taken from the untouched frame `d`; the selected
# attribute columns are joined onto these in the next step.
df_basic = d[[u'categories', u'city', u'hours', u'is_open', u'latitude', u'longitude', u'name', u'neighborhood', u'postal_code', u'review_count', u'stars', u'state']]

In [155]:
# Index-aligned left join of the basic columns with the selected attribute
# columns, the median flag and the cuisine_Chinese column from df1.
df_final = df_basic.join(
    df1[col_with_prefix + ['review_count_greater_median', 'cuisine_Chinese']],
    how='left',
)

In [156]:
# (rows, columns) of the assembled output frame.
df_final.shape


Out[156]:
(27314, 66)

In [157]:
# Persist the assembled frame as a pickle in the current working directory.
df_final.to_pickle("df_1518.pkl")