In [41]:
import pandas as pd
# Load the pickled DataFrame from Yelp_Cuisine_Chinese.pkl (path is relative
# to the notebook's working directory).
# NOTE(review): the directory is spelled 'data_processeing' in the path; it
# must match the name on disk, so do not "correct" it here without renaming
# the folder as well.
df = pd.read_pickle('../yelp-challenge/data_processeing/Yelp_Cuisine_Chinese.pkl')
In [112]:
# Second, independent load of the same pickle file that `df` is read from.
# In the visible cells `d` is never column-sliced; it is the source of
# `review_count` and of the basic business columns used further down.
d = pd.read_pickle('../yelp-challenge/data_processeing/Yelp_Cuisine_Chinese.pkl')
In [39]:
#df = df[df.cuisine_Chinese == 2]
In [42]:
# Display the (rows, columns) dimensions of df.
df.shape
Out[42]:
In [43]:
# Row count per city, largest first; show the five most frequent cities.
df.groupby('city').size().sort_values(ascending=False)[:5]
Out[43]:
In [37]:
#df = df[df.city == 'Toronto']
In [44]:
# Keep only the contiguous run of columns from 'AcceptsInsurance' through
# 'cuisine_Chinese' (label-based .loc slices include both endpoints).
# This rebinds `df`, so the columns outside that range are no longer
# reachable through this name.
df = df.loc[:,'AcceptsInsurance':'cuisine_Chinese']
In [67]:
# Column count of df minus one.
# NOTE(review): presumably the one excluded is 'cuisine_Chinese' — confirm.
len(df.columns) - 1
Out[67]:
In [102]:
# Boolean mask over df's columns (ordered by ascending non-null count):
# True where more than 70% of the rows hold a non-null value.
# float(...) forces true division regardless of interpreter version.
if_col_keep = df.count().sort_values() / float(len(df)) > 0.7
# Number of columns that pass the 70% threshold.
sum(if_col_keep)
Out[102]:
In [103]:
# Non-null counts of the columns selected by the if_col_keep mask,
# in ascending order of count.
col_keep = df.count().sort_values().loc[if_col_keep]
In [104]:
# Collect the distinct prefixes (the text before the first '_') of every
# retained column name, skipping 'cuisine_Chinese' itself.
# sorted() gives a reproducible order; the previous list(set(...)) produced
# an arbitrary order that could change between runs.
col_prefix = sorted({
    c.split('_')[0]
    for c in col_keep.keys().values
    if c != 'cuisine_Chinese'
})
col_prefix
Out[104]:
In [105]:
# Number of distinct column-name prefixes collected in col_prefix.
len(col_prefix)
Out[105]:
In [106]:
# Prefixes to drop from col_prefix before the matching columns are selected.
delete_col = ['HairSpecializesIn']
In [107]:
# Drop the unwanted prefixes by rebuilding the list rather than calling
# list.remove(): remove() raises ValueError when the prefix is absent, which
# also made this cell fail whenever it was executed a second time.
# Filtering is idempotent and tolerates prefixes that are not present.
col_prefix = [prefix for prefix in col_prefix if prefix not in delete_col]
len(col_prefix)
Out[107]:
In [110]:
# Every column of df except the last one whose prefix (text before the
# first '_') is among the retained prefixes, in df's column order.
col_with_prefix = [
    name
    for name in df.columns[:-1]
    if name.split('_')[0] in col_prefix
]
len(col_with_prefix)
Out[110]:
In [113]:
# Work on a copy so that the column added to df1 later does not modify df.
df1 = df.copy()
In [137]:
# Median review_count over df1's rows; review_count is taken from the
# full frame `d` and matched to df1 by index.
md = df1.join(d['review_count'])['review_count'].median()
In [138]:
# Boolean flag: True where the row's review_count (joined in from `d` by
# index) is strictly above the median `md`.
df1['review_count_greater_median'] = (
    df1.join(d['review_count'])['review_count'] > md
)
In [144]:
# Display the (rows, columns) dimensions of df1.
df1.shape
Out[144]:
In [149]:
# Basic business columns taken from the full frame `d`.
df_basic = d[[
    u'categories',
    u'city',
    u'hours',
    u'is_open',
    u'latitude',
    u'longitude',
    u'name',
    u'neighborhood',
    u'postal_code',
    u'review_count',
    u'stars',
    u'state',
]]
In [155]:
# Join (by index) the basic columns with the selected prefixed columns,
# the review-count flag and the 'cuisine_Chinese' column from df1.
df_final = df_basic.join(
    df1[col_with_prefix + ['review_count_greater_median', 'cuisine_Chinese']]
)
In [156]:
# Display the (rows, columns) dimensions of the joined output frame.
df_final.shape
Out[156]:
In [157]:
# Persist the assembled frame as a pickle; the path is relative, so the file
# is written to the current working directory.
df_final.to_pickle("df_1518.pkl")