In [1]:
import pandas as pd
import yaml
import numpy as np
In [3]:
# Load the raw inputs used by the preprocessing cell.
#
# `data` is the business file as a list of raw lines (binary mode, trailing
# newlines still attached); the lines are later joined into one JSON array.
with open('./DATA/yelp_dataset_challenge_round9/yelp_academic_dataset_business.json', 'rb') as fi:
    data = fi.readlines()

# `food` is the list of category names, one per line of cat_food.txt.
# The file is only read here, so it is opened with 'r'. The former 'rw' is not
# a valid mode string: Python 3 raises ValueError for it.
with open("./cat_food.txt", 'r') as fi:
    food = fi.read().splitlines()

# Categories of interest: a business is kept when it has at least one of them.
target_cat = food
In [4]:
def _flatten_attributes(entries):
    """Flatten one business's parsed attribute list into a single flat dict.

    `entries` is the list produced by parsing the business's attribute strings
    as a YAML flow sequence; each element is a mapping of attribute name to
    value. Scalar values are stored under the attribute name. Values that are
    themselves mappings are expanded to one key per sub-entry, named
    ``<attribute>_<sub_key>``.
    """
    flat = {}
    for entry in entries:
        for name, value in entry.items():
            if isinstance(value, dict):
                for sub_name, sub_value in value.items():
                    flat[name + '_' + sub_name] = sub_value
            else:
                flat[name] = value
    return flat


def _has_target_category(categories):
    """Return True when `categories` shares at least one entry with target_cat.

    A missing or non-list `categories` value counts as "no match" instead of
    raising.
    """
    if not isinstance(categories, (list, tuple)):
        return False
    return not _target_categories.isdisjoint(categories)


# --- Parse the raw lines into a DataFrame -----------------------------------
# `data` is left untouched so this cell can be re-run. Lines are stripped,
# blank lines are dropped, and bytes (the file was opened in 'rb') are decoded
# so that the join below also works on Python 3.
json_lines = [line.rstrip() for line in data]
json_lines = [
    line.decode('utf-8') if isinstance(line, bytes) else line
    for line in json_lines
    if line
]
data_json_str = "[" + ','.join(json_lines) + "]"
# Wrapped in StringIO: newer pandas no longer accepts a literal JSON string.
df = pd.read_json(io.StringIO(data_json_str))

# --- Keep open businesses with hours, attributes and a target category ------
_target_categories = set(target_cat)
df_open = df[df.is_open == 1].dropna(subset=['hours', 'attributes'])
df_open = df_open[df_open.categories.apply(_has_target_category).astype(bool)].copy()
df_open.index = df_open.business_id.values

# --- Parse the attribute strings --------------------------------------------
# Each business carries a list of "name: value" strings; joined inside [...]
# they form a YAML flow sequence of single-pair mappings.
# NOTE(security): the attributes come from an external dataset file, so
# yaml.safe_load is used. Plain yaml.load without a Loader can construct
# arbitrary Python objects and is rejected by recent PyYAML releases.
att = df_open.attributes
a = att.apply(lambda entries: yaml.safe_load('[' + ','.join(entries) + ']'))
df_open['attributes'] = a

# --- Build the wide attribute table -----------------------------------------
# One flat record per business, in the same order as df_open's rows.
records = [_flatten_attributes(entries) for entries in a]

att_all = set()
for record in records:
    att_all.update(record)

# Build the table once from the records, with columns already in sorted order.
# (Previously the column labels were sorted *after* filling, which renamed the
# columns without moving their data, and the table was filled cell by cell.)
# dtype=object keeps the mixed attribute values as they were parsed; cells for
# attributes a business does not have are left as NaN.
tab = pd.DataFrame(records, index=df_open.index, columns=sorted(att_all), dtype=object)

df_with_attribute = df_open.join(tab)
df_with_attribute.to_pickle("all_cities_preprocess.pkl")
In [73]:
# Be careful! encoding!
# df_with_attribute.to_csv("all_cities_preprocess.csv", encoding='utf-8')
In [74]:
# Dimensions (rows, columns) of the joined business/attribute table.
df_with_attribute.shape
Out[74]:
In [41]:
# Number of non-null values in each column, shown as a plain array.
df_with_attribute.count().values
Out[41]:
In [53]:
# Preview the last two rows of the joined table.
df_with_attribute.tail(2)
Out[53]:
In [71]:
# Show the first 20 entries of `city_n` after sorting in descending order.
# NOTE(review): `city_n` is not defined in any visible cell, so this line fails
# on a fresh kernel (Restart & Run All). Presumably it is a per-city count
# Series derived from df_with_attribute -- confirm and restore its defining cell.
city_n.sort_values(ascending=False)[:20]
Out[71]:
In [46]:
# tab.head().count().values
In [45]:
# att_all
In [42]:
# df_with_attribute.count().values
In [44]:
# df_with_attribute.tail()