In [1]:
import pandas as pd
import yaml
import numpy as np
In [3]:
# NOTE(review): one-off raw-data load, kept commented out for provenance.
# 'rw' is not a valid open() mode for text reading — use 'r' (this cell only
# reads); the json read above uses 'rb' correctly.
# with open('./DATA/yelp_dataset_challenge_round9/yelp_academic_dataset_business.json', 'rb') as fi:
# data = fi.readlines()
# with open("./cat_food.txt", 'rw') as fi:
# food = fi.read().splitlines()
# target_cat = food
In [4]:
# NOTE(review): one-off preprocessing that produced all_cities_preprocess.pkl,
# kept for provenance only. This is Python 2 code and will NOT run under
# Python 3 as written:
#   - `i.values()[0]` / `i.keys()[0]` fail, since dict views are not indexable
#     (use `next(iter(i.values()))` / `next(iter(i))`);
#   - `iterkeys` / `iteritems` were removed (use `.keys()` / `.items()`);
#   - `map(...)` returns a lazy iterator, so `data` is consumed by the join
#     and cannot be reused afterwards.
# NOTE(review): `yaml.load` without an explicit Loader is unsafe on untrusted
# input and raises/warns in modern PyYAML — prefer `yaml.safe_load`.
# data = map(lambda x: x.rstrip(), data)
# data_json_str = "[" + ','.join(data) + "]"
# df = pd.read_json(data_json_str)
# df.reset_index(drop=True, inplace=True)
# df_open = df[df.is_open == 1]
# df_open.reset_index(drop=True, inplace=True)
# df_open = df_open.dropna(subset=['hours'])
# df_open.reset_index(drop=True, inplace=True)
# df_open = df_open.dropna(subset=['attributes'])
# df_open.reset_index(drop=True, inplace=True)
# df_open = df_open[df_open.categories.apply(lambda x: not set(x).isdisjoint(set(target_cat)))]
# df_open.index = df_open.business_id.values
# att = df_open.attributes
# a = att.apply(lambda x: yaml.load('['+','.join(x)+']'))
# df_open.attributes = a
# att_all = set()
# for row in a:
# for i in row:
# if not isinstance(i.values()[0], dict):
# att_all.add(i.keys()[0])
# else:
# prefix = i.keys()[0]
# for k in i.values()[0].iterkeys():
# suffix = k
# temp = prefix + '_' + suffix
# att_all.add(temp)
# tab = pd.DataFrame(columns=att_all, index=df_open.index)
# for ind in tab.index:
# for j in a[ind]:
# if not isinstance(j.values()[0], dict):
# tab.loc[ind, j.keys()[0]] = j.values()[0]
# else:
# prefix = j.keys()[0]
# for k, v in j.values()[0].iteritems():
# suffix = k
# temp = prefix + '_' + suffix
# tab.loc[ind, temp] = v
# tab.columns = tab.columns.sort_values()
# df_with_attribute = df_open.join(tab)
# df_with_attribute.to_pickle("all_cities_preprocess.pkl")
In [73]:
# Be careful! encoding! — business names/addresses contain non-ASCII text,
# so the export must specify encoding='utf-8' explicitly.
# df_with_attribute.to_csv("all_cities_preprocess.csv", encoding='utf-8')
In [2]:
# Load the preprocessed businesses frame (produced by the commented-out
# pipeline above). NOTE(review): path points into a sibling checkout — confirm
# it exists; unpickling can execute arbitrary code, so only load trusted files.
df_with_attribute = pd.read_pickle('../yelp-challenge/data_all_cities/all_cities_preprocess.pkl')
In [3]:
# (n_rows, n_columns) of the loaded frame.
df_with_attribute.shape
Out[3]:
In [4]:
# Non-null count per column, as a plain numpy array (column labels dropped).
df_with_attribute.count().values
Out[4]:
In [5]:
# Peek at the last two rows to sanity-check the loaded data.
df_with_attribute.tail(2)
Out[5]:
In [13]:
# Businesses per city: group by the "city" column label rather than passing
# the Series itself — same grouping, reads more directly.
city_n = df_with_attribute.groupby("city").size()
In [14]:
# Top 20 cities by business count (descending).
city_n.sort_values(ascending=False).head(20)
Out[14]: