In [1]:
import pandas as pd
import yaml
import numpy as np

Load


In [3]:
with open('./DATA/yelp_dataset_challenge_round9/yelp_academic_dataset_business.json', 'rb') as fi:
    data = fi.readlines()
    
with open("./cat_food.txt", 'rw') as fi:
    food = fi.read().splitlines() 
target_cat = food

Preprocess


In [4]:
data = map(lambda x: x.rstrip(), data)
data_json_str = "[" + ','.join(data) + "]"
df = pd.read_json(data_json_str)
df.reset_index(drop=True, inplace=True)

df_open = df[df.is_open == 1]
df_open.reset_index(drop=True, inplace=True)

df_open = df_open.dropna(subset=['hours'])
df_open.reset_index(drop=True, inplace=True)

df_open = df_open.dropna(subset=['attributes'])
df_open.reset_index(drop=True, inplace=True)


df_open = df_open[df_open.categories.apply(lambda x: not set(x).isdisjoint(set(target_cat)))]
df_open.index = df_open.business_id.values
att = df_open.attributes
a = att.apply(lambda x: yaml.load('['+','.join(x)+']'))
df_open.attributes = a

att_all = set()
for row in a:
    for i in row:
        if not isinstance(i.values()[0], dict):
            att_all.add(i.keys()[0])
        else:
            prefix = i.keys()[0]
            for k in i.values()[0].iterkeys():
                suffix = k
                temp = prefix + '_' + suffix
                att_all.add(temp)
                
tab = pd.DataFrame(columns=att_all, index=df_open.index)
for ind in tab.index:
    for j in a[ind]:
        if not isinstance(j.values()[0], dict):
            tab.loc[ind, j.keys()[0]] = j.values()[0]
        else:
            prefix = j.keys()[0]
            for k, v in j.values()[0].iteritems():
                suffix = k
                temp = prefix + '_' + suffix
                tab.loc[ind, temp] = v                

tab.columns = tab.columns.sort_values()
df_with_attribute = df_open.join(tab)

df_with_attribute.to_pickle("all_cities_preprocess.pkl")

In [73]:
# Be careful! encoding!
# df_with_attribute.to_csv("all_cities_preprocess.csv", encoding='utf-8')



Description


In [74]:
df_with_attribute.shape


Out[74]:
(35877, 97)

In [41]:
df_with_attribute.count().values


Out[41]:
array([35877, 35877, 35877, 35877, 35877, 35877, 35877, 35877, 35877,
       35877, 35877, 35877, 35877, 35877, 35877, 35877,  3649, 24083,
        5026,    37, 25065, 25430, 23536,  4510, 24840,   164,   164,
       33382, 26382, 24557,  4510,   261,  3649, 26980,   891, 30204,
       24557, 24083,  4510, 19581, 24083, 23726,     6, 24557,   164,
         575,     4,  2185, 30322, 24557,     4, 30915, 24557, 25861,
        4876,  3200,   203,     4,   164,  4510,  3649,  5420, 22555,
       24083, 24083, 31268,   432,  4510, 24557, 27793,     6, 34121,
         147,   164, 26428, 15643,  3649,  1259,  3649,     6,  4510,
       31268, 31268,     6,  3649, 24083,  4510,   164, 24557, 31268,
       16426,  4361,  4574,   164, 24557,     6,  3649])

In [53]:
df_with_attribute.tail(2)


Out[53]:
address attributes business_id categories city hours is_open latitude longitude name ... RestaurantsCounterService RestaurantsDelivery RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi
eHLUQ2W_hXx61NmiL9kSVg 18221 N Pima Rd, Ste 140 [{u'Alcohol': u'full_bar'}, {u'Ambience': {u'r... eHLUQ2W_hXx61NmiL9kSVg [Pizza, Italian, Restaurants] Scottsdale [Monday 11:0-0:0, Tuesday 11:0-0:0, Wednesday ... 1 33.651369 -111.886245 Rosati's Pizza ... NaN False False True NaN NaN NaN True NaN NaN
OgwN65jZebPRIPSmNpRP7A 7610 Elmwood Ave [{u'Alcohol': u'none'}, {u'Ambience': {u'roman... OgwN65jZebPRIPSmNpRP7A [Restaurants, Ethnic Food, Chinese, Seafood, S... Middleton [Monday 16:0-20:30, Tuesday 16:0-20:30, Wednes... 1 43.095930 -89.513004 Taigu ... NaN False False NaN NaN NaN NaN True NaN NaN

2 rows × 97 columns

Top100

by # of records


In [71]:
city_n.sort_values(ascending=False)[:20]


Out[71]:
city
Toronto        4565
Las Vegas      4005
Phoenix        2377
Montréal       2124
Charlotte      1740
Pittsburgh     1595
Edinburgh      1361
Scottsdale      995
Cleveland       872
Stuttgart       850
Mississauga     815
Madison         803
Mesa            762
Tempe           648
Henderson       574
Chandler        551
Markham         450
Glendale        445
Gilbert         378
Vaughan         276
dtype: int64

In [46]:
# tab.head().count().values

In [45]:
# att_all

In [42]:
# df_with_attribute.count().values

In [44]:
# df_with_attribute.tail()