In [1]:

    
import pandas as pd
import yaml
import numpy as np

Load



In [3]:

    
with open('./DATA/yelp_dataset_challenge_round9/yelp_academic_dataset_business.json', 'rb') as fi:
    data = fi.readlines()
    
with open("./cat_food.txt", 'rw') as fi:
    food = fi.read().splitlines() 
target_cat = food

Preprocess



In [4]:

    
data = map(lambda x: x.rstrip(), data)
data_json_str = "[" + ','.join(data) + "]"
df = pd.read_json(data_json_str)
df.reset_index(drop=True, inplace=True)

df_open = df[df.is_open == 1]
df_open.reset_index(drop=True, inplace=True)

df_open = df_open.dropna(subset=['hours'])
df_open.reset_index(drop=True, inplace=True)

df_open = df_open.dropna(subset=['attributes'])
df_open.reset_index(drop=True, inplace=True)


df_open = df_open[df_open.categories.apply(lambda x: not set(x).isdisjoint(set(target_cat)))]
df_open.index = df_open.business_id.values
att = df_open.attributes
a = att.apply(lambda x: yaml.load('['+','.join(x)+']'))
df_open.attributes = a

att_all = set()
for row in a:
    for i in row:
        if not isinstance(i.values()[0], dict):
            att_all.add(i.keys()[0])
        else:
            prefix = i.keys()[0]
            for k in i.values()[0].iterkeys():
                suffix = k
                temp = prefix + '_' + suffix
                att_all.add(temp)
                
tab = pd.DataFrame(columns=att_all, index=df_open.index)
for ind in tab.index:
    for j in a[ind]:
        if not isinstance(j.values()[0], dict):
            tab.loc[ind, j.keys()[0]] = j.values()[0]
        else:
            prefix = j.keys()[0]
            for k, v in j.values()[0].iteritems():
                suffix = k
                temp = prefix + '_' + suffix
                tab.loc[ind, temp] = v                

tab.columns = tab.columns.sort_values()
df_with_attribute = df_open.join(tab)

df_with_attribute.to_pickle("all_cities_preprocess.pkl")



In [73]:

    
# Be careful! encoding!
# df_with_attribute.to_csv("all_cities_preprocess.csv", encoding='utf-8')

Description



In [74]:

    
df_with_attribute.shape









    Out[74]:





(35877, 97)



In [41]:

    
df_with_attribute.count().values









    Out[41]:





array([35877, 35877, 35877, 35877, 35877, 35877, 35877, 35877, 35877,
       35877, 35877, 35877, 35877, 35877, 35877, 35877,  3649, 24083,
        5026,    37, 25065, 25430, 23536,  4510, 24840,   164,   164,
       33382, 26382, 24557,  4510,   261,  3649, 26980,   891, 30204,
       24557, 24083,  4510, 19581, 24083, 23726,     6, 24557,   164,
         575,     4,  2185, 30322, 24557,     4, 30915, 24557, 25861,
        4876,  3200,   203,     4,   164,  4510,  3649,  5420, 22555,
       24083, 24083, 31268,   432,  4510, 24557, 27793,     6, 34121,
         147,   164, 26428, 15643,  3649,  1259,  3649,     6,  4510,
       31268, 31268,     6,  3649, 24083,  4510,   164, 24557, 31268,
       16426,  4361,  4574,   164, 24557,     6,  3649])



In [53]:

    
df_with_attribute.tail(2)









    Out[53]:






  
    
      
      address
      attributes
      business_id
      categories
      city
      hours
      is_open
      latitude
      longitude
      name
      ...
      RestaurantsCounterService
      RestaurantsDelivery
      RestaurantsGoodForGroups
      RestaurantsPriceRange2
      RestaurantsReservations
      RestaurantsTableService
      RestaurantsTakeOut
      Smoking
      WheelchairAccessible
      WiFi
    
  
  
    
      eHLUQ2W_hXx61NmiL9kSVg
      18221 N Pima Rd, Ste 140
      [{u'Alcohol': u'full_bar'}, {u'Ambience': {u'r...
      eHLUQ2W_hXx61NmiL9kSVg
      [Pizza, Italian, Restaurants]
      Scottsdale
      [Monday 11:0-0:0, Tuesday 11:0-0:0, Wednesday ...
      1
      33.651369
      -111.886245
      Rosati's Pizza
      ...
      NaN
      False
      False
      True
      NaN
      NaN
      NaN
      True
      NaN
      NaN
    
    
      OgwN65jZebPRIPSmNpRP7A
      7610 Elmwood Ave
      [{u'Alcohol': u'none'}, {u'Ambience': {u'roman...
      OgwN65jZebPRIPSmNpRP7A
      [Restaurants, Ethnic Food, Chinese, Seafood, S...
      Middleton
      [Monday 16:0-20:30, Tuesday 16:0-20:30, Wednes...
      1
      43.095930
      -89.513004
      Taigu
      ...
      NaN
      False
      False
      NaN
      NaN
      NaN
      NaN
      True
      NaN
      NaN
    
  

2 rows × 97 columns

Top100

by # of records



In [71]:

    
city_n.sort_values(ascending=False)[:20]









    Out[71]:





city
Toronto        4565
Las Vegas      4005
Phoenix        2377
Montréal       2124
Charlotte      1740
Pittsburgh     1595
Edinburgh      1361
Scottsdale      995
Cleveland       872
Stuttgart       850
Mississauga     815
Madison         803
Mesa            762
Tempe           648
Henderson       574
Chandler        551
Markham         450
Glendale        445
Gilbert         378
Vaughan         276
dtype: int64



In [46]:

    
# tab.head().count().values



In [45]:

    
# att_all



In [42]:

    
# df_with_attribute.count().values



In [44]:

    
# df_with_attribute.tail()

	address	attributes	business_id	categories	city	hours	is_open	latitude	longitude	name	...	RestaurantsCounterService	RestaurantsDelivery	RestaurantsGoodForGroups	RestaurantsPriceRange2	RestaurantsReservations	RestaurantsTableService	RestaurantsTakeOut	Smoking	WheelchairAccessible	WiFi
eHLUQ2W_hXx61NmiL9kSVg	18221 N Pima Rd, Ste 140	[{u'Alcohol': u'full_bar'}, {u'Ambience': {u'r...	eHLUQ2W_hXx61NmiL9kSVg	[Pizza, Italian, Restaurants]	Scottsdale	[Monday 11:0-0:0, Tuesday 11:0-0:0, Wednesday ...	1	33.651369	-111.886245	Rosati's Pizza	...	NaN	False	False	True	NaN	NaN	NaN	True	NaN	NaN
OgwN65jZebPRIPSmNpRP7A	7610 Elmwood Ave	[{u'Alcohol': u'none'}, {u'Ambience': {u'roman...	OgwN65jZebPRIPSmNpRP7A	[Restaurants, Ethnic Food, Chinese, Seafood, S...	Middleton	[Monday 16:0-20:30, Tuesday 16:0-20:30, Wednes...	1	43.095930	-89.513004	Taigu	...	NaN	False	False	NaN	NaN	NaN	NaN	True	NaN	NaN