In [1]:
import pandas as pd
import yaml
import numpy as np
In [3]:
# NOTE(review): one-off raw-data load, kept commented out for provenance.
# 'rw' is not a valid open() mode for text reading — use 'r' (this cell only
# reads); the json read above uses 'rb' correctly.
# with open('./DATA/yelp_dataset_challenge_round9/yelp_academic_dataset_business.json', 'rb') as fi:
# data = fi.readlines()
# with open("./cat_food.txt", 'rw') as fi:
# food = fi.read().splitlines()
# target_cat = food
In [4]:
# NOTE(review): one-off preprocessing that produced all_cities_preprocess.pkl,
# kept for provenance only. This is Python 2 code and will NOT run under
# Python 3 as written:
#   - `i.values()[0]` / `i.keys()[0]` fail, since dict views are not indexable
#     (use `next(iter(i.values()))` / `next(iter(i))`);
#   - `iterkeys` / `iteritems` were removed (use `.keys()` / `.items()`);
#   - `map(...)` returns a lazy iterator, so `data` is consumed by the join
#     and cannot be reused afterwards.
# NOTE(review): `yaml.load` without an explicit Loader is unsafe on untrusted
# input and raises/warns in modern PyYAML — prefer `yaml.safe_load`.
# data = map(lambda x: x.rstrip(), data)
# data_json_str = "[" + ','.join(data) + "]"
# df = pd.read_json(data_json_str)
# df.reset_index(drop=True, inplace=True)
# df_open = df[df.is_open == 1]
# df_open.reset_index(drop=True, inplace=True)
# df_open = df_open.dropna(subset=['hours'])
# df_open.reset_index(drop=True, inplace=True)
# df_open = df_open.dropna(subset=['attributes'])
# df_open.reset_index(drop=True, inplace=True)
# df_open = df_open[df_open.categories.apply(lambda x: not set(x).isdisjoint(set(target_cat)))]
# df_open.index = df_open.business_id.values
# att = df_open.attributes
# a = att.apply(lambda x: yaml.load('['+','.join(x)+']'))
# df_open.attributes = a
# att_all = set()
# for row in a:
# for i in row:
# if not isinstance(i.values()[0], dict):
# att_all.add(i.keys()[0])
# else:
# prefix = i.keys()[0]
# for k in i.values()[0].iterkeys():
# suffix = k
# temp = prefix + '_' + suffix
# att_all.add(temp)
# tab = pd.DataFrame(columns=att_all, index=df_open.index)
# for ind in tab.index:
# for j in a[ind]:
# if not isinstance(j.values()[0], dict):
# tab.loc[ind, j.keys()[0]] = j.values()[0]
# else:
# prefix = j.keys()[0]
# for k, v in j.values()[0].iteritems():
# suffix = k
# temp = prefix + '_' + suffix
# tab.loc[ind, temp] = v
# tab.columns = tab.columns.sort_values()
# df_with_attribute = df_open.join(tab)
# df_with_attribute.to_pickle("all_cities_preprocess.pkl")
In [73]:
# Be careful! encoding! — business names/addresses contain non-ASCII text,
# so the export must specify encoding='utf-8' explicitly.
# df_with_attribute.to_csv("all_cities_preprocess.csv", encoding='utf-8')
In [2]:
# Load the preprocessed businesses frame (produced by the commented-out
# pipeline above). NOTE(review): path points into a sibling checkout — confirm
# it exists; unpickling can execute arbitrary code, so only load trusted files.
df_with_attribute = pd.read_pickle('../yelp-challenge/data_all_cities/all_cities_preprocess.pkl')
In [3]:
# (n_rows, n_columns) of the loaded frame.
df_with_attribute.shape
Out[3]:
In [4]:
# Non-null count per column, as a plain numpy array (column labels dropped).
df_with_attribute.count().values
Out[4]:
In [5]:
# Peek at the last two rows to sanity-check the loaded data.
df_with_attribute.tail(2)
Out[5]:
In [13]:
# Businesses per city: group by the "city" column label rather than passing
# the Series itself — same grouping, reads more directly.
city_n = df_with_attribute.groupby("city").size()
In [14]:
# Top 20 cities by business count (descending).
city_n.sort_values(ascending=False).head(20)
Out[14]: