27314 in total


In [1]:
import pandas as pd
import yaml
import numpy as np

Load


In [3]:
# Disabled raw-data loading step, kept for reference: reads the Yelp business
# file line by line and the list of target food categories from cat_food.txt.
# NOTE(review): 'rw' is not a valid mode for Python 3's open() (it raises
# ValueError); use 'r' if this cell is ever re-enabled.
# with open('./DATA/yelp_dataset_challenge_round9/yelp_academic_dataset_business.json', 'rb') as fi:
#     data = fi.readlines()
    
# with open("./cat_food.txt", 'rw') as fi:
#     food = fi.read().splitlines() 
# target_cat = food

Preprocess


In [4]:
# data = map(lambda x: x.rstrip(), data)
# data_json_str = "[" + ','.join(data) + "]"
# df = pd.read_json(data_json_str)
# df.reset_index(drop=True, inplace=True)

# df_open = df[df.is_open == 1]
# df_open.reset_index(drop=True, inplace=True)

# df_open = df_open.dropna(subset=['hours'])
# df_open.reset_index(drop=True, inplace=True)

# df_open = df_open.dropna(subset=['attributes'])
# df_open.reset_index(drop=True, inplace=True)


# Keep only businesses that share at least one category with target_cat, index
# them by business_id, then parse each row's attribute strings as one YAML list.
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects and is rejected by PyYAML >= 6; prefer yaml.safe_load if this
# cell is re-enabled.
# df_open = df_open[df_open.categories.apply(lambda x: not set(x).isdisjoint(set(target_cat)))]
# df_open.index = df_open.business_id.values
# att = df_open.attributes
# a = att.apply(lambda x: yaml.load('['+','.join(x)+']'))
# df_open.attributes = a

# Collect the set of flattened attribute names: a scalar-valued attribute keeps
# its own key, a dict-valued attribute contributes one "<key>_<subkey>" name per
# sub-key.
# NOTE(review): `.values()[0]`, `.keys()[0]` and `.iterkeys()` only work on
# Python 2 dicts; on Python 3 use next(iter(...)) and plain iteration.
# att_all = set()
# for row in a:
#     for i in row:
#         if not isinstance(i.values()[0], dict):
#             att_all.add(i.keys()[0])
#         else:
#             prefix = i.keys()[0]
#             for k in i.values()[0].iterkeys():
#                 suffix = k
#                 temp = prefix + '_' + suffix
#                 att_all.add(temp)
                
# tab = pd.DataFrame(columns=att_all, index=df_open.index)
# for ind in tab.index:
#     for j in a[ind]:
#         if not isinstance(j.values()[0], dict):
#             tab.loc[ind, j.keys()[0]] = j.values()[0]
#         else:
#             prefix = j.keys()[0]
#             for k, v in j.values()[0].iteritems():
#                 suffix = k
#                 temp = prefix + '_' + suffix
#                 tab.loc[ind, temp] = v                

# Join the flattened attribute table onto the business frame and persist it.
# NOTE(review): assigning `tab.columns.sort_values()` back to `tab.columns`
# only renames the columns in place; it does not move the data with them. Since
# `tab` was built from a set (arbitrary column order), this can attach values to
# the wrong attribute names. To reorder instead of relabel, use something like
# `tab = tab.sort_index(axis=1)`. Verify the saved pickle before trusting the
# attribute columns.
# tab.columns = tab.columns.sort_values()
# df_with_attribute = df_open.join(tab)

# df_with_attribute.to_pickle("all_cities_preprocess.pkl")

In [73]:
# Be careful! encoding!
# df_with_attribute.to_csv("all_cities_preprocess.csv", encoding='utf-8')

The code above is the same as in Old_all_cities_preprocess.ipynb.




In [2]:
# Load the preprocessed business table (with flattened attribute columns) from
# a pickle produced by the preprocessing step.
# NOTE: unpickling executes arbitrary code; only load pickles from a trusted source.
df_with_attribute = pd.read_pickle('../yelp-challenge/data_all_cities/all_cities_preprocess.pkl')

Description


In [3]:
# (rows, columns) of the loaded table.
df_with_attribute.shape


Out[3]:
(27314, 97)

In [4]:
# Non-null count per column, in column order; low counts mark sparsely
# populated columns.
df_with_attribute.count().values


Out[4]:
array([27314, 27314, 27314, 27314, 27314, 27314, 27314, 27314, 27314,
       27314, 27314, 27314, 27314, 27314, 27314, 27314,  2619, 23688,
        3667,     7, 23174, 24032, 22282,  3285, 24840,  3285,   159,
       25460, 25339, 23217,  3285,   261,  2619, 25511,   891, 22907,
       23217, 23688,  3285, 19314, 23688, 21387,     6, 23217,   159,
         575,     4,  1722, 23096, 23217,     4, 26006, 23217, 25259,
         159,  3200,   192,     4,   159,   159,  2619,  4853, 22555,
       23688, 23688, 23590,   432,  3285, 23217, 24864,     6, 26138,
          44,   159, 25242, 14844,  2619,   388,  2619,     6,  3285,
       23590, 23590,     6,  2619, 23688,  3285,  3550, 23217, 23590,
       12867,  3134,  3316,   159, 23217,     6,  2619])

In [5]:
# Peek at the last two rows to eyeball the base columns next to the flattened
# attribute columns.
df_with_attribute.tail(2)


Out[5]:
address attributes business_id categories city hours is_open latitude longitude name ... RestaurantsCounterService RestaurantsDelivery RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi
eHLUQ2W_hXx61NmiL9kSVg 18221 N Pima Rd, Ste 140 [{u'Alcohol': u'full_bar'}, {u'Ambience': {u'r... eHLUQ2W_hXx61NmiL9kSVg [Pizza, Italian, Restaurants] Scottsdale [Monday 11:0-0:0, Tuesday 11:0-0:0, Wednesday ... 1 33.651369 -111.886245 Rosati's Pizza ... NaN False False True NaN NaN NaN True NaN NaN
OgwN65jZebPRIPSmNpRP7A 7610 Elmwood Ave [{u'Alcohol': u'none'}, {u'Ambience': {u'roman... OgwN65jZebPRIPSmNpRP7A [Restaurants, Ethnic Food, Chinese, Seafood, S... Middleton [Monday 16:0-20:30, Tuesday 16:0-20:30, Wednes... 1 43.095930 -89.513004 Taigu ... NaN False False NaN NaN NaN NaN True NaN NaN

2 rows × 97 columns

Top100

By number of records per city


In [13]:
# Number of business records per city (Series indexed by city name).
city_n = df_with_attribute.groupby('city').size()

In [14]:
# The 20 cities with the most records, largest first.
city_n.sort_values(ascending=False).head(20)


Out[14]:
city
Toronto        3429
Las Vegas      2943
Phoenix        1772
Montréal       1644
Charlotte      1306
Pittsburgh     1190
Edinburgh       893
Scottsdale      738
Stuttgart       728
Mississauga     678
Cleveland       671
Madison         602
Mesa            566
Tempe           483
Henderson       424
Chandler        406
Markham         340
Glendale        330
Gilbert         264
Vaughan         235
dtype: int64