Build Table Business vs. Categories


In [2]:
import json
import sqlite3

import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
from collections import defaultdict

import sys  

reload(sys)  
sys.setdefaultencoding('utf8')
############# # Might need to change path to read the file in your local machine business = pd.read_json('yelp_dataset_challenge_round9/yelp_academic_dataset_business.json', \ lines=True, encoding='utf8') business.head(3)

In [8]:
df_c = business[['business_id','categories']]
df_c.head(3)


Out[8]:
business_id categories
0 0DI8Dt2PJp07XkVvIElIcQ [Tobacco Shops, Nightlife, Vape Shops, Shopping]
1 LTlCaCGZE14GuaUXUGbamg [Caterers, Grocery, Food, Event Planning & Ser...
2 EDqCEAGXVGCH4FJXgqtjqg [Restaurants, Pizza, Chicken Wings, Italian]

In [9]:
len(df_c)


Out[9]:
144072

In [10]:
# Build table of business and categories 
def yes_no(df,col): # data, which col to expand(name)
    """
    the function takes a dataframe and column name
    df: dataframe
    col: name of the column containing a list of words(i.e.categories)
    returns a dictionary with 
        keys--all the words; 
        values as 0/1--0 if the business's categories do not include this word; 1 if it does
    """
    
    dic = defaultdict(list) # dictionary whose values are lists.

    for i in range(len(df)):
        #lst = df.iloc[:,col][i].encode('ascii','ignore')[3:-2].split('\', u\'') # convert unicode list to list of strings
        lst_u = df[col][i]   
        
        try:     
            lst = []
            for j in range(len(lst_u)):
                lst.append(str(lst_u[j]))

            for k in dic.keys(): 
                if k in lst:
                    dic[k].append(1)
                elif k not in lst:
                    dic[k].append(0)
                else:
                    pass
            for c in lst:
                if c not in dic.keys():
                    zeros = [0] * i
                    dic[c] = zeros + [1]
        
        except:
            # if a business does not have "categoriy" items, add 0 for all keys
            for k in dic.keys():
                dic[k].append(0)
            continue


    return dic

In [11]:
##!! Caution!! takes some time to run
dic_all = yes_no(df_c, 'categories')

In [12]:
len(df_c)


Out[12]:
144072

In [13]:
df_cat = pd.DataFrame.from_dict(dic_all)
df_cat.index = df_c.business_id
df_cat.head()


Out[13]:
& Probates 3D Printing ATV Rentals/Tours Acai Bowls Accessories Accountants Acne Treatment Active Life Acupuncture Addiction Medicine ... Wine Tours Wineries Wok Women's Clothing Workers Compensation Law Wraps Yelp Events Yoga Ziplining Zoos
business_id
0DI8Dt2PJp07XkVvIElIcQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
LTlCaCGZE14GuaUXUGbamg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
EDqCEAGXVGCH4FJXgqtjqg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
cnGIivYRLxpF7tBVR_JwWA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
cdk-qqJ71q6P7TJTww_DSA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1191 columns


In [14]:
# Check
df_cat.loc['SWkmyhQBV1A7Qu4LRHoVXQ',:][df_cat.loc['SWkmyhQBV1A7Qu4LRHoVXQ',:]==1]


Out[14]:
Acupuncture         1
Beauty & Spas       1
Chiropractors       1
Health & Medical    1
Massage             1
Massage Therapy     1
Physical Therapy    1
Name: SWkmyhQBV1A7Qu4LRHoVXQ, dtype: int64

In [ ]:
# Save as pickle file
df_cat.to_pickle('Yelp_Business_Catergory_Table.p')

Select Categories representing resturants


In [9]:
import pandas as pd
import numpy as np

In [3]:
## load all cat from pickle
df = pd.read_pickle('Yelp_Business_Catergory_Table.p')

In [4]:
df.shape


Out[4]:
(144072, 1191)

In [5]:
for col in df.columns[:5]:
    print col
# pick columns columns_food = [] columns_culture = [] columns_not_sure = [] columns_unrelated = [] for col in df.columns: print (col) food = raw_input('about food? ') if food == 'y': columns_food.append(col) culture = raw_input('about culture? ') if culture == 'y': columns_culture.append(col) elif (food != 'y') & (culture != 'y'): very_sure = raw_input('nothing to do with restuarant ?') if very_sure == 'y': columns_unrelated.append(col) else: columns_not_sure.append(col)

In [11]:
#columns_food

In [7]:
#columns_culture

In [8]:
#columns_unrelated

In [42]:
### adjustment ###
# columns_food.append('___')
# columns_unrelated.remove('____')

In [10]:
# Avoid repetitive words
columns_food= list(set(columns_food))

In [59]:
# Save to text file
thefile = open('cat_unrelated.txt', 'w')

for item in columns_unrelated:
    thefile.write("%s\n" % item)

In [61]:
### might need to extract Cultural words... ###

Cultural Food


In [6]:
df.head(3)


Out[6]:
& Probates 3D Printing ATV Rentals/Tours Acai Bowls Accessories Accountants Acne Treatment Active Life Acupuncture Addiction Medicine ... Wine Tours Wineries Wok Women's Clothing Workers Compensation Law Wraps Yelp Events Yoga Ziplining Zoos
business_id
0DI8Dt2PJp07XkVvIElIcQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
LTlCaCGZE14GuaUXUGbamg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
EDqCEAGXVGCH4FJXgqtjqg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1191 columns


In [7]:
df.shape


Out[7]:
(144072, 1191)

In [34]:
# read saved txt file
list_food = open('cat_food.txt', 'rw').read().split('\n')
list_culture = open('cat_culture.txt', 'rw').read().split('\n')

In [35]:
list_all = list(set(list_food + list_culture))

In [36]:
len(list_food)


Out[36]:
105

In [37]:
len(list_culture)


Out[37]:
107

In [38]:
len(list_all)


Out[38]:
201

In [39]:
df_fc = df.loc[:, df.columns[df.columns.isin(list_all)]]

In [40]:
df_fc['cat_sum'] = df_fc.sum(axis=1)
df_fc['food_sum'] = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_food)]].sum(axis=1)
df_fc['cul_sum'] = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_culture)]].sum(axis=1)

In [15]:
df_fc = df_fc[df_fc['cat_sum'] > 0 ]

In [16]:
df_fc.shape


Out[16]:
(64341, 204)

In [17]:
df_fc.head()


Out[17]:
Acai Bowls Afghan African Alsatian American (New) American (Traditional) Arabian Argentine Armenian Asian Fusion ... Vegan Vegetarian Venezuelan Vietnamese Waffles Whiskey Bars Wine Bars cat_sum food_sum cul_sum
business_id
LTlCaCGZE14GuaUXUGbamg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2 2 0
EDqCEAGXVGCH4FJXgqtjqg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
GDnbt3isfhd57T1QqU6flg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
Nbr0kbtIrVlEcKIZoXWbSw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2 2 0
42romV8altAeuZuP2OC1gw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1

5 rows × 204 columns


In [18]:
#### check those are not labeled as Restaurants
len(df_fc[df_fc['Restaurants']!=1])


Out[18]:
15856

In [19]:
## check if there is non-res with cultral vacabularies ----- only "restuarants" are assigned a cultural vacabulary
df_fc[(df_fc['Restaurants']!=1) & (df_fc['cul_sum']!=0)]


Out[19]:
Acai Bowls Afghan African Alsatian American (New) American (Traditional) Arabian Argentine Armenian Asian Fusion ... Vegan Vegetarian Venezuelan Vietnamese Waffles Whiskey Bars Wine Bars cat_sum food_sum cul_sum
business_id

0 rows × 204 columns


In [ ]:


In [20]:
##### Show all non-"res"
non_restaurants_cat = []
for biz in df_fc[df_fc['Restaurants']!=1].index:
    non_restaurants_cat.append(df_fc.columns[df_fc.ix[biz, :-3] == 1])

In [21]:
pd.DataFrame(non_restaurants_cat).head(5)


Out[21]:
0 1 2 3 4 5 6 7
0 Food Specialty Food None None None None None None
1 Bakeries Food None None None None None None
2 Bakeries Food None None None None None None
3 Sports Clubs None None None None None None None
4 Coffee & Tea Desserts Food None None None None None

In [290]:
#df_fc.loc['XxjrqA5jrzLH0EdCKUG9Mw', :][df_fc.loc['XxjrqA5jrzLH0EdCKUG9Mw', :] == 1]

In [22]:
#################
#### check those are labeled as "Restaurants"
len(df_fc[df_fc['Restaurants']==1])


Out[22]:
48485

In [23]:
df_r = df_fc[df_fc['Restaurants']==1]

In [24]:
df_r.head(5)


Out[24]:
Acai Bowls Afghan African Alsatian American (New) American (Traditional) Arabian Argentine Armenian Asian Fusion ... Vegan Vegetarian Venezuelan Vietnamese Waffles Whiskey Bars Wine Bars cat_sum food_sum cul_sum
business_id
EDqCEAGXVGCH4FJXgqtjqg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
GDnbt3isfhd57T1QqU6flg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
42romV8altAeuZuP2OC1gw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1
DNyYOxVAfu0oUcPNL1ljCQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2 2 0
a1Ba6XeIOP48e64YFD0dMw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 4 0

5 rows × 204 columns


In [68]:
##### Show all "res"
restaurants_cat = []
for biz in df_fc[df_fc['Restaurants']==1].index:
    restaurants_cat.append(df_fc.columns[df_fc.ix[biz, :-3] == 1])

In [69]:
restaurants_cat[:10]


Out[69]:
[Index([u'Chicken Wings', u'Italian', u'Pizza', u'Restaurants'], dtype='object'),
 Index([u'Fast Food', u'Mexican', u'Restaurants', u'Tex-Mex'], dtype='object'),
 Index([u'Barbeque', u'Hawaiian', u'Restaurants'], dtype='object'),
 Index([u'Cafes', u'Restaurants'], dtype='object'),
 Index([u'Breakfast & Brunch', u'Restaurants', u'Salad', u'Sandwiches'], dtype='object'),
 Index([u'Fast Food', u'Mexican', u'Restaurants'], dtype='object'),
 Index([u'Italian', u'Restaurants', u'Seafood'], dtype='object'),
 Index([u'American (Traditional)', u'Mexican', u'Restaurants'], dtype='object'),
 Index([u'Restaurants', u'Soul Food', u'Southern', u'Waffles'], dtype='object'),
 Index([u'Ethnic Food', u'Food', u'Greek', u'Persian/Iranian', u'Restaurants',
        u'Specialty Food'],
       dtype='object')]

In [25]:
##################################
df_fc['Restaurants'][df_fc['Swiss Food'] == 1]


Out[25]:
business_id
GzF4lVSh61sayvwVMp8RmA    1
XPduWZA2t7ELlMHWjv4J9Q    1
Ywz-J95QRVt4t7rOBJl4Hg    1
Name: Restaurants, dtype: int64

In [44]:
df_fc.loc['GzF4lVSh61sayvwVMp8RmA', :][df_fc.loc['GzF4lVSh61sayvwVMp8RmA', :] == 1]


Out[44]:
American (Traditional)    1
Canadian (New)            1
Comfort Food              1
Restaurants               1
Swiss Food                1
Name: GzF4lVSh61sayvwVMp8RmA, dtype: int64

In [26]:
## double check
df_culonly = df_fc[df_fc['food_sum'] == 0 ]
df_culonly


Out[26]:
Acai Bowls Afghan African Alsatian American (New) American (Traditional) Arabian Argentine Armenian Asian Fusion ... Vegan Vegetarian Venezuelan Vietnamese Waffles Whiskey Bars Wine Bars cat_sum food_sum cul_sum
business_id

0 rows × 204 columns


In [27]:
## Take a closer look at the cultural vacab
cultureonly = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_culture)]]
biz_cultureonly = cultureonly[cultureonly.sum(axis=1)>0].index
df_cul = df_fc.loc[biz_cultureonly, :]

In [71]:
df_cul.head()


Out[71]:
Acai Bowls Afghan African Alsatian American (New) American (Traditional) Arabian Argentine Armenian Asian Fusion ... Vegan Vegetarian Venezuelan Vietnamese Waffles Whiskey Bars Wine Bars cat_sum food_sum cul_sum
business_id
EDqCEAGXVGCH4FJXgqtjqg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
GDnbt3isfhd57T1QqU6flg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
42romV8altAeuZuP2OC1gw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1
826djy6K_9Fp0ptqJ2_Yag 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1
Mi5uhdFB9OJteXPd0_IKfw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1

5 rows × 204 columns


In [72]:
(df_cul.food_sum != 0 ).sum() == len(df_cul)


Out[72]:
True

In [28]:
### Examine cultures only
biz_cultures = df_cul[df_cul.cul_sum > 0]
biz_cultures = biz_cultures.loc[:, list_culture + ['cat_sum', 'food_sum','cul_sum']]
biz_cultures.shape


Out[28]:
(31182, 99)

In [ ]:
##############################

In [30]:
biz_cultures.head(10)


Out[30]:
Cantonese Szechuan Pan Asian Shanghainese Hainan Chinese Taiwanese Asian Fusion Salvadoran Slovakian ... Peruvian Baden Sri Lankan Japanese Cambodian Spanish Nicaraguan cat_sum food_sum cul_sum
business_id
EDqCEAGXVGCH4FJXgqtjqg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
GDnbt3isfhd57T1QqU6flg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
42romV8altAeuZuP2OC1gw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1
826djy6K_9Fp0ptqJ2_Yag 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1
Mi5uhdFB9OJteXPd0_IKfw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1
Uxh0fXFH_QQBivRnIBpdiw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 1 2
YPavuOh2XsnRbLfl0DH2lQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
saWZO6hB4B8P-mIzS1--Xw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 6 4 2
vJ-KyaojDQj1R2BNrlDToA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2 1 1
hovoWva_UjbnyLWEbnFvBw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2 1 1

10 rows × 99 columns


In [76]:
lst = biz_cultures.columns[biz_cultures.iloc[5,:-3] == 1].values
lst


Out[76]:
array(['American (Traditional)', 'Mexican'], dtype=object)

In [77]:
lsts_cul = map(lambda biz: biz_cultures.columns[biz_cultures.ix[biz,:-3] == 1].values, biz_cultures.index.values)

In [78]:
### Create culture table
columns = map(lambda c: 'culture_{}'.format(c), range(1, df_cul.cul_sum.max()+1))
df_CULTURE = pd.DataFrame(data=lsts_cul, index = df_cul.index, columns = columns)

In [79]:
df_CULTURE


Out[79]:
culture_1 culture_2 culture_3 culture_4 culture_5 culture_6 culture_7
business_id
EDqCEAGXVGCH4FJXgqtjqg Italian None None None None None None
GDnbt3isfhd57T1QqU6flg Mexican None None None None None None
42romV8altAeuZuP2OC1gw Hawaiian None None None None None None
826djy6K_9Fp0ptqJ2_Yag Mexican None None None None None None
Mi5uhdFB9OJteXPd0_IKfw Italian None None None None None None
Uxh0fXFH_QQBivRnIBpdiw American (Traditional) Mexican None None None None None
YPavuOh2XsnRbLfl0DH2lQ Southern None None None None None None
saWZO6hB4B8P-mIzS1--Xw Persian/Iranian Greek None None None None None
vJ-KyaojDQj1R2BNrlDToA American (Traditional) None None None None None None
hovoWva_UjbnyLWEbnFvBw Thai None None None None None None
93otbGHE0s0m-lU1osvg9w American (New) None None None None None None
L_thK7r3K_h5M4tV7amEKQ Canadian (New) Italian None None None None None
LIvXzdMJrIb6IY5Y3tqicA American (New) None None None None None None
F53MSa5SYzO9BG8c_JhskQ Vietnamese None None None None None None
hEcn9k6ONd5n2mq0lB2aew American (Traditional) None None None None None None
diXqpz9WOr4WLQq6uvARkg Indian None None None None None None
6pSUvtk5-OOaJfX0hbkb8Q Japanese None None None None None None
el3HILPUZ3gxN20wcTJr3A Filipino None None None None None None
hMh9XOwNQcu31NAOCqhAEw Indian None None None None None None
pmJqSsCfgbo3TxPWpQNLIw American (New) None None None None None None
Ja4stXdNYr39u5CZHMNtjw Mexican None None None None None None
st_pnALlGB6OCWkb4e14mg Italian None None None None None None
6TKJDgTrUoD3zu77A4JHnQ American (Traditional) None None None None None None
V94gk_Vx0Ot623UGHG6PrA American (Traditional) Mexican None None None None None
kUUBBLBHCasOl2a5nW9nAw Thai None None None None None None
mRv3Z25F56qduMKnvnCkVA Korean None None None None None None
NmiX_7talK5QTgMKvHPUaQ American (Traditional) None None None None None None
Ir_QIzs-4o9ElOtiGuxJrw Mexican None None None None None None
ODwqVEORhb9YAgo3MP0PXw Korean None None None None None None
h7uxML49NQ_Wj-7zk0DSqg American (Traditional) None None None None None None
... ... ... ... ... ... ... ...
ofXqwIbHprYF6TP47XudOw Mexican None None None None None None
ORloM7z9R9eh-G-an5DQLA American (New) American (Traditional) None None None None None
tquWyW3Mm5ka8LJtzodALg Chinese None None None None None None
ZL_EMDDN4QQ8tm6oXduArg Japanese None None None None None None
ZZ8sAXHb0gX-c81Ea1M3OQ American (New) None None None None None None
7yQMwsuV_8rV-EchuvCG5g Mexican None None None None None None
22RFH-U5gboNJgws_GBG3w American (Traditional) None None None None None None
zIOLoKIOXpGrzndARGhkAw Latin American Mexican None None None None None
9iJMGMsTK-q6W5MB1_Ny3Q Chinese None None None None None None
J6tgF98vwUz4D8T-N72org Middle Eastern None None None None None None
xCTpRfAUp3zrhACPkzxp2Q Japanese None None None None None None
UkoseRjoLT4K2bEzvmyhYA American (Traditional) None None None None None None
w5CSi-An5meLnxjKSFn0wQ Asian Fusion Thai None None None None None
PTPd2xamYtUCH00NqML-YA Chinese None None None None None None
Q3UkgxNNInsPcUFhsQFcrg Chinese None None None None None None
Ty7IyZN721l6VvBfc_zRVQ Asian Fusion None None None None None None
JFnfylrxzH2gAVI_7HJZcQ Japanese None None None None None None
JZE9AfuO_jUBMWzhg-SAcQ Vietnamese None None None None None None
J_PAsINZhwL6J9aV2CZrfw American (Traditional) None None None None None None
GIfZNMP0oIJCje_Xp0Bgrw American (New) None None None None None None
bVgRVpdPgwHBLa7RHb9CpQ French American (New) None None None None None
2hUENHFcBIqop8-uWvrf6A Irish None None None None None None
J1RDyyPxhioqm8c_fi4P4Q American (New) None None None None None None
C_chktl5jFeRqu1C0rmScA Indian None None None None None None
xowy4YxBXXGOYuajsxjwkA American (New) None None None None None None
g9gAuOEXBbWnLlu2uOftwA Indian None None None None None None
-McKyjNSqS1h9dDJH3dyUA German None None None None None None
eHLUQ2W_hXx61NmiL9kSVg Italian None None None None None None
OgwN65jZebPRIPSmNpRP7A Chinese None None None None None None
umBsUDjSPzF28LOw-U_NfA Latin American Mexican None None None None None

31182 rows × 7 columns


In [80]:
df_CULTURE.culture_7.unique()


Out[80]:
array([None, 'Lebanese'], dtype=object)

Combine and Examine individual Cuisine

Chinese food, non-Chinese food, no-labeled


In [32]:
### Dictionary of combined cultures (keep building...)
dic_cuisine = {}

In [41]:
# List of Chinese food
list_chinese = ['Cantonese','Szechuan','Shanghainese','Hainan','Chinese','Taiwanese',\
                'Asian Fusion',\
                'Dim Sum', 'Hot Pot', 'Hong Kong Style Cafe']
dic_cuisine['Chinese'] = list_chinese

In [82]:
biz_cultures.head(3)


Out[82]:
Cantonese Szechuan Pan Asian Shanghainese Hainan Chinese Taiwanese Asian Fusion Salvadoran Slovakian ... Peruvian Baden Sri Lankan Japanese Cambodian Spanish Nicaraguan cat_sum food_sum cul_sum
business_id
EDqCEAGXVGCH4FJXgqtjqg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
GDnbt3isfhd57T1QqU6flg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4 3 1
42romV8altAeuZuP2OC1gw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 2 1

3 rows × 99 columns


In [ ]:


In [ ]: