Build Table Business vs. Categories



In [2]:

    
import json
import sqlite3

import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
from collections import defaultdict

import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

############# # Might need to change path to read the file in your local machine business = pd.read_json('yelp_dataset_challenge_round9/yelp_academic_dataset_business.json', \ lines=True, encoding='utf8') business.head(3)



In [8]:

    
df_c = business[['business_id','categories']]
df_c.head(3)









    Out[8]:






  
    
      
      business_id
      categories
    
  
  
    
      0
      0DI8Dt2PJp07XkVvIElIcQ
      [Tobacco Shops, Nightlife, Vape Shops, Shopping]
    
    
      1
      LTlCaCGZE14GuaUXUGbamg
      [Caterers, Grocery, Food, Event Planning & Ser...
    
    
      2
      EDqCEAGXVGCH4FJXgqtjqg
      [Restaurants, Pizza, Chicken Wings, Italian]



In [9]:

    
len(df_c)









    Out[9]:





144072



In [10]:

    
# Build table of business and categories 
def yes_no(df,col): # data, which col to expand(name)
    """
    the function takes a dataframe and column name
    df: dataframe
    col: name of the column containing a list of words(i.e.categories)
    returns a dictionary with 
        keys--all the words; 
        values as 0/1--0 if the business's categories do not include this word; 1 if it does
    """
    
    dic = defaultdict(list) # dictionary whose values are lists.

    for i in range(len(df)):
        #lst = df.iloc[:,col][i].encode('ascii','ignore')[3:-2].split('\', u\'') # convert unicode list to list of strings
        lst_u = df[col][i]   
        
        try:     
            lst = []
            for j in range(len(lst_u)):
                lst.append(str(lst_u[j]))

            for k in dic.keys(): 
                if k in lst:
                    dic[k].append(1)
                elif k not in lst:
                    dic[k].append(0)
                else:
                    pass
            for c in lst:
                if c not in dic.keys():
                    zeros = [0] * i
                    dic[c] = zeros + [1]
        
        except:
            # if a business does not have "categoriy" items, add 0 for all keys
            for k in dic.keys():
                dic[k].append(0)
            continue


    return dic



In [11]:

    
##!! Caution!! takes some time to run
dic_all = yes_no(df_c, 'categories')



In [12]:

    
len(df_c)









    Out[12]:





144072



In [13]:

    
df_cat = pd.DataFrame.from_dict(dic_all)
df_cat.index = df_c.business_id
df_cat.head()









    Out[13]:






  
    
      
      & Probates
      3D Printing
      ATV Rentals/Tours
      Acai Bowls
      Accessories
      Accountants
      Acne Treatment
      Active Life
      Acupuncture
      Addiction Medicine
      ...
      Wine Tours
      Wineries
      Wok
      Women's Clothing
      Workers Compensation Law
      Wraps
      Yelp Events
      Yoga
      Ziplining
      Zoos
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0DI8Dt2PJp07XkVvIElIcQ
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      LTlCaCGZE14GuaUXUGbamg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      EDqCEAGXVGCH4FJXgqtjqg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      cnGIivYRLxpF7tBVR_JwWA
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      cdk-qqJ71q6P7TJTww_DSA
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 1191 columns



In [14]:

    
# Check
df_cat.loc['SWkmyhQBV1A7Qu4LRHoVXQ',:][df_cat.loc['SWkmyhQBV1A7Qu4LRHoVXQ',:]==1]









    Out[14]:





Acupuncture         1
Beauty & Spas       1
Chiropractors       1
Health & Medical    1
Massage             1
Massage Therapy     1
Physical Therapy    1
Name: SWkmyhQBV1A7Qu4LRHoVXQ, dtype: int64



In [ ]:

    
# Save as pickle file
df_cat.to_pickle('Yelp_Business_Catergory_Table.p')

Select Categories representing resturants



In [9]:

    
import pandas as pd
import numpy as np



In [3]:

    
## load all cat from pickle
df = pd.read_pickle('Yelp_Business_Catergory_Table.p')



In [4]:

    
df.shape









    Out[4]:





(144072, 1191)



In [5]:

    
for col in df.columns[:5]:
    print col

# pick columns columns_food = [] columns_culture = [] columns_not_sure = [] columns_unrelated = [] for col in df.columns: print (col) food = raw_input('about food? ') if food == 'y': columns_food.append(col) culture = raw_input('about culture? ') if culture == 'y': columns_culture.append(col) elif (food != 'y') & (culture != 'y'): very_sure = raw_input('nothing to do with restuarant ?') if very_sure == 'y': columns_unrelated.append(col) else: columns_not_sure.append(col)



In [11]:

    
#columns_food



In [7]:

    
#columns_culture



In [8]:

    
#columns_unrelated



In [42]:

    
### adjustment ###
# columns_food.append('___')
# columns_unrelated.remove('____')



In [10]:

    
# Avoid repetitive words
columns_food= list(set(columns_food))



In [59]:

    
# Save to text file
thefile = open('cat_unrelated.txt', 'w')

for item in columns_unrelated:
    thefile.write("%s\n" % item)



In [61]:

    
### might need to extract Cultural words... ###

Cultural Food



In [6]:

    
df.head(3)









    Out[6]:






  
    
      
      & Probates
      3D Printing
      ATV Rentals/Tours
      Acai Bowls
      Accessories
      Accountants
      Acne Treatment
      Active Life
      Acupuncture
      Addiction Medicine
      ...
      Wine Tours
      Wineries
      Wok
      Women's Clothing
      Workers Compensation Law
      Wraps
      Yelp Events
      Yoga
      Ziplining
      Zoos
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0DI8Dt2PJp07XkVvIElIcQ
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      LTlCaCGZE14GuaUXUGbamg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      EDqCEAGXVGCH4FJXgqtjqg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

3 rows × 1191 columns



In [7]:

    
df.shape









    Out[7]:





(144072, 1191)



In [34]:

    
# read saved txt file
list_food = open('cat_food.txt', 'rw').read().split('\n')
list_culture = open('cat_culture.txt', 'rw').read().split('\n')



In [35]:

    
list_all = list(set(list_food + list_culture))



In [36]:

    
len(list_food)









    Out[36]:





105



In [37]:

    
len(list_culture)









    Out[37]:





107



In [38]:

    
len(list_all)









    Out[38]:





201



In [39]:

    
df_fc = df.loc[:, df.columns[df.columns.isin(list_all)]]



In [40]:

    
df_fc['cat_sum'] = df_fc.sum(axis=1)
df_fc['food_sum'] = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_food)]].sum(axis=1)
df_fc['cul_sum'] = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_culture)]].sum(axis=1)



In [15]:

    
df_fc = df_fc[df_fc['cat_sum'] > 0 ]



In [16]:

    
df_fc.shape









    Out[16]:





(64341, 204)



In [17]:

    
df_fc.head()









    Out[17]:






  
    
      
      Acai Bowls
      Afghan
      African
      Alsatian
      American (New)
      American (Traditional)
      Arabian
      Argentine
      Armenian
      Asian Fusion
      ...
      Vegan
      Vegetarian
      Venezuelan
      Vietnamese
      Waffles
      Whiskey Bars
      Wine Bars
      cat_sum
      food_sum
      cul_sum
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      LTlCaCGZE14GuaUXUGbamg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      2
      0
    
    
      EDqCEAGXVGCH4FJXgqtjqg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      GDnbt3isfhd57T1QqU6flg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      Nbr0kbtIrVlEcKIZoXWbSw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      2
      0
    
    
      42romV8altAeuZuP2OC1gw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
  

5 rows × 204 columns



In [18]:

    
#### check those are not labeled as Restaurants
len(df_fc[df_fc['Restaurants']!=1])









    Out[18]:





15856



In [19]:

    
## check if there is non-res with cultral vacabularies ----- only "restuarants" are assigned a cultural vacabulary
df_fc[(df_fc['Restaurants']!=1) & (df_fc['cul_sum']!=0)]









    Out[19]:






  
    
      
      Acai Bowls
      Afghan
      African
      Alsatian
      American (New)
      American (Traditional)
      Arabian
      Argentine
      Armenian
      Asian Fusion
      ...
      Vegan
      Vegetarian
      Venezuelan
      Vietnamese
      Waffles
      Whiskey Bars
      Wine Bars
      cat_sum
      food_sum
      cul_sum
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
  

0 rows × 204 columns



In [ ]:



In [20]:

    
##### Show all non-"res"
non_restaurants_cat = []
for biz in df_fc[df_fc['Restaurants']!=1].index:
    non_restaurants_cat.append(df_fc.columns[df_fc.ix[biz, :-3] == 1])



In [21]:

    
pd.DataFrame(non_restaurants_cat).head(5)









    Out[21]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
    
  
  
    
      0
      Food
      Specialty Food
      None
      None
      None
      None
      None
      None
    
    
      1
      Bakeries
      Food
      None
      None
      None
      None
      None
      None
    
    
      2
      Bakeries
      Food
      None
      None
      None
      None
      None
      None
    
    
      3
      Sports Clubs
      None
      None
      None
      None
      None
      None
      None
    
    
      4
      Coffee & Tea
      Desserts
      Food
      None
      None
      None
      None
      None



In [290]:

    
#df_fc.loc['XxjrqA5jrzLH0EdCKUG9Mw', :][df_fc.loc['XxjrqA5jrzLH0EdCKUG9Mw', :] == 1]



In [22]:

    
#################
#### check those are labeled as "Restaurants"
len(df_fc[df_fc['Restaurants']==1])









    Out[22]:





48485



In [23]:

    
df_r = df_fc[df_fc['Restaurants']==1]



In [24]:

    
df_r.head(5)









    Out[24]:






  
    
      
      Acai Bowls
      Afghan
      African
      Alsatian
      American (New)
      American (Traditional)
      Arabian
      Argentine
      Armenian
      Asian Fusion
      ...
      Vegan
      Vegetarian
      Venezuelan
      Vietnamese
      Waffles
      Whiskey Bars
      Wine Bars
      cat_sum
      food_sum
      cul_sum
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      EDqCEAGXVGCH4FJXgqtjqg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      GDnbt3isfhd57T1QqU6flg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      42romV8altAeuZuP2OC1gw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
    
      DNyYOxVAfu0oUcPNL1ljCQ
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      2
      0
    
    
      a1Ba6XeIOP48e64YFD0dMw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      4
      0
    
  

5 rows × 204 columns



In [68]:

    
##### Show all "res"
restaurants_cat = []
for biz in df_fc[df_fc['Restaurants']==1].index:
    restaurants_cat.append(df_fc.columns[df_fc.ix[biz, :-3] == 1])



In [69]:

    
restaurants_cat[:10]









    Out[69]:





[Index([u'Chicken Wings', u'Italian', u'Pizza', u'Restaurants'], dtype='object'),
 Index([u'Fast Food', u'Mexican', u'Restaurants', u'Tex-Mex'], dtype='object'),
 Index([u'Barbeque', u'Hawaiian', u'Restaurants'], dtype='object'),
 Index([u'Cafes', u'Restaurants'], dtype='object'),
 Index([u'Breakfast & Brunch', u'Restaurants', u'Salad', u'Sandwiches'], dtype='object'),
 Index([u'Fast Food', u'Mexican', u'Restaurants'], dtype='object'),
 Index([u'Italian', u'Restaurants', u'Seafood'], dtype='object'),
 Index([u'American (Traditional)', u'Mexican', u'Restaurants'], dtype='object'),
 Index([u'Restaurants', u'Soul Food', u'Southern', u'Waffles'], dtype='object'),
 Index([u'Ethnic Food', u'Food', u'Greek', u'Persian/Iranian', u'Restaurants',
        u'Specialty Food'],
       dtype='object')]



In [25]:

    
##################################
df_fc['Restaurants'][df_fc['Swiss Food'] == 1]









    Out[25]:





business_id
GzF4lVSh61sayvwVMp8RmA    1
XPduWZA2t7ELlMHWjv4J9Q    1
Ywz-J95QRVt4t7rOBJl4Hg    1
Name: Restaurants, dtype: int64



In [44]:

    
df_fc.loc['GzF4lVSh61sayvwVMp8RmA', :][df_fc.loc['GzF4lVSh61sayvwVMp8RmA', :] == 1]









    Out[44]:





American (Traditional)    1
Canadian (New)            1
Comfort Food              1
Restaurants               1
Swiss Food                1
Name: GzF4lVSh61sayvwVMp8RmA, dtype: int64



In [26]:

    
## double check
df_culonly = df_fc[df_fc['food_sum'] == 0 ]
df_culonly









    Out[26]:






  
    
      
      Acai Bowls
      Afghan
      African
      Alsatian
      American (New)
      American (Traditional)
      Arabian
      Argentine
      Armenian
      Asian Fusion
      ...
      Vegan
      Vegetarian
      Venezuelan
      Vietnamese
      Waffles
      Whiskey Bars
      Wine Bars
      cat_sum
      food_sum
      cul_sum
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
  

0 rows × 204 columns



In [27]:

    
## Take a closer look at the cultural vacab
cultureonly = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_culture)]]
biz_cultureonly = cultureonly[cultureonly.sum(axis=1)>0].index
df_cul = df_fc.loc[biz_cultureonly, :]



In [71]:

    
df_cul.head()









    Out[71]:






  
    
      
      Acai Bowls
      Afghan
      African
      Alsatian
      American (New)
      American (Traditional)
      Arabian
      Argentine
      Armenian
      Asian Fusion
      ...
      Vegan
      Vegetarian
      Venezuelan
      Vietnamese
      Waffles
      Whiskey Bars
      Wine Bars
      cat_sum
      food_sum
      cul_sum
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      EDqCEAGXVGCH4FJXgqtjqg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      GDnbt3isfhd57T1QqU6flg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      42romV8altAeuZuP2OC1gw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
    
      826djy6K_9Fp0ptqJ2_Yag
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
    
      Mi5uhdFB9OJteXPd0_IKfw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
  

5 rows × 204 columns



In [72]:

    
(df_cul.food_sum != 0 ).sum() == len(df_cul)









    Out[72]:





True



In [28]:

    
### Examine cultures only
biz_cultures = df_cul[df_cul.cul_sum > 0]
biz_cultures = biz_cultures.loc[:, list_culture + ['cat_sum', 'food_sum','cul_sum']]
biz_cultures.shape









    Out[28]:





(31182, 99)



In [ ]:

    
##############################



In [30]:

    
biz_cultures.head(10)









    Out[30]:






  
    
      
      Cantonese
      Szechuan
      Pan Asian
      Shanghainese
      Hainan
      Chinese
      Taiwanese
      Asian Fusion
      Salvadoran
      Slovakian
      ...
      Peruvian
      Baden
      Sri Lankan
      Japanese
      Cambodian
      Spanish
      Nicaraguan
      cat_sum
      food_sum
      cul_sum
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      EDqCEAGXVGCH4FJXgqtjqg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      GDnbt3isfhd57T1QqU6flg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      42romV8altAeuZuP2OC1gw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
    
      826djy6K_9Fp0ptqJ2_Yag
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
    
      Mi5uhdFB9OJteXPd0_IKfw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
    
      Uxh0fXFH_QQBivRnIBpdiw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      1
      2
    
    
      YPavuOh2XsnRbLfl0DH2lQ
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      saWZO6hB4B8P-mIzS1--Xw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      6
      4
      2
    
    
      vJ-KyaojDQj1R2BNrlDToA
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      1
      1
    
    
      hovoWva_UjbnyLWEbnFvBw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      1
      1
    
  

10 rows × 99 columns



In [76]:

    
lst = biz_cultures.columns[biz_cultures.iloc[5,:-3] == 1].values
lst









    Out[76]:





array(['American (Traditional)', 'Mexican'], dtype=object)



In [77]:

    
lsts_cul = map(lambda biz: biz_cultures.columns[biz_cultures.ix[biz,:-3] == 1].values, biz_cultures.index.values)



In [78]:

    
### Create culture table
columns = map(lambda c: 'culture_{}'.format(c), range(1, df_cul.cul_sum.max()+1))
df_CULTURE = pd.DataFrame(data=lsts_cul, index = df_cul.index, columns = columns)



In [79]:

    
df_CULTURE









    Out[79]:






  
    
      
      culture_1
      culture_2
      culture_3
      culture_4
      culture_5
      culture_6
      culture_7
    
    
      business_id
      
      
      
      
      
      
      
    
  
  
    
      EDqCEAGXVGCH4FJXgqtjqg
      Italian
      None
      None
      None
      None
      None
      None
    
    
      GDnbt3isfhd57T1QqU6flg
      Mexican
      None
      None
      None
      None
      None
      None
    
    
      42romV8altAeuZuP2OC1gw
      Hawaiian
      None
      None
      None
      None
      None
      None
    
    
      826djy6K_9Fp0ptqJ2_Yag
      Mexican
      None
      None
      None
      None
      None
      None
    
    
      Mi5uhdFB9OJteXPd0_IKfw
      Italian
      None
      None
      None
      None
      None
      None
    
    
      Uxh0fXFH_QQBivRnIBpdiw
      American (Traditional)
      Mexican
      None
      None
      None
      None
      None
    
    
      YPavuOh2XsnRbLfl0DH2lQ
      Southern
      None
      None
      None
      None
      None
      None
    
    
      saWZO6hB4B8P-mIzS1--Xw
      Persian/Iranian
      Greek
      None
      None
      None
      None
      None
    
    
      vJ-KyaojDQj1R2BNrlDToA
      American (Traditional)
      None
      None
      None
      None
      None
      None
    
    
      hovoWva_UjbnyLWEbnFvBw
      Thai
      None
      None
      None
      None
      None
      None
    
    
      93otbGHE0s0m-lU1osvg9w
      American (New)
      None
      None
      None
      None
      None
      None
    
    
      L_thK7r3K_h5M4tV7amEKQ
      Canadian (New)
      Italian
      None
      None
      None
      None
      None
    
    
      LIvXzdMJrIb6IY5Y3tqicA
      American (New)
      None
      None
      None
      None
      None
      None
    
    
      F53MSa5SYzO9BG8c_JhskQ
      Vietnamese
      None
      None
      None
      None
      None
      None
    
    
      hEcn9k6ONd5n2mq0lB2aew
      American (Traditional)
      None
      None
      None
      None
      None
      None
    
    
      diXqpz9WOr4WLQq6uvARkg
      Indian
      None
      None
      None
      None
      None
      None
    
    
      6pSUvtk5-OOaJfX0hbkb8Q
      Japanese
      None
      None
      None
      None
      None
      None
    
    
      el3HILPUZ3gxN20wcTJr3A
      Filipino
      None
      None
      None
      None
      None
      None
    
    
      hMh9XOwNQcu31NAOCqhAEw
      Indian
      None
      None
      None
      None
      None
      None
    
    
      pmJqSsCfgbo3TxPWpQNLIw
      American (New)
      None
      None
      None
      None
      None
      None
    
    
      Ja4stXdNYr39u5CZHMNtjw
      Mexican
      None
      None
      None
      None
      None
      None
    
    
      st_pnALlGB6OCWkb4e14mg
      Italian
      None
      None
      None
      None
      None
      None
    
    
      6TKJDgTrUoD3zu77A4JHnQ
      American (Traditional)
      None
      None
      None
      None
      None
      None
    
    
      V94gk_Vx0Ot623UGHG6PrA
      American (Traditional)
      Mexican
      None
      None
      None
      None
      None
    
    
      kUUBBLBHCasOl2a5nW9nAw
      Thai
      None
      None
      None
      None
      None
      None
    
    
      mRv3Z25F56qduMKnvnCkVA
      Korean
      None
      None
      None
      None
      None
      None
    
    
      NmiX_7talK5QTgMKvHPUaQ
      American (Traditional)
      None
      None
      None
      None
      None
      None
    
    
      Ir_QIzs-4o9ElOtiGuxJrw
      Mexican
      None
      None
      None
      None
      None
      None
    
    
      ODwqVEORhb9YAgo3MP0PXw
      Korean
      None
      None
      None
      None
      None
      None
    
    
      h7uxML49NQ_Wj-7zk0DSqg
      American (Traditional)
      None
      None
      None
      None
      None
      None
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      ofXqwIbHprYF6TP47XudOw
      Mexican
      None
      None
      None
      None
      None
      None
    
    
      ORloM7z9R9eh-G-an5DQLA
      American (New)
      American (Traditional)
      None
      None
      None
      None
      None
    
    
      tquWyW3Mm5ka8LJtzodALg
      Chinese
      None
      None
      None
      None
      None
      None
    
    
      ZL_EMDDN4QQ8tm6oXduArg
      Japanese
      None
      None
      None
      None
      None
      None
    
    
      ZZ8sAXHb0gX-c81Ea1M3OQ
      American (New)
      None
      None
      None
      None
      None
      None
    
    
      7yQMwsuV_8rV-EchuvCG5g
      Mexican
      None
      None
      None
      None
      None
      None
    
    
      22RFH-U5gboNJgws_GBG3w
      American (Traditional)
      None
      None
      None
      None
      None
      None
    
    
      zIOLoKIOXpGrzndARGhkAw
      Latin American
      Mexican
      None
      None
      None
      None
      None
    
    
      9iJMGMsTK-q6W5MB1_Ny3Q
      Chinese
      None
      None
      None
      None
      None
      None
    
    
      J6tgF98vwUz4D8T-N72org
      Middle Eastern
      None
      None
      None
      None
      None
      None
    
    
      xCTpRfAUp3zrhACPkzxp2Q
      Japanese
      None
      None
      None
      None
      None
      None
    
    
      UkoseRjoLT4K2bEzvmyhYA
      American (Traditional)
      None
      None
      None
      None
      None
      None
    
    
      w5CSi-An5meLnxjKSFn0wQ
      Asian Fusion
      Thai
      None
      None
      None
      None
      None
    
    
      PTPd2xamYtUCH00NqML-YA
      Chinese
      None
      None
      None
      None
      None
      None
    
    
      Q3UkgxNNInsPcUFhsQFcrg
      Chinese
      None
      None
      None
      None
      None
      None
    
    
      Ty7IyZN721l6VvBfc_zRVQ
      Asian Fusion
      None
      None
      None
      None
      None
      None
    
    
      JFnfylrxzH2gAVI_7HJZcQ
      Japanese
      None
      None
      None
      None
      None
      None
    
    
      JZE9AfuO_jUBMWzhg-SAcQ
      Vietnamese
      None
      None
      None
      None
      None
      None
    
    
      J_PAsINZhwL6J9aV2CZrfw
      American (Traditional)
      None
      None
      None
      None
      None
      None
    
    
      GIfZNMP0oIJCje_Xp0Bgrw
      American (New)
      None
      None
      None
      None
      None
      None
    
    
      bVgRVpdPgwHBLa7RHb9CpQ
      French
      American (New)
      None
      None
      None
      None
      None
    
    
      2hUENHFcBIqop8-uWvrf6A
      Irish
      None
      None
      None
      None
      None
      None
    
    
      J1RDyyPxhioqm8c_fi4P4Q
      American (New)
      None
      None
      None
      None
      None
      None
    
    
      C_chktl5jFeRqu1C0rmScA
      Indian
      None
      None
      None
      None
      None
      None
    
    
      xowy4YxBXXGOYuajsxjwkA
      American (New)
      None
      None
      None
      None
      None
      None
    
    
      g9gAuOEXBbWnLlu2uOftwA
      Indian
      None
      None
      None
      None
      None
      None
    
    
      -McKyjNSqS1h9dDJH3dyUA
      German
      None
      None
      None
      None
      None
      None
    
    
      eHLUQ2W_hXx61NmiL9kSVg
      Italian
      None
      None
      None
      None
      None
      None
    
    
      OgwN65jZebPRIPSmNpRP7A
      Chinese
      None
      None
      None
      None
      None
      None
    
    
      umBsUDjSPzF28LOw-U_NfA
      Latin American
      Mexican
      None
      None
      None
      None
      None
    
  

31182 rows × 7 columns



In [80]:

    
df_CULTURE.culture_7.unique()









    Out[80]:





array([None, 'Lebanese'], dtype=object)

Combine and Examine individual Cuisine

Chinese food, non-Chinese food, no-labeled



In [32]:

    
### Dictionary of combined cultures (keep building...)
dic_cuisine = {}



In [41]:

    
# List of Chinese food
list_chinese = ['Cantonese','Szechuan','Shanghainese','Hainan','Chinese','Taiwanese',\
                'Asian Fusion',\
                'Dim Sum', 'Hot Pot', 'Hong Kong Style Cafe']
dic_cuisine['Chinese'] = list_chinese



In [82]:

    
biz_cultures.head(3)









    Out[82]:






  
    
      
      Cantonese
      Szechuan
      Pan Asian
      Shanghainese
      Hainan
      Chinese
      Taiwanese
      Asian Fusion
      Salvadoran
      Slovakian
      ...
      Peruvian
      Baden
      Sri Lankan
      Japanese
      Cambodian
      Spanish
      Nicaraguan
      cat_sum
      food_sum
      cul_sum
    
    
      business_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      EDqCEAGXVGCH4FJXgqtjqg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      GDnbt3isfhd57T1QqU6flg
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      4
      3
      1
    
    
      42romV8altAeuZuP2OC1gw
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      3
      2
      1
    
  

3 rows × 99 columns



In [ ]:



In [ ]:

	business_id	categories
0	0DI8Dt2PJp07XkVvIElIcQ	[Tobacco Shops, Nightlife, Vape Shops, Shopping]
1	LTlCaCGZE14GuaUXUGbamg	[Caterers, Grocery, Food, Event Planning & Ser...
2	EDqCEAGXVGCH4FJXgqtjqg	[Restaurants, Pizza, Chicken Wings, Italian]

	& Probates	3D Printing	ATV Rentals/Tours	Acai Bowls	Accessories	Accountants	Acne Treatment	Active Life	Acupuncture	Addiction Medicine	...	Wine Tours	Wineries	Wok	Women's Clothing	Workers Compensation Law	Wraps	Yelp Events	Yoga	Ziplining	Zoos
business_id
0DI8Dt2PJp07XkVvIElIcQ	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
LTlCaCGZE14GuaUXUGbamg	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
EDqCEAGXVGCH4FJXgqtjqg	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
cnGIivYRLxpF7tBVR_JwWA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
cdk-qqJ71q6P7TJTww_DSA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	0	1	2	3	4	5	6	7
0	Food	Specialty Food	None	None	None	None	None	None
1	Bakeries	Food	None	None	None	None	None	None
2	Bakeries	Food	None	None	None	None	None	None
3	Sports Clubs	None	None	None	None	None	None	None
4	Coffee & Tea	Desserts	Food	None	None	None	None	None

	Acai Bowls	Afghan	African	Alsatian	American (New)	American (Traditional)	Arabian	Argentine	Armenian	Asian Fusion	...	Vegan	Vegetarian	Venezuelan	Vietnamese	Waffles	Whiskey Bars	Wine Bars	cat_sum	food_sum	cul_sum
business_id
EDqCEAGXVGCH4FJXgqtjqg	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	4	3	1
GDnbt3isfhd57T1QqU6flg	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	4	3	1
42romV8altAeuZuP2OC1gw	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	3	2	1
DNyYOxVAfu0oUcPNL1ljCQ	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	2	2	0
a1Ba6XeIOP48e64YFD0dMw	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	4	4	0

	culture_1	culture_2	culture_3	culture_4	culture_5	culture_6	culture_7
business_id
EDqCEAGXVGCH4FJXgqtjqg	Italian	None	None	None	None	None	None
GDnbt3isfhd57T1QqU6flg	Mexican	None	None	None	None	None	None
42romV8altAeuZuP2OC1gw	Hawaiian	None	None	None	None	None	None
826djy6K_9Fp0ptqJ2_Yag	Mexican	None	None	None	None	None	None
Mi5uhdFB9OJteXPd0_IKfw	Italian	None	None	None	None	None	None
Uxh0fXFH_QQBivRnIBpdiw	American (Traditional)	Mexican	None	None	None	None	None
YPavuOh2XsnRbLfl0DH2lQ	Southern	None	None	None	None	None	None
saWZO6hB4B8P-mIzS1--Xw	Persian/Iranian	Greek	None	None	None	None	None
vJ-KyaojDQj1R2BNrlDToA	American (Traditional)	None	None	None	None	None	None
hovoWva_UjbnyLWEbnFvBw	Thai	None	None	None	None	None	None
93otbGHE0s0m-lU1osvg9w	American (New)	None	None	None	None	None	None
L_thK7r3K_h5M4tV7amEKQ	Canadian (New)	Italian	None	None	None	None	None
LIvXzdMJrIb6IY5Y3tqicA	American (New)	None	None	None	None	None	None
F53MSa5SYzO9BG8c_JhskQ	Vietnamese	None	None	None	None	None	None
hEcn9k6ONd5n2mq0lB2aew	American (Traditional)	None	None	None	None	None	None
diXqpz9WOr4WLQq6uvARkg	Indian	None	None	None	None	None	None
6pSUvtk5-OOaJfX0hbkb8Q	Japanese	None	None	None	None	None	None
el3HILPUZ3gxN20wcTJr3A	Filipino	None	None	None	None	None	None
hMh9XOwNQcu31NAOCqhAEw	Indian	None	None	None	None	None	None
pmJqSsCfgbo3TxPWpQNLIw	American (New)	None	None	None	None	None	None
Ja4stXdNYr39u5CZHMNtjw	Mexican	None	None	None	None	None	None
st_pnALlGB6OCWkb4e14mg	Italian	None	None	None	None	None	None
6TKJDgTrUoD3zu77A4JHnQ	American (Traditional)	None	None	None	None	None	None
V94gk_Vx0Ot623UGHG6PrA	American (Traditional)	Mexican	None	None	None	None	None
kUUBBLBHCasOl2a5nW9nAw	Thai	None	None	None	None	None	None
mRv3Z25F56qduMKnvnCkVA	Korean	None	None	None	None	None	None
NmiX_7talK5QTgMKvHPUaQ	American (Traditional)	None	None	None	None	None	None
Ir_QIzs-4o9ElOtiGuxJrw	Mexican	None	None	None	None	None	None
ODwqVEORhb9YAgo3MP0PXw	Korean	None	None	None	None	None	None
h7uxML49NQ_Wj-7zk0DSqg	American (Traditional)	None	None	None	None	None	None
...	...	...	...	...	...	...	...
ofXqwIbHprYF6TP47XudOw	Mexican	None	None	None	None	None	None
ORloM7z9R9eh-G-an5DQLA	American (New)	American (Traditional)	None	None	None	None	None
tquWyW3Mm5ka8LJtzodALg	Chinese	None	None	None	None	None	None
ZL_EMDDN4QQ8tm6oXduArg	Japanese	None	None	None	None	None	None
ZZ8sAXHb0gX-c81Ea1M3OQ	American (New)	None	None	None	None	None	None
7yQMwsuV_8rV-EchuvCG5g	Mexican	None	None	None	None	None	None
22RFH-U5gboNJgws_GBG3w	American (Traditional)	None	None	None	None	None	None
zIOLoKIOXpGrzndARGhkAw	Latin American	Mexican	None	None	None	None	None
9iJMGMsTK-q6W5MB1_Ny3Q	Chinese	None	None	None	None	None	None
J6tgF98vwUz4D8T-N72org	Middle Eastern	None	None	None	None	None	None
xCTpRfAUp3zrhACPkzxp2Q	Japanese	None	None	None	None	None	None
UkoseRjoLT4K2bEzvmyhYA	American (Traditional)	None	None	None	None	None	None
w5CSi-An5meLnxjKSFn0wQ	Asian Fusion	Thai	None	None	None	None	None
PTPd2xamYtUCH00NqML-YA	Chinese	None	None	None	None	None	None
Q3UkgxNNInsPcUFhsQFcrg	Chinese	None	None	None	None	None	None
Ty7IyZN721l6VvBfc_zRVQ	Asian Fusion	None	None	None	None	None	None
JFnfylrxzH2gAVI_7HJZcQ	Japanese	None	None	None	None	None	None
JZE9AfuO_jUBMWzhg-SAcQ	Vietnamese	None	None	None	None	None	None
J_PAsINZhwL6J9aV2CZrfw	American (Traditional)	None	None	None	None	None	None
GIfZNMP0oIJCje_Xp0Bgrw	American (New)	None	None	None	None	None	None
bVgRVpdPgwHBLa7RHb9CpQ	French	American (New)	None	None	None	None	None
2hUENHFcBIqop8-uWvrf6A	Irish	None	None	None	None	None	None
J1RDyyPxhioqm8c_fi4P4Q	American (New)	None	None	None	None	None	None
C_chktl5jFeRqu1C0rmScA	Indian	None	None	None	None	None	None
xowy4YxBXXGOYuajsxjwkA	American (New)	None	None	None	None	None	None
g9gAuOEXBbWnLlu2uOftwA	Indian	None	None	None	None	None	None
-McKyjNSqS1h9dDJH3dyUA	German	None	None	None	None	None	None
eHLUQ2W_hXx61NmiL9kSVg	Italian	None	None	None	None	None	None
OgwN65jZebPRIPSmNpRP7A	Chinese	None	None	None	None	None	None
umBsUDjSPzF28LOw-U_NfA	Latin American	Mexican	None	None	None	None	None