Combine Cuisine



In [1]:

    
import pandas as pd
import numpy as np
import pickle



In [3]:

    
## load all cat from pickle
#df = pd.read_pickle('../data_all_cities/all_cities_preprocess.pkl')
# df.head(2)

Function that creates a cuisine type feature



In [2]:

    
import pandas as pd
import numpy as np

def create_cuisine(path_df, tar_column, cuisine, path_culture, path_cuisine, save = False):
    """
    Add a three-level cuisine label column to a pickled business dataframe.

    Parameters
    ----------
    path_df : str
        Path of the pickled dataframe of businesses. Unpickling can execute
        arbitrary code, so only load files from a trusted source.
    tar_column : str
        Name of the column whose values are lists of category strings.
    cuisine : str
        Cuisine name; the new column is called 'cuisine_<cuisine>'.
    path_culture : str
        Text file with one culture/region category word per line.
    path_cuisine : str
        Text file with one category word per line for the selected cuisine.
    save : bool, optional
        When True, also write the result to 'Yelp_Cuisine_<cuisine>.p'
        in the current working directory.

    Returns
    -------
    pandas.DataFrame
        The loaded dataframe with the new column, whose values are
          2 -- at least one category is in the selected-cuisine list
          1 -- no cuisine match, but at least one category is in the culture list
          0 -- no category is in either list (or the category list is missing)
    """
    def _read_terms(path):
        # Plain read mode: the former 'rw' is rejected by Python 3's open().
        # The context manager closes the handle, and blank lines are dropped so
        # that a trailing newline cannot turn '' into a matchable category.
        with open(path, 'r') as handle:
            return set(line.strip() for line in handle if line.strip())

    def _label(categories):
        # A missing category list (None / NaN) carries no cultural label.
        if categories is None or isinstance(categories, float):
            return 0
        found = set(categories)
        # The selected cuisine takes precedence over the generic culture list.
        if found & cuisine_terms:
            return 2
        if found & culture_terms:
            return 1
        return 0

    # load dataframe
    df = pd.read_pickle(path_df)

    # load the saved word lists
    culture_terms = _read_terms(path_culture)
    cuisine_terms = _read_terms(path_cuisine)

    # assign a label to each business
    df['cuisine_{}'.format(cuisine)] = df[tar_column].apply(_label)

    if save:
        # NOTE(review): the notebook's own save cells use a '.pkl' suffix; the
        # '.p' name is kept here so existing readers of this file keep working.
        df.to_pickle('Yelp_Cuisine_{}.p'.format(cuisine))

    return df



In [ ]:

1. Chinese Cuisine

Three classes: Chinese food (2), non-Chinese food with another culture label (1), and unlabeled (0)



In [3]:

    
# Inputs for the Chinese cuisine feature
cuisine = 'Chinese'
tar_column = 'categories'                                  # column of category lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'   # pickled business dataframe
path_culture = 'cat_culture.txt'                           # culture/region words
path_cuisine = 'cat_culture-Chinese.txt'                   # selected-cuisine words



In [4]:

    
# Build the frame carrying the cuisine_Chinese label column (nothing is written to disk here)
df_chinese = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)



In [5]:

    
# Preview the first three rows; the new cuisine_Chinese column is the last one.
df_chinese.head(3)









    Out[5]:






  
    
      
      address
      attributes
      business_id
      categories
      city
      hours
      is_open
      latitude
      longitude
      name
      ...
      RestaurantsDelivery
      RestaurantsGoodForGroups
      RestaurantsPriceRange2
      RestaurantsReservations
      RestaurantsTableService
      RestaurantsTakeOut
      Smoking
      WheelchairAccessible
      WiFi
      cuisine_Chinese
    
  
  
    
      EDqCEAGXVGCH4FJXgqtjqg
      979 Bloor Street W
      [{u'Alcohol': u'none'}, {u'Ambience': {u'roman...
      EDqCEAGXVGCH4FJXgqtjqg
      [Restaurants, Pizza, Chicken Wings, Italian]
      Toronto
      [Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ...
      1
      43.661054
      -79.429089
      Pizza Pizza
      ...
      False
      False
      NaN
      NaN
      NaN
      NaN
      False
      NaN
      NaN
      1
    
    
      GDnbt3isfhd57T1QqU6flg
      11072 No Frank Lloyd Wright
      [{u'Alcohol': u'none'}, {u'Ambience': {u'roman...
      GDnbt3isfhd57T1QqU6flg
      [Tex-Mex, Mexican, Fast Food, Restaurants]
      Scottsdale
      [Monday 10:0-22:0, Tuesday 10:0-22:0, Wednesda...
      1
      33.586710
      -111.835410
      Taco Bell
      ...
      False
      False
      NaN
      NaN
      NaN
      NaN
      False
      NaN
      NaN
      1
    
    
      a1Ba6XeIOP48e64YFD0dMw
      2000 Mansfield Street, Suite 104
      [{u'Caters': True}]
      a1Ba6XeIOP48e64YFD0dMw
      [Sandwiches, Breakfast & Brunch, Salad, Restau...
      Montréal
      [Monday 6:30-17:0, Tuesday 6:30-17:0, Wednesda...
      1
      45.502346
      -73.573807
      La Prep
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      0
    
  

3 rows × 98 columns



In [54]:

    
# Check
# df_chinese[df_chinese.cuisine_Chinese == 0]



In [6]:

    
# (rows, columns) of the labelled frame
df_chinese.shape









    Out[6]:





(27314, 98)



In [7]:

    
# Number of businesses in each label class (0 / 1 / 2)
df_chinese.groupby('cuisine_Chinese').size()









    Out[7]:





cuisine_Chinese
0     8458
1    16163
2     2693
dtype: int64



In [8]:

    
# Persist the labelled frame as a pickle (path is relative to the working directory).
df_chinese.to_pickle('Yelp_Cuisine_Chinese.pkl')

# To save as CSV instead:
# df_chinese.to_csv('Yelp_Cuisine_Chinese.csv', encoding="utf8")

Japanese Cuisine



In [4]:

    
# Inputs for the Japanese cuisine feature
cuisine = 'Japanese'
tar_column = 'categories'                                  # column of category lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'   # pickled business dataframe
path_culture = 'cat_culture.txt'                           # culture/region words
path_cuisine = 'cat_culture-Japanese.txt'                  # selected-cuisine words



In [5]:

    
# create column for Japanese cuisine type (nothing is written to disk here)
df_japanese = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)



In [26]:

    
#df_japanese.head(2)



In [8]:

    
# (rows, columns) of the labelled frame
df_japanese.shape









    Out[8]:





(27314, 98)



In [9]:

    
# Number of businesses in each label class (0 / 1 / 2)
df_japanese.groupby('cuisine_Japanese').size()









    Out[9]:





cuisine_Japanese
0     8458
1    17231
2     1625
dtype: int64



In [11]:

    
# Persist the labelled frame as a pickle (path is relative to the working directory).
df_japanese.to_pickle('Yelp_Cuisine_Japanese.pkl')

American Cuisine



In [18]:

    
# Inputs for the American cuisine feature
cuisine = 'American'
tar_column = 'categories'                                  # column of category lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'   # pickled business dataframe
path_culture = 'cat_culture.txt'                           # culture/region words
path_cuisine = 'cat_culture-American.txt'                  # selected-cuisine words

# create column for American cuisine type (nothing is written to disk here)
df_american = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)

# (rows, columns) of the labelled frame
df_american.shape









    Out[18]:





(27314, 98)



In [19]:

    
# Number of businesses in each label class (0 / 1 / 2)
df_american.groupby('cuisine_American').size()









    Out[19]:





cuisine_American
0     8458
1    13477
2     5379
dtype: int64



In [20]:

    
# Persist the labelled frame as a pickle (path is relative to the working directory).
df_american.to_pickle('Yelp_Cuisine_American.pkl')

Indian Cuisine



In [15]:

    
# Inputs for the Indian cuisine feature
cuisine = 'Indian'
tar_column = 'categories'                                  # column of category lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'   # pickled business dataframe
path_culture = 'cat_culture.txt'                           # culture/region words
path_cuisine = 'cat_culture-Indian.txt'                    # selected-cuisine words

# Build the frame carrying the cuisine_Indian label column (nothing is written to disk here)
df_indian = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)

# (rows, columns) of the labelled frame
df_indian.shape









    Out[15]:





(27314, 98)



In [22]:

    
# Number of businesses in each label class (0 / 1 / 2)
df_indian.groupby('cuisine_Indian').size()









    Out[22]:





cuisine_Indian
0     8458
1    17528
2     1328
dtype: int64



In [21]:

    
# Persist the labelled frame as a pickle (path is relative to the working directory).
df_indian.to_pickle('Yelp_Cuisine_Indian.pkl')

Spanish Cuisine



In [23]:

    
# Inputs for the Spanish cuisine feature
cuisine = 'Spanish'
tar_column = 'categories'                                  # column of category lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'   # pickled business dataframe
path_culture = 'cat_culture.txt'                           # culture/region words
path_cuisine = 'cat_culture-Spanish.txt'                   # selected-cuisine words

# create column for Spanish cuisine type (nothing is written to disk here)
df_spanish = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)

# (rows, columns) of the labelled frame
df_spanish.shape









    Out[23]:





(27314, 98)



In [24]:

    
# Number of businesses in each label class (0 / 1 / 2)
df_spanish.groupby('cuisine_Spanish').size()









    Out[24]:





cuisine_Spanish
0     8458
1    16323
2     2533
dtype: int64



In [25]:

    
# Persist the labelled frame as a pickle (path is relative to the working directory).
df_spanish.to_pickle('Yelp_Cuisine_Spanish.pkl')



In [ ]:



In [ ]:

	address	attributes	business_id	categories	city	hours	is_open	latitude	longitude	name	...	RestaurantsDelivery	RestaurantsGoodForGroups	RestaurantsPriceRange2	RestaurantsReservations	RestaurantsTableService	RestaurantsTakeOut	Smoking	WheelchairAccessible	WiFi	cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg	979 Bloor Street W	[{u'Alcohol': u'none'}, {u'Ambience': {u'roman...	EDqCEAGXVGCH4FJXgqtjqg	[Restaurants, Pizza, Chicken Wings, Italian]	Toronto	[Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ...	1	43.661054	-79.429089	Pizza Pizza	...	False	False	NaN	NaN	NaN	NaN	False	NaN	NaN	1
GDnbt3isfhd57T1QqU6flg	11072 No Frank Lloyd Wright	[{u'Alcohol': u'none'}, {u'Ambience': {u'roman...	GDnbt3isfhd57T1QqU6flg	[Tex-Mex, Mexican, Fast Food, Restaurants]	Scottsdale	[Monday 10:0-22:0, Tuesday 10:0-22:0, Wednesda...	1	33.586710	-111.835410	Taco Bell	...	False	False	NaN	NaN	NaN	NaN	False	NaN	NaN	1
a1Ba6XeIOP48e64YFD0dMw	2000 Mansfield Street, Suite 104	[{u'Caters': True}]	a1Ba6XeIOP48e64YFD0dMw	[Sandwiches, Breakfast & Brunch, Salad, Restau...	Montréal	[Monday 6:30-17:0, Tuesday 6:30-17:0, Wednesda...	1	45.502346	-73.573807	La Prep	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0