Combine Cuisine


In [1]:
import pandas as pd
import numpy as np
import pickle

In [3]:
## load all cat from pickle
#df = pd.read_pickle('../data_all_cities/all_cities_preprocess.pkl')
# df.head(2)

Function that creates a cuisine type feature


In [2]:
import pandas as pd
import numpy as np

def _read_word_list(path):
    """Return the non-empty lines of the text file at ``path`` as a set of words."""
    # 'r' is the valid read-only mode ('rw' is rejected by Python 3); the
    # context manager guarantees the handle is closed.
    with open(path, 'r') as handle:
        stripped = (line.rstrip('\r\n') for line in handle)
        # Dropping empty lines keeps a trailing newline in the file from
        # adding '' to the word list.
        return {word for word in stripped if word}


def _label_categories(categories, cuisine_words, culture_words):
    """Return the 2 / 1 / 0 label for one business (see create_cuisine)."""
    try:
        labels = set(categories)
    except TypeError:
        # A value that cannot be turned into a set (e.g. None or NaN for a
        # business without categories) carries no cultural label.
        return 0
    if labels & cuisine_words:
        return 2
    if labels & culture_words:
        return 1
    return 0


def create_cuisine(path_df, tar_column, cuisine, path_culture, path_cuisine, save=False):
    """
    Add a cuisine-type column to a pickled dataframe of businesses.

    The categories of every business (column ``tar_column``) are compared with
    two word lists read from text files that hold one word per line.

    Parameters:
    path_df: file path of the pickled dataframe with businesses and a column
        of lists of categories
    tar_column: name of the dataframe column compared with the word lists
    cuisine: name of the cuisine; the new column is called ``cuisine_<cuisine>``
    path_culture: file path of the text file with the list of cultural words
    path_cuisine: file path of the text file with the list of selected cuisine words
    save: if True, additionally pickle the result to ``Yelp_Cuisine_<cuisine>.p``
        in the current working directory

    Returns the loaded dataframe with the new cuisine column:
         -- 2 if a category of the business is in the selected cuisine list
         -- 1 if not, but a category is in the cultural word list
         -- 0 if no category is in either list (or the categories are missing)
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle(path_df)

    # Build each word set once instead of re-hashing the lists on every row.
    culture_words = _read_word_list(path_culture)
    cuisine_words = _read_word_list(path_cuisine)

    df['cuisine_{}'.format(cuisine)] = df[tar_column].apply(
        lambda categories: _label_categories(categories, cuisine_words, culture_words)
    )

    if save:
        df.to_pickle('Yelp_Cuisine_{}.p'.format(cuisine))

    return df

In [ ]:

1. Cuisine-- Chinese

Labels: Chinese food (2), non-Chinese food with another cultural label (1), no cultural label (0)


In [3]:
# Inputs for the Chinese cuisine feature
cuisine = 'Chinese'                      # new column will be named cuisine_Chinese
tar_column = 'categories'                # dataframe column compared with the word lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'   # pickled business dataframe
path_culture = 'cat_culture.txt'         # word list: cultural labels
path_cuisine = 'cat_culture-Chinese.txt' # word list: selected cuisine labels

In [4]:
# Build the dataframe with the Chinese cuisine label column (no file is written here)
df_chinese = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)

In [5]:
# Preview the first three rows of the labelled dataframe
df_chinese.head(3)


Out[5]:
address attributes business_id categories city hours is_open latitude longitude name ... RestaurantsDelivery RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg 979 Bloor Street W [{u'Alcohol': u'none'}, {u'Ambience': {u'roman... EDqCEAGXVGCH4FJXgqtjqg [Restaurants, Pizza, Chicken Wings, Italian] Toronto [Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ... 1 43.661054 -79.429089 Pizza Pizza ... False False NaN NaN NaN NaN False NaN NaN 1
GDnbt3isfhd57T1QqU6flg 11072 No Frank Lloyd Wright [{u'Alcohol': u'none'}, {u'Ambience': {u'roman... GDnbt3isfhd57T1QqU6flg [Tex-Mex, Mexican, Fast Food, Restaurants] Scottsdale [Monday 10:0-22:0, Tuesday 10:0-22:0, Wednesda... 1 33.586710 -111.835410 Taco Bell ... False False NaN NaN NaN NaN False NaN NaN 1
a1Ba6XeIOP48e64YFD0dMw 2000 Mansfield Street, Suite 104 [{u'Caters': True}] a1Ba6XeIOP48e64YFD0dMw [Sandwiches, Breakfast & Brunch, Salad, Restau... Montréal [Monday 6:30-17:0, Tuesday 6:30-17:0, Wednesda... 1 45.502346 -73.573807 La Prep ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0

3 rows × 98 columns


In [54]:
# Check
# df_chinese[df_chinese.cuisine_Chinese == 0]

In [6]:
# (rows, columns) of the labelled dataframe
df_chinese.shape


Out[6]:
(27314, 98)

In [7]:
# Number of businesses per label (2 = Chinese, 1 = other cultural label, 0 = none)
df_chinese.groupby('cuisine_Chinese').size()


Out[7]:
cuisine_Chinese
0     8458
1    16163
2     2693
dtype: int64

In [8]:
# Write the labelled dataframe to a pickle file in the current working directory.
df_chinese.to_pickle('Yelp_Cuisine_Chinese.pkl')
# CSV alternative: df_chinese.to_csv('Yelp_Cuisine_Chinese.csv', encoding="utf8")

Japanese Cuisine


In [4]:
# Inputs for the Japanese cuisine feature
cuisine = 'Japanese'                      # new column will be named cuisine_Japanese
tar_column = 'categories'                 # dataframe column compared with the word lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'    # pickled business dataframe
path_culture = 'cat_culture.txt'          # word list: cultural labels
path_cuisine = 'cat_culture-Japanese.txt' # word list: selected cuisine labels

In [5]:
# Build the dataframe with the Japanese cuisine label column (no file is written here)
df_japanese = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)

In [26]:
#df_japanese.head(2)

In [8]:
# (rows, columns) of the labelled dataframe
df_japanese.shape


Out[8]:
(27314, 98)

In [9]:
# Number of businesses per label (2 = Japanese, 1 = other cultural label, 0 = none)
df_japanese.groupby('cuisine_Japanese').size()


Out[9]:
cuisine_Japanese
0     8458
1    17231
2     1625
dtype: int64

In [11]:
# Write the labelled dataframe to a pickle file in the current working directory.
df_japanese.to_pickle('Yelp_Cuisine_Japanese.pkl')

American Cuisine


In [18]:
# Inputs for the American cuisine feature
cuisine = 'American'                      # new column will be named cuisine_American
tar_column = 'categories'                 # dataframe column compared with the word lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'    # pickled business dataframe
path_culture = 'cat_culture.txt'          # word list: cultural labels
path_cuisine = 'cat_culture-American.txt' # word list: selected cuisine labels

# Build the dataframe with the American cuisine label column (no file is written here)
df_american = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)

# (rows, columns) of the labelled dataframe
df_american.shape


Out[18]:
(27314, 98)

In [19]:
# Number of businesses per label (2 = American, 1 = other cultural label, 0 = none)
df_american.groupby('cuisine_American').size()


Out[19]:
cuisine_American
0     8458
1    13477
2     5379
dtype: int64

In [20]:
# Write the labelled dataframe to a pickle file in the current working directory.
df_american.to_pickle('Yelp_Cuisine_American.pkl')

Indian Cuisine


In [15]:
# Inputs for the Indian cuisine feature
cuisine = 'Indian'                      # new column will be named cuisine_Indian
tar_column = 'categories'               # dataframe column compared with the word lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'  # pickled business dataframe
path_culture = 'cat_culture.txt'        # word list: cultural labels
path_cuisine = 'cat_culture-Indian.txt' # word list: selected cuisine labels

# Build the dataframe with the Indian cuisine label column (no file is written here)
df_indian = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)

# (rows, columns) of the labelled dataframe
df_indian.shape


Out[15]:
(27314, 98)

In [22]:
# Number of businesses per label (2 = Indian, 1 = other cultural label, 0 = none)
df_indian.groupby('cuisine_Indian').size()


Out[22]:
cuisine_Indian
0     8458
1    17528
2     1328
dtype: int64

In [21]:
# Write the labelled dataframe to a pickle file in the current working directory.
df_indian.to_pickle('Yelp_Cuisine_Indian.pkl')

Spanish Cuisine


In [23]:
# Inputs for the Spanish cuisine feature
cuisine = 'Spanish'                      # new column will be named cuisine_Spanish
tar_column = 'categories'                # dataframe column compared with the word lists
path_df = '../data_all_cities/all_cities_preprocess.pkl'   # pickled business dataframe
path_culture = 'cat_culture.txt'         # word list: cultural labels
path_cuisine = 'cat_culture-Spanish.txt' # word list: selected cuisine labels

# Build the dataframe with the Spanish cuisine label column (no file is written here)
df_spanish = create_cuisine(
    path_df=path_df,
    tar_column=tar_column,
    cuisine=cuisine,
    path_culture=path_culture,
    path_cuisine=path_cuisine,
    save=False,
)

# (rows, columns) of the labelled dataframe
df_spanish.shape


Out[23]:
(27314, 98)

In [24]:
# Number of businesses per label (2 = Spanish, 1 = other cultural label, 0 = none)
df_spanish.groupby('cuisine_Spanish').size()


Out[24]:
cuisine_Spanish
0     8458
1    16323
2     2533
dtype: int64

In [25]:
# Write the labelled dataframe to a pickle file in the current working directory.
df_spanish.to_pickle('Yelp_Cuisine_Spanish.pkl')

In [ ]:


In [ ]: