In [1]:
import pandas as pd
import numpy as np

In [6]:
# Load the pre-processed Yelp business table (the recorded run shows 27314 rows x 98 columns).
# The path is kept verbatim, including the 'data_processeing' spelling -- presumably the
# real directory name; verify before "fixing" it.
# NOTE(review): read_pickle can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
CUISINE_PICKLE = '../data_processeing/Yelp_Cuisine_Chinese.pkl'
df = pd.read_pickle(CUISINE_PICKLE)
print(df.shape)  # single-argument parenthesised form prints the same on Python 2 and 3
df.iloc[:1]      # preview of the first row


(27314, 98)
Out[6]:
address attributes business_id categories city hours is_open latitude longitude name ... RestaurantsDelivery RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg 979 Bloor Street W [{u'Alcohol': u'none'}, {u'Ambience': {u'roman... EDqCEAGXVGCH4FJXgqtjqg [Restaurants, Pizza, Chicken Wings, Italian] Toronto [Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ... 1 43.661054 -79.429089 Pizza Pizza ... False False NaN NaN NaN NaN False NaN NaN 1

1 rows × 98 columns


In [7]:
# Number of businesses per name, restricted to names that occur more than 100 times.
# The per-name counts are computed once and reused for both the values and the mask
# (previously the same groupby was evaluated twice).
name_counts = df.groupby(df.name).size()
name_counts[name_counts > 100]


Out[7]:
name
Chipotle Mexican Grill    156
Domino's Pizza            170
KFC                       116
McDonald's                139
Panda Express             123
Panera Bread              119
Pizza Hut                 108
Subway                    160
Taco Bell                 215
dtype: int64

In [8]:
# Rows whose cuisine_Chinese label equals 2.
# NOTE(review): 2 is presumably the "Chinese" class (the recorded output lists e.g.
# Panda Express and P.F. Chang's) -- confirm against the code that builds the label.
df_cn = df[df.cuisine_Chinese == 2]

# Per-name counts within that subset, computed once, keeping names seen more than 10 times.
cn_name_counts = df_cn.groupby(df_cn.name).size()
cn_name_counts[cn_name_counts > 10]


Out[8]:
name
Noodles & Company      19
P.F. Chang's           15
Panda Express         123
Pei Wei                20
Teriyaki Madness       12
The Captain's Boil     11
dtype: int64

In [9]:
# Keep only rows with a non-zero cuisine_Chinese label. Filtering first avoids copying
# the whole input frame just to discard part of it.
df2 = df[df.cuisine_Chinese != 0]
print('%d %s' % (len(df), df2.shape))  # rows before filtering, shape after filtering

# Drop columns that are entirely NaN for the remaining rows. A non-inplace dropna plus an
# explicit copy makes df2 an independent frame, so the cell does not mutate a filtered
# slice in place and can be re-run safely from `df`.
df2 = df2.dropna(axis=1, how='all').copy()
print(df2.shape)


27314 (18856, 98)
(18856, 90)

In [11]:
# Class balance: number of rows per cuisine_Chinese label in the filtered frame.
df2.groupby('cuisine_Chinese').size()


Out[11]:
cuisine_Chinese
1    16163
2     2693
dtype: int64

In [6]:
# Attribute columns only; a .loc label slice includes its end point, so 'WiFi' is kept.
df_att = df2.loc[:, u'AcceptsInsurance':'WiFi']

# Classify each attribute column by the distinct non-null values it takes:
#   'b' -> only True/False values (binary)    -> listed in col_b
#   'm' -> anything else (multi-valued)       -> listed in col_m
# att_type maps column name -> 'b' / 'm'.
#
# NOTE(review): in the recorded output several labels are paired with values that look
# like they belong to a different attribute (e.g. 'Ambience_classy' shows
# 'beer_and_wine' / 'full_bar' / 'none'). The column labels may be misaligned upstream;
# verify before relying on col_b / col_m.
BINARY_VALUES = set([False, True])
att_type = {}
col_b = []
col_m = []
for col_idx in range(df_att.shape[1]):
    # groupby skips NaN keys, so `levels` holds the sorted distinct non-null values and
    # carries the column name as its index name.
    levels = df_att.groupby(df_att.iloc[:, col_idx]).size().keys()
    print('%s :  %s' % (levels.name, levels.values))
    # A subset test alone is enough: equality with {False, True} is a special case of it.
    if set(levels.values).issubset(BINARY_VALUES):
        kind, bucket = 'b', col_b
    else:
        kind, bucket = 'm', col_m
    att_type[levels.name] = kind
    bucket.append(levels.name)

print('%d %d' % (len(col_b), len(col_m)))
col_all = col_b + col_m  # binary columns first, then multi-valued ones


AcceptsInsurance :  [False True]
AgesAllowed :  [False True]
Alcohol :  [False True]
Ambience_casual :  [False True]
Ambience_classy :  ['beer_and_wine' 'full_bar' 'none']
Ambience_divey :  [False True]
Ambience_hipster :  ['average' 'loud' 'quiet' 'very_loud']
Ambience_intimate :  [False True]
Ambience_romantic :  ['casual' 'dressy' 'formal']
Ambience_touristy :  [False True]
Ambience_trendy :  [False True]
Ambience_upscale :  [False True]
BYOB :  [False True]
BYOBCorkage :  [False True]
BestNights_friday :  [False]
BestNights_monday :  [False True]
BestNights_saturday :  [False True]
BestNights_sunday :  [False True]
BestNights_thursday :  [False 'yes_corkage' 'yes_free']
BestNights_tuesday :  [False True]
BestNights_wednesday :  [False True]
BikeParking :  [False True]
BusinessAcceptsBitcoin :  [False True]
BusinessAcceptsCreditCards :  [False True]
BusinessParking_garage :  [False True]
BusinessParking_lot :  [False 'free' 'paid']
BusinessParking_valet :  [False True]
BusinessParking_validated :  [False True]
ByAppointmentOnly :  [False True]
CoatCheck :  [False True]
Corkage :  [False True]
DietaryRestrictions_dairy-free :  [False True]
DietaryRestrictions_halal :  [False True]
DietaryRestrictions_kosher :  [False True]
DietaryRestrictions_soy-free :  [False True]
DietaryRestrictions_vegan :  [False True]
DietaryRestrictions_vegetarian :  [False True]
DogsAllowed :  [False True]
GoodForDancing :  [False True]
GoodForKids :  [False True]
GoodForMeal_breakfast :  [False True]
GoodForMeal_brunch :  [False True]
GoodForMeal_dessert :  [False True]
GoodForMeal_dinner :  [False True]
GoodForMeal_latenight :  [False True]
GoodForMeal_lunch :  [False True]
HairSpecializesIn_africanamerican :  [False True]
HairSpecializesIn_asian :  [False True]
HairSpecializesIn_coloring :  [False True]
HairSpecializesIn_curly :  [False True]
HairSpecializesIn_kids :  [1 2 3 4]
HairSpecializesIn_perms :  ['18plus' '19plus' '21plus' 'allages']
HairSpecializesIn_straightperms :  [False True]
HappyHour :  [False True]
HasTV :  [False True]
Music_background_music :  [False True]
Music_dj :  [False True]
Music_jukebox :  [False True]
Music_live :  [False True]
Music_no_music :  [False True]
Music_video :  [False True]
Open24Hours :  [False True]
OutdoorSeating :  [False True]
RestaurantsAttire :  [False True]
RestaurantsCounterService :  [False True]
RestaurantsDelivery :  [False True]
RestaurantsGoodForGroups :  [False True]
RestaurantsPriceRange2 :  [False True]
RestaurantsReservations :  [False True 'outdoor']
RestaurantsTableService :  [False True]
RestaurantsTakeOut :  [False True]
Smoking :  [False True]
WiFi :  [False True]
65 8

Other potentially useful columns: 'city', 'state', 'hours', 'review_count', 'stars'


In [7]:
# Binary attribute columns plus the class label as the last column.
# join() aligns on the index; df2's rows keep their index labels from df, so each row
# picks up its own cuisine_Chinese value.
df_att_b = df2[col_b].join(df['cuisine_Chinese'])
df_att_b.iloc[:1, :-1]  # preview the first row without the trailing label column


Out[7]:
AcceptsInsurance AgesAllowed Alcohol Ambience_casual Ambience_divey Ambience_intimate Ambience_touristy Ambience_trendy Ambience_upscale BYOB ... OutdoorSeating RestaurantsAttire RestaurantsCounterService RestaurantsDelivery RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsTableService RestaurantsTakeOut Smoking WiFi
EDqCEAGXVGCH4FJXgqtjqg NaN False NaN NaN True NaN NaN NaN True False ... False NaN NaN False False NaN NaN NaN False NaN

1 rows × 65 columns


In [8]:
# Multi-valued attribute columns plus the class label as the last column.
# join() aligns on the index; df2's rows keep their index labels from df, so each row
# picks up its own cuisine_Chinese value.
df_att_m = df2[col_m].join(df['cuisine_Chinese'])
df_att_m.iloc[:1]  # preview the first row


Out[8]:
Ambience_classy Ambience_hipster Ambience_romantic BestNights_thursday BusinessParking_lot HairSpecializesIn_kids HairSpecializesIn_perms RestaurantsReservations cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg none quiet casual NaN free 1 NaN NaN 1

In [9]:
# All attribute columns (binary and multi-valued) plus the class label.
df_att_all = df_att.join(df['cuisine_Chinese'])

# Write the three feature tables to the working directory.
# index_label=False omits the header field for the index column (per the pandas docs this
# eases importing into R) -- presumably why it is used here; confirm with the consumer.
for frame, csv_name in (
    (df_att_b, 'df_att_b.csv'),
    (df_att_m, 'df_att_m.csv'),
    (df_att_all, 'df_att_all.csv'),
):
    frame.to_csv(csv_name, index_label=False)

In [10]:
# from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [11]:
# mod_b = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)

In [12]:
# mod_b.fit(X=df_att_b.iloc[:,:-1], y=df_att_b.cuisine_Chinese)

Note: the naive Bayes estimators in scikit-learn (e.g. `BernoulliNB`) cannot handle missing values (NaN), so the model fit above is left commented out.