In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_pickle('../data_processeing/Yelp_Cuisine_Chinese.pkl')

In [15]:
df_chi = df[df['cuisine_Chinese'] == 2]

In [8]:
df.columns


Out[8]:
Index([                          u'address',
                              u'attributes',
                             u'business_id',
                              u'categories',
                                    u'city',
                                   u'hours',
                                 u'is_open',
                                u'latitude',
                               u'longitude',
                                    u'name',
                            u'neighborhood',
                             u'postal_code',
                            u'review_count',
                                   u'stars',
                                   u'state',
                                    u'type',
                        u'AcceptsInsurance',
                             u'AgesAllowed',
                                 u'Alcohol',
                         u'Ambience_casual',
                         u'Ambience_classy',
                          u'Ambience_divey',
                        u'Ambience_hipster',
                       u'Ambience_intimate',
                       u'Ambience_romantic',
                       u'Ambience_touristy',
                         u'Ambience_trendy',
                        u'Ambience_upscale',
                                    u'BYOB',
                             u'BYOBCorkage',
                       u'BestNights_friday',
                       u'BestNights_monday',
                     u'BestNights_saturday',
                       u'BestNights_sunday',
                     u'BestNights_thursday',
                      u'BestNights_tuesday',
                    u'BestNights_wednesday',
                             u'BikeParking',
                  u'BusinessAcceptsBitcoin',
              u'BusinessAcceptsCreditCards',
                  u'BusinessParking_garage',
                     u'BusinessParking_lot',
                  u'BusinessParking_street',
                   u'BusinessParking_valet',
               u'BusinessParking_validated',
                       u'ByAppointmentOnly',
                                  u'Caters',
                               u'CoatCheck',
                                 u'Corkage',
          u'DietaryRestrictions_dairy-free',
         u'DietaryRestrictions_gluten-free',
               u'DietaryRestrictions_halal',
              u'DietaryRestrictions_kosher',
            u'DietaryRestrictions_soy-free',
               u'DietaryRestrictions_vegan',
          u'DietaryRestrictions_vegetarian',
                             u'DogsAllowed',
                               u'DriveThru',
                          u'GoodForDancing',
                             u'GoodForKids',
                   u'GoodForMeal_breakfast',
                      u'GoodForMeal_brunch',
                     u'GoodForMeal_dessert',
                      u'GoodForMeal_dinner',
                   u'GoodForMeal_latenight',
                       u'GoodForMeal_lunch',
       u'HairSpecializesIn_africanamerican',
                 u'HairSpecializesIn_asian',
              u'HairSpecializesIn_coloring',
                 u'HairSpecializesIn_curly',
            u'HairSpecializesIn_extensions',
                  u'HairSpecializesIn_kids',
                 u'HairSpecializesIn_perms',
         u'HairSpecializesIn_straightperms',
                               u'HappyHour',
                                   u'HasTV',
                  u'Music_background_music',
                                u'Music_dj',
                           u'Music_jukebox',
                           u'Music_karaoke',
                              u'Music_live',
                          u'Music_no_music',
                             u'Music_video',
                              u'NoiseLevel',
                             u'Open24Hours',
                          u'OutdoorSeating',
                       u'RestaurantsAttire',
               u'RestaurantsCounterService',
                     u'RestaurantsDelivery',
                u'RestaurantsGoodForGroups',
                  u'RestaurantsPriceRange2',
                 u'RestaurantsReservations',
                 u'RestaurantsTableService',
                      u'RestaurantsTakeOut',
                                 u'Smoking',
                    u'WheelchairAccessible',
                                    u'WiFi',
                         u'cuisine_Chinese'],
      dtype='object')

In [20]:
df_chi['neighborhood']


Out[20]:
q_KQbgnaYDlPx8EHTydcBQ              Brown's Corners
2px99IppAcnxR238eq_8_w                             
TkXbFJtFCdM_WTLkHa6Erw                             
dYmm5468BdWxWgksXpy2TQ                             
OS5IyYEXgYV9_Z0iS4MkOw                 St. Lawrence
01aNlDhbMObjc9OdAHuNpQ                             
KeQ1cK564cL5C_hBTFrqnA           Mississauga Valley
ZcMRFFuvFt5gH2hdnpyigg    Rosemont-La Petite-Patrie
1uPQTz5XZSWr0ti7kUpaDw                  Ville-Marie
oWDLyENvjBa5LZdRGGOrow                             
3PhF188adiC8ZutKRP9gyQ                             
rb4d1_OtpVD91J5NDp4Y3w                    South End
cXdQ3H0GqcIzRgrvBEaBxw                             
-GDfRHphXjccSWhliFHnzg                             
ls3r7doDW0p65cwMWxEIdQ                  Scarborough
VzUo-RURV3VnfNItAYM8yg                Spring Valley
98gLNkYYJCg4f_uJ7o2wBg                Plaza Midwood
3GfdCuI0YCc5U3rLLLPHUw                             
N7wlv3nrhDPuDDY8yX6QvQ                             
H_eO04NZAQIDcbtFQ4BUag                     Westside
f5xm2RiwLv0gbmXU4BkrGA                   Unionville
7sMxjqfidcS1VTqdXBWhnA                             
NiviplzqF1RtwLvYnL79fQ                             
0BEaW7yAystJ2OG-S0jv-Q                             
2xbxXWeu3tpZEJdGTTGbLg                     Westside
q0oPX1DXW86QytTvvrD9MA                  Scarborough
0C6uuZBD7xKS-uucgwoyQw                             
N3zuaqGESF5iZsi_md9c1Q                Spring Valley
KUI1j3pYVVQ6ddLqTS0GWw                             
qMJCVx6-ZYvZQHlfrZ_cug                             
                                    ...            
R8Mw0iFenJ3nwXuZb7Ql4A                             
u6JJojkbqVlVI4NqKsL54Q                Bickford Park
5XX1w4WbwSlrACQ5vKuLDw                  Ville-Marie
B6xVgae6x4Xm5wVNrxaHiw                             
DIxgItsSI9QwX9H8lVptYg                    Northwest
_5XClj4E5VCIsEscbrrPKg                 Little Italy
2reHgsuAWofHfocfJ4Xs5w                    Chinatown
fojiPcgWlM-FJTxZlz4Idg                             
-yzBFD877La-RP739VxF7w                   Willowdale
bj6Km1RvgD6oyppCe5yDog                             
D5tehjlw9ZIoUzhOEblFqA                             
Gg0sdZWtkk9yEgpS5W_PIA                             
VbY4yD6UmA9jKKxHscrxvg                             
mofOjB6flg-eAWOFbOkHfQ                             
aXnWWp6Reaep4-0iKWlYHQ                Downtown Core
pFg0ZuTVVZMZpANZXI0LtA                             
bM-rAr7gxsxxjD-FlRcIJA                             
phTLTkqSvylLkmz5wAwhTw                    Northwest
XdfYrFt4tkHn3TZWtBPDWw                             
2YBO1LEKIgyle0uX50u15Q                             
QpOAv584eb-ecG4TLEm95g                    Southeast
JPfi__QJAaRzmfh5aOyFEw                             
osvcl7Fh_J5VwUikh5Vskg                     Milliken
9Z_6rRy7Tl_C6HIgm7y6FA                             
uLUl_dMl6a9m774LGReDVA                             
tquWyW3Mm5ka8LJtzodALg                             
9iJMGMsTK-q6W5MB1_Ny3Q                             
w5CSi-An5meLnxjKSFn0wQ                             
Q3UkgxNNInsPcUFhsQFcrg                             
OgwN65jZebPRIPSmNpRP7A                             
Name: neighborhood, dtype: object

In [18]:
X = df_chi.iloc[:, 16:-1]

In [48]:
y = df_chi['stars']

In [49]:
y.plot(kind='kde')


Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f15b92d5a90>

In [31]:
X = pd.get_dummies(X, dummy_na=False, drop_first=True)

In [50]:
y.value_counts()


Out[50]:
3.5    862
4.0    725
3.0    544
4.5    228
2.5    210
2.0     69
5.0     33
1.5     15
1.0      7
Name: stars, dtype: int64

In [35]:
X.sum().sort_values()


Out[35]:
HairSpecializesIn_straightperms_True         1
BusinessParking_validated_True               1
GoodForDancing_True                          1
Ambience_intimate_True                       2
GoodForKids_True                             2
CoatCheck_True                               2
Ambience_romantic_formal                     3
DietaryRestrictions_vegan_True               3
DietaryRestrictions_dairy-free_True          4
ByAppointmentOnly_True                       4
Music_live_True                              4
RestaurantsReservations_True                 5
HairSpecializesIn_africanamerican_True       6
Open24Hours_True                             7
BestNights_thursday_yes_corkage              7
BestNights_saturday_True                     7
BusinessAcceptsBitcoin_True                  8
Ambience_trendy_True                         8
RestaurantsTakeOut_True                      8
BestNights_monday_True                      10
BestNights_tuesday_True                     11
Music_jukebox_True                          11
BusinessParking_lot_paid                    12
DietaryRestrictions_kosher_True             12
RestaurantsTableService_True                15
HairSpecializesIn_coloring_True             15
HairSpecializesIn_kids_4                    17
BestNights_wednesday_True                   17
Ambience_touristy_True                      18
GoodForMeal_breakfast_True                  22
                                          ... 
RestaurantsCounterService_True              64
GoodForMeal_latenight_True                  69
GoodForMeal_dinner_True                     93
HairSpecializesIn_kids_3                    99
BYOBCorkage_True                           123
Music_video_True                           133
AgesAllowed_True                           147
Ambience_hipster_loud                      153
GoodForMeal_lunch_True                     455
HairSpecializesIn_curly_True               532
Ambience_hipster_quiet                     615
Ambience_classy_full_bar                   647
BusinessParking_lot_free                   713
HappyHour_True                             936
RestaurantsPriceRange2_True               1061
Ambience_divey_True                       1065
BusinessAcceptsCreditCards_True           1085
Ambience_classy_none                      1117
Music_no_music_True                       1144
OutdoorSeating_True                       1193
BYOB_True                                 1291
Smoking_True                              1324
BusinessParking_garage_True               1357
HairSpecializesIn_kids_2                  1389
Corkage_True                              1665
GoodForMeal_dessert_True                  1759
Ambience_upscale_True                     2248
BestNights_sunday_True                    2313
DietaryRestrictions_soy-free_True         2347
DietaryRestrictions_halal_True            2516
dtype: int64

In [51]:
def good_bad(x):
    if x >= 4.0:
        return 1
    else:
        return 0
y = y.apply(good_bad)

In [52]:
y


Out[52]:
q_KQbgnaYDlPx8EHTydcBQ    0
2px99IppAcnxR238eq_8_w    1
TkXbFJtFCdM_WTLkHa6Erw    0
dYmm5468BdWxWgksXpy2TQ    0
OS5IyYEXgYV9_Z0iS4MkOw    0
01aNlDhbMObjc9OdAHuNpQ    1
KeQ1cK564cL5C_hBTFrqnA    1
ZcMRFFuvFt5gH2hdnpyigg    0
1uPQTz5XZSWr0ti7kUpaDw    1
oWDLyENvjBa5LZdRGGOrow    0
3PhF188adiC8ZutKRP9gyQ    0
rb4d1_OtpVD91J5NDp4Y3w    1
cXdQ3H0GqcIzRgrvBEaBxw    1
-GDfRHphXjccSWhliFHnzg    0
ls3r7doDW0p65cwMWxEIdQ    0
VzUo-RURV3VnfNItAYM8yg    1
98gLNkYYJCg4f_uJ7o2wBg    0
3GfdCuI0YCc5U3rLLLPHUw    1
N7wlv3nrhDPuDDY8yX6QvQ    0
H_eO04NZAQIDcbtFQ4BUag    1
f5xm2RiwLv0gbmXU4BkrGA    1
7sMxjqfidcS1VTqdXBWhnA    0
NiviplzqF1RtwLvYnL79fQ    1
0BEaW7yAystJ2OG-S0jv-Q    0
2xbxXWeu3tpZEJdGTTGbLg    1
q0oPX1DXW86QytTvvrD9MA    1
0C6uuZBD7xKS-uucgwoyQw    1
N3zuaqGESF5iZsi_md9c1Q    1
KUI1j3pYVVQ6ddLqTS0GWw    0
qMJCVx6-ZYvZQHlfrZ_cug    1
                         ..
R8Mw0iFenJ3nwXuZb7Ql4A    0
u6JJojkbqVlVI4NqKsL54Q    1
5XX1w4WbwSlrACQ5vKuLDw    1
B6xVgae6x4Xm5wVNrxaHiw    0
DIxgItsSI9QwX9H8lVptYg    1
_5XClj4E5VCIsEscbrrPKg    1
2reHgsuAWofHfocfJ4Xs5w    1
fojiPcgWlM-FJTxZlz4Idg    0
-yzBFD877La-RP739VxF7w    1
bj6Km1RvgD6oyppCe5yDog    0
D5tehjlw9ZIoUzhOEblFqA    0
Gg0sdZWtkk9yEgpS5W_PIA    1
VbY4yD6UmA9jKKxHscrxvg    1
mofOjB6flg-eAWOFbOkHfQ    0
aXnWWp6Reaep4-0iKWlYHQ    0
pFg0ZuTVVZMZpANZXI0LtA    0
bM-rAr7gxsxxjD-FlRcIJA    1
phTLTkqSvylLkmz5wAwhTw    0
XdfYrFt4tkHn3TZWtBPDWw    1
2YBO1LEKIgyle0uX50u15Q    0
QpOAv584eb-ecG4TLEm95g    0
JPfi__QJAaRzmfh5aOyFEw    1
osvcl7Fh_J5VwUikh5Vskg    0
9Z_6rRy7Tl_C6HIgm7y6FA    1
uLUl_dMl6a9m774LGReDVA    0
tquWyW3Mm5ka8LJtzodALg    0
9iJMGMsTK-q6W5MB1_Ny3Q    1
w5CSi-An5meLnxjKSFn0wQ    1
Q3UkgxNNInsPcUFhsQFcrg    1
OgwN65jZebPRIPSmNpRP7A    1
Name: stars, dtype: int64

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [76]:
def RF_model(X, y, test_size, random_state_split, random_state_model, n_estimators):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state_split)
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state_model)
    rf = rf.fit(X_train, y_train)
    y_predict = rf.predict(X_test)
    return roc_auc_score(y_test, y_predict)

In [86]:
RF_model(X, y, test_size=0.33, random_state_split=1, random_state_model=1, n_estimators=100)


Out[86]:
0.57762115757407917