In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_pickle('UC02_df_uc_food_att.p')

In [4]:
data.columns


Out[4]:
Index([                        u'address',                      u'attributes',
                           u'business_id',                      u'categories',
                                  u'city',                           u'hours',
                               u'is_open',                        u'latitude',
                             u'longitude',                            u'name',
                          u'neighborhood',                     u'postal_code',
                          u'review_count',                           u'stars',
                                 u'state',                            u'type',
                      u'AcceptsInsurance',                     u'AgesAllowed',
                               u'Alcohol',                 u'Ambience_casual',
                       u'Ambience_classy',                  u'Ambience_divey',
                      u'Ambience_hipster',               u'Ambience_intimate',
                     u'Ambience_romantic',               u'Ambience_touristy',
                       u'Ambience_trendy',                u'Ambience_upscale',
                                  u'BYOB',                     u'BYOBCorkage',
                     u'BestNights_friday',               u'BestNights_monday',
                   u'BestNights_saturday',               u'BestNights_sunday',
                   u'BestNights_thursday',              u'BestNights_tuesday',
                  u'BestNights_wednesday',                     u'BikeParking',
                u'BusinessAcceptsBitcoin',      u'BusinessAcceptsCreditCards',
                u'BusinessParking_garage',             u'BusinessParking_lot',
                u'BusinessParking_street',           u'BusinessParking_valet',
             u'BusinessParking_validated',               u'ByAppointmentOnly',
                                u'Caters',                       u'CoatCheck',
                               u'Corkage',  u'DietaryRestrictions_dairy-free',
       u'DietaryRestrictions_gluten-free',       u'DietaryRestrictions_halal',
            u'DietaryRestrictions_kosher',    u'DietaryRestrictions_soy-free',
             u'DietaryRestrictions_vegan',  u'DietaryRestrictions_vegetarian',
                           u'DogsAllowed',                       u'DriveThru',
                        u'GoodForDancing',                     u'GoodForKids',
                 u'GoodForMeal_breakfast',              u'GoodForMeal_brunch',
                   u'GoodForMeal_dessert',              u'GoodForMeal_dinner',
                 u'GoodForMeal_latenight',               u'GoodForMeal_lunch',
                             u'HappyHour',                           u'HasTV',
                u'Music_background_music',                        u'Music_dj',
                         u'Music_jukebox',                   u'Music_karaoke',
                            u'Music_live',                  u'Music_no_music',
                           u'Music_video',                      u'NoiseLevel',
                           u'Open24Hours',                  u'OutdoorSeating',
                     u'RestaurantsAttire',       u'RestaurantsCounterService',
                   u'RestaurantsDelivery',        u'RestaurantsGoodForGroups',
                u'RestaurantsPriceRange2',         u'RestaurantsReservations',
               u'RestaurantsTableService',              u'RestaurantsTakeOut',
                               u'Smoking',            u'WheelchairAccessible',
                                  u'WiFi'],
      dtype='object')

In [80]:
X = data.iloc[:, 16:]

In [18]:
len(X)


Out[18]:
334

In [16]:
X.isnull().sum()


Out[16]:
AcceptsInsurance              291
AgesAllowed                    94
Alcohol                       333
Ambience_casual                80
Ambience_classy                84
Ambience_divey                 94
Ambience_hipster              285
Ambience_intimate              94
Ambience_romantic             332
Ambience_touristy             332
Ambience_trendy                 2
Ambience_upscale               79
BYOB                           86
BYOBCorkage                   285
BestNights_friday             322
BestNights_monday             291
BestNights_saturday           299
BestNights_sunday              73
BestNights_thursday           295
BestNights_tuesday             30
BestNights_wednesday           86
BikeParking                    94
BusinessAcceptsBitcoin        285
BusinessAcceptsCreditCards    132
BusinessParking_garage         94
BusinessParking_lot           106
BusinessParking_street         86
BusinessParking_valet         332
BusinessParking_validated     303
ByAppointmentOnly             302
                             ... 
GoodForKids                    94
GoodForMeal_breakfast          94
GoodForMeal_brunch             30
GoodForMeal_dessert           291
GoodForMeal_dinner             86
GoodForMeal_latenight          63
GoodForMeal_lunch               4
HappyHour                     332
HasTV                         315
Music_background_music         81
Music_dj                       86
Music_jukebox                 291
Music_karaoke                 319
Music_live                    291
Music_no_music                285
Music_video                    30
NoiseLevel                     30
Open24Hours                   291
OutdoorSeating                 94
RestaurantsAttire             285
RestaurantsCounterService     332
RestaurantsDelivery            86
RestaurantsGoodForGroups       30
RestaurantsPriceRange2        280
RestaurantsReservations       288
RestaurantsTableService       282
RestaurantsTakeOut            332
Smoking                        86
WheelchairAccessible          291
WiFi                          332
dtype: int64

In [81]:
X = pd.get_dummies(X, dummy_na=False, drop_first=True)

In [82]:
X


Out[82]:
AcceptsInsurance_True AgesAllowed_True Ambience_casual_full_bar Ambience_casual_none Ambience_classy_True Ambience_divey_loud Ambience_divey_quiet Ambience_divey_very_loud Ambience_hipster_True Ambience_intimate_dressy ... Open24Hours_True OutdoorSeating_True RestaurantsAttire_True RestaurantsDelivery_True RestaurantsPriceRange2_True RestaurantsReservations_True RestaurantsReservations_outdoor RestaurantsTableService_True Smoking_True WheelchairAccessible_True
iLxG2Oo8HNSZFvJvba4W6g 0 0 1 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 1 0
-zEpEmDfFQL-ph0N3BDlXA 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
D6SCh4BwNb52wZIqXdS4JQ 1 0 1 0 1 1 0 0 1 0 ... 0 1 0 0 0 0 1 0 1 1
5lw5rrhFMz-liykUn8dGSw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
rP-MueqMyZbiDbOEUELXtQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
xLeOANU-lqtDOIV6Owm3EQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2YnUIGGmhoJX4Cd61nQLkg 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
t952QLgrh-5l1woJ4IFgUA 0 0 0 1 0 0 1 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
xWGn7oqLOZu_00H8pZOtsQ 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Epr8Ft7LwYAzotFwxQX6uw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
HbK6IfznbVToEaKVC0WcSg 1 0 1 0 1 0 0 0 1 0 ... 0 1 0 0 1 0 1 0 0 0
myyJxx7nvs-5_T-Uvc-kLw 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
uf8WNaHKKssMUbQVg8nJ7g 0 0 0 1 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BLtzUCwmGOSVgPJ78N3ORA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
ETxB6iSdRyhX4PTY0AnAIA 0 0 0 0 1 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
gEJZ5USuvC1r8_QvSvDCNA 0 0 0 1 0 0 1 0 0 0 ... 0 1 0 0 0 0 0 0 1 0
6o6XxnI8RWhnAVSdxqCJWQ 0 0 0 1 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 1 0
SFh9uHtGR1lVPH2RTS2SWQ 0 0 0 1 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
cGuUEFw374zfpr3Cp8ywJw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Bfh_ON-LQUcJerjcDF5KNA 0 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
pCer-vIZM05p423MFmiSng 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
pjO0ZRGpCwlaOzipZDa3HA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5jXwI9WpDxGDmzmkmE0RTg 0 0 1 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
Gvox18F9QxsSS9qG2O8E-A 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
hX-foSZsr_fW_n7ydo4D0Q 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Ih9my8hp4KqcyBXI9Cyevg 1 0 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 1 0
Icj9QzQxqvzOmPm_liAq1Q 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
M8G8S2takaE_NOVutGzEkg 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
-XH3-e3jDVdjHJq7GfNY7w 0 0 0 1 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
e0prCZXtHGQIKeQ_wTW3uw 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
vVdY2qDO7dkUrU8LXYZ1uA 0 0 0 1 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 1 0
BKNPoWf3bQoxFWf-Kjt3wQ 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
O3zVw-VAcJi8ta4B6v5iAA 0 0 1 0 1 0 0 0 0 0 ... 0 0 0 1 0 0 1 0 0 0
ZxQlHVm0pj0ERqpwhEHc6w 0 0 0 1 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
J5U-nbhKSnnX7DJGT6QELg 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
s173l3ubdDLHeFIBO3ydrw 0 0 0 1 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
m6nLTIrEJHtHSYJjVavnzw 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
Ah4i15g8Ow_zphzcpulTxQ 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
NZ3A0xYdnmIG68mZeZV6AA 1 0 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
QdganL5AW9E_fNoRVroLig 0 0 0 1 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 1 0
x6fvLWdx-Kb8_oMeqaHhRQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
z-Q3uNXOjr7iYpl9kexFzQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
UTrqsP-oQrwoIOmBv_Jtzg 0 1 0 1 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
w2xwgqHM7dJlHfyu4ouhBg 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
dn9lwYUxmhs_mLKPu7L25Q 1 0 1 0 1 0 0 0 0 0 ... 0 1 1 0 0 0 1 0 0 0
K2lscpx0S9wpqNz0s9AIwQ 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
a39B-BfbCipNkNH7vwCfug 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
OrX0D6xwxmymxfzWRatT7Q 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
JbWQtVLQDEOBlvbVWk2OFg 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 1 0
o80p16nM5CWbl22JP1nB-w 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
xE4WM5ZvtBkFnojFUTQfqg 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
xeN57OX3lJfY96nfEx63fg 0 0 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
nplkC6vnh4qT9xH-vhup6w 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4cPM1Hcj3UiN3fxyOsiR9w 0 0 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
DysXlYb_WVlksoip8atIvQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
XbHxWOciYlBhJOjKRQbo9g 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
mofOjB6flg-eAWOFbOkHfQ 0 0 1 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
vRAM7lfJDY6pLZlxd4ge1Q 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
swdilw4Vbk-196sTVdMo-A 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
-i3dOjumvOw-52aGXU1xDg 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

334 rows × 65 columns


In [44]:
from sklearn.preprocessing import label_binarize

In [70]:
y = data['stars']
def good_bad(x):
    if x >= 4.0:
        return 1
    else:
        return 0
y = y.apply(good_bad)

In [66]:
y = data['stars'].astype(str)

In [59]:
list(y.unique())


Out[59]:
['3.0', '4.5', '4.0', '5.0', '3.5', '1.0', '2.5', '2.0', '1.5']

In [60]:
y = label_binarize(y, classes=list(y.unique()))

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [84]:
rf = RandomForestClassifier(n_estimators=1000, random_state=1)
rf = rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print roc_auc_score(y_test, y_predict, average=None)


0.624823943662

In [85]:
X.head()


Out[85]:
AcceptsInsurance_True AgesAllowed_True Ambience_casual_full_bar Ambience_casual_none Ambience_classy_True Ambience_divey_loud Ambience_divey_quiet Ambience_divey_very_loud Ambience_hipster_True Ambience_intimate_dressy ... Open24Hours_True OutdoorSeating_True RestaurantsAttire_True RestaurantsDelivery_True RestaurantsPriceRange2_True RestaurantsReservations_True RestaurantsReservations_outdoor RestaurantsTableService_True Smoking_True WheelchairAccessible_True
iLxG2Oo8HNSZFvJvba4W6g 0 0 1 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 1 0
-zEpEmDfFQL-ph0N3BDlXA 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
D6SCh4BwNb52wZIqXdS4JQ 1 0 1 0 1 1 0 0 1 0 ... 0 1 0 0 0 0 1 0 1 1
5lw5rrhFMz-liykUn8dGSw 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
rP-MueqMyZbiDbOEUELXtQ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 65 columns


In [52]:
rf.feature_importances_


Out[52]:
array([  7.08695551e-04,   2.06634883e-03,   4.34665938e-03,
         5.03297170e-03,   1.08354919e-03,   1.53620781e-02,
         1.62823596e-02,   1.80114810e-03,   1.67958944e-02,
         3.00554804e-03,   8.30724616e-03,   1.80113435e-02,
         7.48980642e-03,   3.84432887e-03,   3.39230978e-03,
         1.11355238e-03,   7.52062326e-04,   3.36672656e-03,
         6.24440868e-03,   0.00000000e+00,   8.32592635e-04,
         3.80086042e-03,   0.00000000e+00,   1.35057481e-02,
         5.89276382e-03,   5.29669822e-03,   7.62559073e-03,
         1.02760482e-03,   1.34006380e-03,   1.07889008e-02,
         3.75518228e-03,   4.45420953e-04,   1.24656731e-02,
         1.47229267e-02,   1.13829971e-02,   5.97326995e-03,
         0.00000000e+00,   6.07759961e-04,   7.19162873e-03,
         4.53248785e-03,   6.61315983e-03,   1.14859418e-03,
         1.00969076e-02,   7.85605326e-03,   0.00000000e+00,
         1.80354062e-03,   1.97225241e-02,   2.04615671e-02,
         2.07151550e-02,   6.27180613e-03,   2.69253569e-02,
         3.47211350e-03,   1.10906329e-02,   0.00000000e+00,
         2.57384098e-03,   9.42248213e-05,   1.16468480e-02,
         0.00000000e+00,   2.14355364e-02,   2.37136476e-02,
         1.92164451e-02,   1.71599747e-03,   1.30621469e-02,
         1.53760928e-02,   0.00000000e+00,   0.00000000e+00,
         7.28479026e-03,   6.11827271e-03,   9.71579664e-03,
         3.85889954e-03,   2.91576335e-02,   2.40160697e-02,
         2.63775773e-03,   2.87838242e-03,   8.22989673e-04,
         4.86899912e-03,   6.04050649e-03,   8.68059154e-03,
         1.18679078e-02,   1.69991812e-02,   1.03044012e-02,
         6.17210930e-03,   3.01120654e-03,   1.50453312e-02,
         1.26807987e-03,   2.40146995e-02,   6.90181167e-03,
         5.41047616e-03,   1.65603005e-03,   0.00000000e+00,
         3.79754380e-03,   3.30567549e-02,   1.55401802e-02,
         3.44982923e-02,   5.12798070e-03,   2.38013460e-03,
         0.00000000e+00,   5.15156015e-04,   7.08011372e-03,
         1.96277394e-02,   1.69055014e-02,   1.27257664e-03,
         2.72101801e-03,   2.48652326e-03,   0.00000000e+00,
         1.34384643e-03,   1.01046127e-02,   4.16048281e-03,
         5.01987723e-03,   2.75382291e-03,   5.13855683e-03,
         2.98582637e-02,   5.52172487e-03,   4.53506205e-03,
         7.33982020e-03,   7.45133173e-04,   2.55157156e-03,
         2.45231122e-02,   1.86727509e-03,   3.87602835e-03,
         3.93666043e-04,   1.58354507e-03,   8.46021549e-05,
         6.12744552e-03,   6.57586204e-03,   3.06295662e-03,
         1.61247640e-03,   0.00000000e+00,   1.83856872e-04,
         1.07195123e-03,   3.41867141e-03,   2.74444544e-03,
         0.00000000e+00,   2.59908490e-02,   3.36744914e-03,
         1.12271249e-03,   3.13690064e-03,   1.23458247e-03])

In [86]:
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), X.columns), 
             reverse=True)


Features sorted by their score:
[(0.0516, 'GoodForMeal_latenight_True'), (0.0486, 'DietaryRestrictions_kosher_True'), (0.0485, 'GoodForMeal_lunch_2'), (0.0456, 'BusinessParking_lot_free'), (0.0413, 'Music_background_music_True'), (0.0413, 'Caters_True'), (0.0409, 'DietaryRestrictions_gluten-free_True'), (0.0403, 'Ambience_casual_none'), (0.0385, 'GoodForMeal_brunch_True'), (0.038, 'Music_video_True'), (0.0346, 'BusinessAcceptsCreditCards_True'), (0.0342, 'OutdoorSeating_True'), (0.0323, 'BusinessParking_garage_True'), (0.0292, 'Smoking_True'), (0.0287, 'BestNights_sunday_True'), (0.0271, 'Ambience_classy_True'), (0.026, 'GoodForDancing_True'), (0.0245, 'Corkage_True'), (0.0245, 'Ambience_divey_quiet'), (0.0228, 'Ambience_casual_full_bar'), (0.0223, 'Ambience_upscale_True'), (0.0221, 'BestNights_saturday_True'), (0.02, 'DriveThru_True'), (0.0116, 'Ambience_trendy_True'), (0.0114, 'BestNights_thursday_yes_free'), (0.0106, 'Ambience_divey_loud'), (0.0105, 'DietaryRestrictions_halal_True'), (0.0092, 'GoodForKids_True'), (0.0092, 'BikeParking_True'), (0.0085, 'RestaurantsPriceRange2_True'), (0.0084, 'BestNights_friday_True'), (0.008, 'AcceptsInsurance_True'), (0.0071, 'RestaurantsAttire_True'), (0.0069, 'GoodForMeal_breakfast_True'), (0.0063, 'BusinessParking_lot_paid'), (0.0063, 'Ambience_divey_very_loud'), (0.0063, 'AgesAllowed_True'), (0.0058, 'RestaurantsReservations_outdoor'), (0.0058, 'GoodForMeal_lunch_3'), (0.0054, 'BusinessParking_street_True'), (0.0054, 'BYOB_True'), (0.0053, 'BestNights_tuesday_True'), (0.0053, 'Ambience_hipster_True'), (0.0052, 'RestaurantsTableService_True'), (0.0052, 'Music_karaoke_True'), (0.0051, 'NoiseLevel_True'), (0.005, 'Music_jukebox_True'), (0.0048, 'Ambience_intimate_dressy'), (0.0047, 'Music_dj_True'), (0.0045, 'Music_live_True'), (0.0042, 'WheelchairAccessible_True'), (0.0038, 'RestaurantsDelivery_True'), (0.0034, 'BestNights_monday_True'), (0.0033, 'GoodForMeal_dessert_True'), (0.0032, 'DietaryRestrictions_vegetarian_True'), (0.0027, 'Open24Hours_True'), (0.0019, 'HasTV_True'), (0.0019, 'Ambience_intimate_formal'), (0.0016, 'BestNights_thursday_yes_corkage'), (0.0015, 'Music_no_music_True'), (0.0006, 'RestaurantsReservations_True'), (0.0006, 'DietaryRestrictions_dairy-free_True'), (0.0002, 'GoodForMeal_dinner_True'), (0.0, 'ByAppointmentOnly_True'), (0.0, 'BusinessAcceptsBitcoin_True')]

In [ ]: