In [1]:
import pandas as pd
import numpy as np
In [2]:
data = pd.read_pickle('UC02_df_uc_food_att.p')
In [4]:
data.columns
Out[4]:
Index([ u'address', u'attributes',
u'business_id', u'categories',
u'city', u'hours',
u'is_open', u'latitude',
u'longitude', u'name',
u'neighborhood', u'postal_code',
u'review_count', u'stars',
u'state', u'type',
u'AcceptsInsurance', u'AgesAllowed',
u'Alcohol', u'Ambience_casual',
u'Ambience_classy', u'Ambience_divey',
u'Ambience_hipster', u'Ambience_intimate',
u'Ambience_romantic', u'Ambience_touristy',
u'Ambience_trendy', u'Ambience_upscale',
u'BYOB', u'BYOBCorkage',
u'BestNights_friday', u'BestNights_monday',
u'BestNights_saturday', u'BestNights_sunday',
u'BestNights_thursday', u'BestNights_tuesday',
u'BestNights_wednesday', u'BikeParking',
u'BusinessAcceptsBitcoin', u'BusinessAcceptsCreditCards',
u'BusinessParking_garage', u'BusinessParking_lot',
u'BusinessParking_street', u'BusinessParking_valet',
u'BusinessParking_validated', u'ByAppointmentOnly',
u'Caters', u'CoatCheck',
u'Corkage', u'DietaryRestrictions_dairy-free',
u'DietaryRestrictions_gluten-free', u'DietaryRestrictions_halal',
u'DietaryRestrictions_kosher', u'DietaryRestrictions_soy-free',
u'DietaryRestrictions_vegan', u'DietaryRestrictions_vegetarian',
u'DogsAllowed', u'DriveThru',
u'GoodForDancing', u'GoodForKids',
u'GoodForMeal_breakfast', u'GoodForMeal_brunch',
u'GoodForMeal_dessert', u'GoodForMeal_dinner',
u'GoodForMeal_latenight', u'GoodForMeal_lunch',
u'HappyHour', u'HasTV',
u'Music_background_music', u'Music_dj',
u'Music_jukebox', u'Music_karaoke',
u'Music_live', u'Music_no_music',
u'Music_video', u'NoiseLevel',
u'Open24Hours', u'OutdoorSeating',
u'RestaurantsAttire', u'RestaurantsCounterService',
u'RestaurantsDelivery', u'RestaurantsGoodForGroups',
u'RestaurantsPriceRange2', u'RestaurantsReservations',
u'RestaurantsTableService', u'RestaurantsTakeOut',
u'Smoking', u'WheelchairAccessible',
u'WiFi'],
dtype='object')
In [80]:
X = data.iloc[:, 16:]
In [18]:
len(X)
Out[18]:
334
In [16]:
X.isnull().sum()
Out[16]:
AcceptsInsurance 291
AgesAllowed 94
Alcohol 333
Ambience_casual 80
Ambience_classy 84
Ambience_divey 94
Ambience_hipster 285
Ambience_intimate 94
Ambience_romantic 332
Ambience_touristy 332
Ambience_trendy 2
Ambience_upscale 79
BYOB 86
BYOBCorkage 285
BestNights_friday 322
BestNights_monday 291
BestNights_saturday 299
BestNights_sunday 73
BestNights_thursday 295
BestNights_tuesday 30
BestNights_wednesday 86
BikeParking 94
BusinessAcceptsBitcoin 285
BusinessAcceptsCreditCards 132
BusinessParking_garage 94
BusinessParking_lot 106
BusinessParking_street 86
BusinessParking_valet 332
BusinessParking_validated 303
ByAppointmentOnly 302
...
GoodForKids 94
GoodForMeal_breakfast 94
GoodForMeal_brunch 30
GoodForMeal_dessert 291
GoodForMeal_dinner 86
GoodForMeal_latenight 63
GoodForMeal_lunch 4
HappyHour 332
HasTV 315
Music_background_music 81
Music_dj 86
Music_jukebox 291
Music_karaoke 319
Music_live 291
Music_no_music 285
Music_video 30
NoiseLevel 30
Open24Hours 291
OutdoorSeating 94
RestaurantsAttire 285
RestaurantsCounterService 332
RestaurantsDelivery 86
RestaurantsGoodForGroups 30
RestaurantsPriceRange2 280
RestaurantsReservations 288
RestaurantsTableService 282
RestaurantsTakeOut 332
Smoking 86
WheelchairAccessible 291
WiFi 332
dtype: int64
In [81]:
X = pd.get_dummies(X, dummy_na=False, drop_first=True)
In [82]:
X
Out[82]:
AcceptsInsurance_True
AgesAllowed_True
Ambience_casual_full_bar
Ambience_casual_none
Ambience_classy_True
Ambience_divey_loud
Ambience_divey_quiet
Ambience_divey_very_loud
Ambience_hipster_True
Ambience_intimate_dressy
...
Open24Hours_True
OutdoorSeating_True
RestaurantsAttire_True
RestaurantsDelivery_True
RestaurantsPriceRange2_True
RestaurantsReservations_True
RestaurantsReservations_outdoor
RestaurantsTableService_True
Smoking_True
WheelchairAccessible_True
iLxG2Oo8HNSZFvJvba4W6g
0
0
1
0
1
0
0
0
0
0
...
0
1
0
0
0
0
0
0
1
0
-zEpEmDfFQL-ph0N3BDlXA
0
0
0
0
0
0
0
0
0
0
...
0
1
0
0
0
0
0
0
0
0
D6SCh4BwNb52wZIqXdS4JQ
1
0
1
0
1
1
0
0
1
0
...
0
1
0
0
0
0
1
0
1
1
5lw5rrhFMz-liykUn8dGSw
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
rP-MueqMyZbiDbOEUELXtQ
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
xLeOANU-lqtDOIV6Owm3EQ
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2YnUIGGmhoJX4Cd61nQLkg
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
1
0
t952QLgrh-5l1woJ4IFgUA
0
0
0
1
0
0
1
0
0
0
...
0
1
0
0
0
0
0
0
0
0
xWGn7oqLOZu_00H8pZOtsQ
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
Epr8Ft7LwYAzotFwxQX6uw
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
HbK6IfznbVToEaKVC0WcSg
1
0
1
0
1
0
0
0
1
0
...
0
1
0
0
1
0
1
0
0
0
myyJxx7nvs-5_T-Uvc-kLw
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
uf8WNaHKKssMUbQVg8nJ7g
0
0
0
1
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
BLtzUCwmGOSVgPJ78N3ORA
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
1
0
ETxB6iSdRyhX4PTY0AnAIA
0
0
0
0
1
0
0
0
0
1
...
0
0
0
0
0
0
0
0
0
0
gEJZ5USuvC1r8_QvSvDCNA
0
0
0
1
0
0
1
0
0
0
...
0
1
0
0
0
0
0
0
1
0
6o6XxnI8RWhnAVSdxqCJWQ
0
0
0
1
0
0
0
0
0
0
...
0
1
0
0
0
0
0
0
1
0
SFh9uHtGR1lVPH2RTS2SWQ
0
0
0
1
1
1
0
0
0
0
...
0
0
0
0
0
0
0
0
1
0
cGuUEFw374zfpr3Cp8ywJw
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
Bfh_ON-LQUcJerjcDF5KNA
0
0
0
1
0
0
1
0
0
0
...
0
0
0
0
0
0
0
0
0
0
pCer-vIZM05p423MFmiSng
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
pjO0ZRGpCwlaOzipZDa3HA
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
5jXwI9WpDxGDmzmkmE0RTg
0
0
1
0
1
0
0
0
0
0
...
0
1
0
0
0
0
0
0
0
0
Gvox18F9QxsSS9qG2O8E-A
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
hX-foSZsr_fW_n7ydo4D0Q
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
Ih9my8hp4KqcyBXI9Cyevg
1
0
1
0
1
0
0
0
0
0
...
0
0
0
0
0
0
1
0
1
0
Icj9QzQxqvzOmPm_liAq1Q
0
0
1
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
M8G8S2takaE_NOVutGzEkg
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
-XH3-e3jDVdjHJq7GfNY7w
0
0
0
1
0
0
1
0
0
0
...
0
0
0
0
0
0
0
0
0
0
e0prCZXtHGQIKeQ_wTW3uw
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
vVdY2qDO7dkUrU8LXYZ1uA
0
0
0
1
0
0
0
0
0
0
...
0
1
0
0
0
0
0
0
1
0
BKNPoWf3bQoxFWf-Kjt3wQ
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
O3zVw-VAcJi8ta4B6v5iAA
0
0
1
0
1
0
0
0
0
0
...
0
0
0
1
0
0
1
0
0
0
ZxQlHVm0pj0ERqpwhEHc6w
0
0
0
1
0
0
0
0
0
0
...
0
1
0
0
0
0
0
0
0
0
J5U-nbhKSnnX7DJGT6QELg
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
1
0
s173l3ubdDLHeFIBO3ydrw
0
0
0
1
1
0
1
0
0
0
...
0
0
0
0
0
0
0
0
0
0
m6nLTIrEJHtHSYJjVavnzw
0
0
0
0
0
0
1
0
0
0
...
0
0
0
0
0
0
0
0
1
0
Ah4i15g8Ow_zphzcpulTxQ
0
0
0
0
0
0
0
0
0
0
...
0
1
0
0
0
0
0
0
0
0
NZ3A0xYdnmIG68mZeZV6AA
1
0
1
0
1
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
QdganL5AW9E_fNoRVroLig
0
0
0
1
1
0
0
0
0
0
...
0
1
0
0
0
0
0
0
1
0
x6fvLWdx-Kb8_oMeqaHhRQ
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
z-Q3uNXOjr7iYpl9kexFzQ
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
UTrqsP-oQrwoIOmBv_Jtzg
0
1
0
1
0
0
1
0
0
0
...
0
0
0
0
0
0
0
0
1
0
w2xwgqHM7dJlHfyu4ouhBg
0
0
0
0
1
0
0
0
0
0
...
0
0
0
0
0
0
0
0
1
0
dn9lwYUxmhs_mLKPu7L25Q
1
0
1
0
1
0
0
0
0
0
...
0
1
1
0
0
0
1
0
0
0
K2lscpx0S9wpqNz0s9AIwQ
0
0
1
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
a39B-BfbCipNkNH7vwCfug
0
0
0
0
1
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
OrX0D6xwxmymxfzWRatT7Q
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
1
0
JbWQtVLQDEOBlvbVWk2OFg
0
0
0
0
0
0
0
0
0
0
...
0
1
0
0
0
0
0
0
1
0
o80p16nM5CWbl22JP1nB-w
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
xE4WM5ZvtBkFnojFUTQfqg
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
xeN57OX3lJfY96nfEx63fg
0
0
0
1
1
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
nplkC6vnh4qT9xH-vhup6w
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
4cPM1Hcj3UiN3fxyOsiR9w
0
0
0
1
1
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
DysXlYb_WVlksoip8atIvQ
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
XbHxWOciYlBhJOjKRQbo9g
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
mofOjB6flg-eAWOFbOkHfQ
0
0
1
0
0
0
0
0
0
0
...
0
1
0
0
0
0
0
0
0
0
vRAM7lfJDY6pLZlxd4ge1Q
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
swdilw4Vbk-196sTVdMo-A
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
-i3dOjumvOw-52aGXU1xDg
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
334 rows × 65 columns
In [44]:
from sklearn.preprocessing import label_binarize
In [70]:
y = data['stars']
def good_bad(x):
if x >= 4.0:
return 1
else:
return 0
y = y.apply(good_bad)
In [66]:
y = data['stars'].astype(str)
In [59]:
list(y.unique())
Out[59]:
['3.0', '4.5', '4.0', '5.0', '3.5', '1.0', '2.5', '2.0', '1.5']
In [60]:
y = label_binarize(y, classes=list(y.unique()))
In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
In [84]:
rf = RandomForestClassifier(n_estimators=1000, random_state=1)
rf = rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print roc_auc_score(y_test, y_predict, average=None)
0.624823943662
In [85]:
X.head()
Out[85]:
AcceptsInsurance_True
AgesAllowed_True
Ambience_casual_full_bar
Ambience_casual_none
Ambience_classy_True
Ambience_divey_loud
Ambience_divey_quiet
Ambience_divey_very_loud
Ambience_hipster_True
Ambience_intimate_dressy
...
Open24Hours_True
OutdoorSeating_True
RestaurantsAttire_True
RestaurantsDelivery_True
RestaurantsPriceRange2_True
RestaurantsReservations_True
RestaurantsReservations_outdoor
RestaurantsTableService_True
Smoking_True
WheelchairAccessible_True
iLxG2Oo8HNSZFvJvba4W6g
0
0
1
0
1
0
0
0
0
0
...
0
1
0
0
0
0
0
0
1
0
-zEpEmDfFQL-ph0N3BDlXA
0
0
0
0
0
0
0
0
0
0
...
0
1
0
0
0
0
0
0
0
0
D6SCh4BwNb52wZIqXdS4JQ
1
0
1
0
1
1
0
0
1
0
...
0
1
0
0
0
0
1
0
1
1
5lw5rrhFMz-liykUn8dGSw
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
rP-MueqMyZbiDbOEUELXtQ
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
5 rows × 65 columns
In [52]:
rf.feature_importances_
Out[52]:
array([ 7.08695551e-04, 2.06634883e-03, 4.34665938e-03,
5.03297170e-03, 1.08354919e-03, 1.53620781e-02,
1.62823596e-02, 1.80114810e-03, 1.67958944e-02,
3.00554804e-03, 8.30724616e-03, 1.80113435e-02,
7.48980642e-03, 3.84432887e-03, 3.39230978e-03,
1.11355238e-03, 7.52062326e-04, 3.36672656e-03,
6.24440868e-03, 0.00000000e+00, 8.32592635e-04,
3.80086042e-03, 0.00000000e+00, 1.35057481e-02,
5.89276382e-03, 5.29669822e-03, 7.62559073e-03,
1.02760482e-03, 1.34006380e-03, 1.07889008e-02,
3.75518228e-03, 4.45420953e-04, 1.24656731e-02,
1.47229267e-02, 1.13829971e-02, 5.97326995e-03,
0.00000000e+00, 6.07759961e-04, 7.19162873e-03,
4.53248785e-03, 6.61315983e-03, 1.14859418e-03,
1.00969076e-02, 7.85605326e-03, 0.00000000e+00,
1.80354062e-03, 1.97225241e-02, 2.04615671e-02,
2.07151550e-02, 6.27180613e-03, 2.69253569e-02,
3.47211350e-03, 1.10906329e-02, 0.00000000e+00,
2.57384098e-03, 9.42248213e-05, 1.16468480e-02,
0.00000000e+00, 2.14355364e-02, 2.37136476e-02,
1.92164451e-02, 1.71599747e-03, 1.30621469e-02,
1.53760928e-02, 0.00000000e+00, 0.00000000e+00,
7.28479026e-03, 6.11827271e-03, 9.71579664e-03,
3.85889954e-03, 2.91576335e-02, 2.40160697e-02,
2.63775773e-03, 2.87838242e-03, 8.22989673e-04,
4.86899912e-03, 6.04050649e-03, 8.68059154e-03,
1.18679078e-02, 1.69991812e-02, 1.03044012e-02,
6.17210930e-03, 3.01120654e-03, 1.50453312e-02,
1.26807987e-03, 2.40146995e-02, 6.90181167e-03,
5.41047616e-03, 1.65603005e-03, 0.00000000e+00,
3.79754380e-03, 3.30567549e-02, 1.55401802e-02,
3.44982923e-02, 5.12798070e-03, 2.38013460e-03,
0.00000000e+00, 5.15156015e-04, 7.08011372e-03,
1.96277394e-02, 1.69055014e-02, 1.27257664e-03,
2.72101801e-03, 2.48652326e-03, 0.00000000e+00,
1.34384643e-03, 1.01046127e-02, 4.16048281e-03,
5.01987723e-03, 2.75382291e-03, 5.13855683e-03,
2.98582637e-02, 5.52172487e-03, 4.53506205e-03,
7.33982020e-03, 7.45133173e-04, 2.55157156e-03,
2.45231122e-02, 1.86727509e-03, 3.87602835e-03,
3.93666043e-04, 1.58354507e-03, 8.46021549e-05,
6.12744552e-03, 6.57586204e-03, 3.06295662e-03,
1.61247640e-03, 0.00000000e+00, 1.83856872e-04,
1.07195123e-03, 3.41867141e-03, 2.74444544e-03,
0.00000000e+00, 2.59908490e-02, 3.36744914e-03,
1.12271249e-03, 3.13690064e-03, 1.23458247e-03])
In [86]:
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), X.columns),
reverse=True)
Features sorted by their score:
[(0.0516, 'GoodForMeal_latenight_True'), (0.0486, 'DietaryRestrictions_kosher_True'), (0.0485, 'GoodForMeal_lunch_2'), (0.0456, 'BusinessParking_lot_free'), (0.0413, 'Music_background_music_True'), (0.0413, 'Caters_True'), (0.0409, 'DietaryRestrictions_gluten-free_True'), (0.0403, 'Ambience_casual_none'), (0.0385, 'GoodForMeal_brunch_True'), (0.038, 'Music_video_True'), (0.0346, 'BusinessAcceptsCreditCards_True'), (0.0342, 'OutdoorSeating_True'), (0.0323, 'BusinessParking_garage_True'), (0.0292, 'Smoking_True'), (0.0287, 'BestNights_sunday_True'), (0.0271, 'Ambience_classy_True'), (0.026, 'GoodForDancing_True'), (0.0245, 'Corkage_True'), (0.0245, 'Ambience_divey_quiet'), (0.0228, 'Ambience_casual_full_bar'), (0.0223, 'Ambience_upscale_True'), (0.0221, 'BestNights_saturday_True'), (0.02, 'DriveThru_True'), (0.0116, 'Ambience_trendy_True'), (0.0114, 'BestNights_thursday_yes_free'), (0.0106, 'Ambience_divey_loud'), (0.0105, 'DietaryRestrictions_halal_True'), (0.0092, 'GoodForKids_True'), (0.0092, 'BikeParking_True'), (0.0085, 'RestaurantsPriceRange2_True'), (0.0084, 'BestNights_friday_True'), (0.008, 'AcceptsInsurance_True'), (0.0071, 'RestaurantsAttire_True'), (0.0069, 'GoodForMeal_breakfast_True'), (0.0063, 'BusinessParking_lot_paid'), (0.0063, 'Ambience_divey_very_loud'), (0.0063, 'AgesAllowed_True'), (0.0058, 'RestaurantsReservations_outdoor'), (0.0058, 'GoodForMeal_lunch_3'), (0.0054, 'BusinessParking_street_True'), (0.0054, 'BYOB_True'), (0.0053, 'BestNights_tuesday_True'), (0.0053, 'Ambience_hipster_True'), (0.0052, 'RestaurantsTableService_True'), (0.0052, 'Music_karaoke_True'), (0.0051, 'NoiseLevel_True'), (0.005, 'Music_jukebox_True'), (0.0048, 'Ambience_intimate_dressy'), (0.0047, 'Music_dj_True'), (0.0045, 'Music_live_True'), (0.0042, 'WheelchairAccessible_True'), (0.0038, 'RestaurantsDelivery_True'), (0.0034, 'BestNights_monday_True'), (0.0033, 'GoodForMeal_dessert_True'), (0.0032, 'DietaryRestrictions_vegetarian_True'), (0.0027, 'Open24Hours_True'), (0.0019, 'HasTV_True'), (0.0019, 'Ambience_intimate_formal'), (0.0016, 'BestNights_thursday_yes_corkage'), (0.0015, 'Music_no_music_True'), (0.0006, 'RestaurantsReservations_True'), (0.0006, 'DietaryRestrictions_dairy-free_True'), (0.0002, 'GoodForMeal_dinner_True'), (0.0, 'ByAppointmentOnly_True'), (0.0, 'BusinessAcceptsBitcoin_True')]
In [ ]:
Content source: djfan/yelp-challenge
Similar notebooks: