In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
df = pd.read_pickle('../data_processeing/Yelp_Cuisine_Chinese.pkl')
In [15]:
df_chi = df[df['cuisine_Chinese'] == 2]
In [8]:
df.columns
Out[8]:
Index([ u'address',
u'attributes',
u'business_id',
u'categories',
u'city',
u'hours',
u'is_open',
u'latitude',
u'longitude',
u'name',
u'neighborhood',
u'postal_code',
u'review_count',
u'stars',
u'state',
u'type',
u'AcceptsInsurance',
u'AgesAllowed',
u'Alcohol',
u'Ambience_casual',
u'Ambience_classy',
u'Ambience_divey',
u'Ambience_hipster',
u'Ambience_intimate',
u'Ambience_romantic',
u'Ambience_touristy',
u'Ambience_trendy',
u'Ambience_upscale',
u'BYOB',
u'BYOBCorkage',
u'BestNights_friday',
u'BestNights_monday',
u'BestNights_saturday',
u'BestNights_sunday',
u'BestNights_thursday',
u'BestNights_tuesday',
u'BestNights_wednesday',
u'BikeParking',
u'BusinessAcceptsBitcoin',
u'BusinessAcceptsCreditCards',
u'BusinessParking_garage',
u'BusinessParking_lot',
u'BusinessParking_street',
u'BusinessParking_valet',
u'BusinessParking_validated',
u'ByAppointmentOnly',
u'Caters',
u'CoatCheck',
u'Corkage',
u'DietaryRestrictions_dairy-free',
u'DietaryRestrictions_gluten-free',
u'DietaryRestrictions_halal',
u'DietaryRestrictions_kosher',
u'DietaryRestrictions_soy-free',
u'DietaryRestrictions_vegan',
u'DietaryRestrictions_vegetarian',
u'DogsAllowed',
u'DriveThru',
u'GoodForDancing',
u'GoodForKids',
u'GoodForMeal_breakfast',
u'GoodForMeal_brunch',
u'GoodForMeal_dessert',
u'GoodForMeal_dinner',
u'GoodForMeal_latenight',
u'GoodForMeal_lunch',
u'HairSpecializesIn_africanamerican',
u'HairSpecializesIn_asian',
u'HairSpecializesIn_coloring',
u'HairSpecializesIn_curly',
u'HairSpecializesIn_extensions',
u'HairSpecializesIn_kids',
u'HairSpecializesIn_perms',
u'HairSpecializesIn_straightperms',
u'HappyHour',
u'HasTV',
u'Music_background_music',
u'Music_dj',
u'Music_jukebox',
u'Music_karaoke',
u'Music_live',
u'Music_no_music',
u'Music_video',
u'NoiseLevel',
u'Open24Hours',
u'OutdoorSeating',
u'RestaurantsAttire',
u'RestaurantsCounterService',
u'RestaurantsDelivery',
u'RestaurantsGoodForGroups',
u'RestaurantsPriceRange2',
u'RestaurantsReservations',
u'RestaurantsTableService',
u'RestaurantsTakeOut',
u'Smoking',
u'WheelchairAccessible',
u'WiFi',
u'cuisine_Chinese'],
dtype='object')
In [20]:
df_chi['neighborhood']
Out[20]:
q_KQbgnaYDlPx8EHTydcBQ Brown's Corners
2px99IppAcnxR238eq_8_w
TkXbFJtFCdM_WTLkHa6Erw
dYmm5468BdWxWgksXpy2TQ
OS5IyYEXgYV9_Z0iS4MkOw St. Lawrence
01aNlDhbMObjc9OdAHuNpQ
KeQ1cK564cL5C_hBTFrqnA Mississauga Valley
ZcMRFFuvFt5gH2hdnpyigg Rosemont-La Petite-Patrie
1uPQTz5XZSWr0ti7kUpaDw Ville-Marie
oWDLyENvjBa5LZdRGGOrow
3PhF188adiC8ZutKRP9gyQ
rb4d1_OtpVD91J5NDp4Y3w South End
cXdQ3H0GqcIzRgrvBEaBxw
-GDfRHphXjccSWhliFHnzg
ls3r7doDW0p65cwMWxEIdQ Scarborough
VzUo-RURV3VnfNItAYM8yg Spring Valley
98gLNkYYJCg4f_uJ7o2wBg Plaza Midwood
3GfdCuI0YCc5U3rLLLPHUw
N7wlv3nrhDPuDDY8yX6QvQ
H_eO04NZAQIDcbtFQ4BUag Westside
f5xm2RiwLv0gbmXU4BkrGA Unionville
7sMxjqfidcS1VTqdXBWhnA
NiviplzqF1RtwLvYnL79fQ
0BEaW7yAystJ2OG-S0jv-Q
2xbxXWeu3tpZEJdGTTGbLg Westside
q0oPX1DXW86QytTvvrD9MA Scarborough
0C6uuZBD7xKS-uucgwoyQw
N3zuaqGESF5iZsi_md9c1Q Spring Valley
KUI1j3pYVVQ6ddLqTS0GWw
qMJCVx6-ZYvZQHlfrZ_cug
...
R8Mw0iFenJ3nwXuZb7Ql4A
u6JJojkbqVlVI4NqKsL54Q Bickford Park
5XX1w4WbwSlrACQ5vKuLDw Ville-Marie
B6xVgae6x4Xm5wVNrxaHiw
DIxgItsSI9QwX9H8lVptYg Northwest
_5XClj4E5VCIsEscbrrPKg Little Italy
2reHgsuAWofHfocfJ4Xs5w Chinatown
fojiPcgWlM-FJTxZlz4Idg
-yzBFD877La-RP739VxF7w Willowdale
bj6Km1RvgD6oyppCe5yDog
D5tehjlw9ZIoUzhOEblFqA
Gg0sdZWtkk9yEgpS5W_PIA
VbY4yD6UmA9jKKxHscrxvg
mofOjB6flg-eAWOFbOkHfQ
aXnWWp6Reaep4-0iKWlYHQ Downtown Core
pFg0ZuTVVZMZpANZXI0LtA
bM-rAr7gxsxxjD-FlRcIJA
phTLTkqSvylLkmz5wAwhTw Northwest
XdfYrFt4tkHn3TZWtBPDWw
2YBO1LEKIgyle0uX50u15Q
QpOAv584eb-ecG4TLEm95g Southeast
JPfi__QJAaRzmfh5aOyFEw
osvcl7Fh_J5VwUikh5Vskg Milliken
9Z_6rRy7Tl_C6HIgm7y6FA
uLUl_dMl6a9m774LGReDVA
tquWyW3Mm5ka8LJtzodALg
9iJMGMsTK-q6W5MB1_Ny3Q
w5CSi-An5meLnxjKSFn0wQ
Q3UkgxNNInsPcUFhsQFcrg
OgwN65jZebPRIPSmNpRP7A
Name: neighborhood, dtype: object
In [18]:
X = df_chi.iloc[:, 16:-1]
In [48]:
y = df_chi['stars']
In [49]:
y.plot(kind='kde')
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f15b92d5a90>
In [31]:
X = pd.get_dummies(X, dummy_na=False, drop_first=True)
In [50]:
y.value_counts()
Out[50]:
3.5 862
4.0 725
3.0 544
4.5 228
2.5 210
2.0 69
5.0 33
1.5 15
1.0 7
Name: stars, dtype: int64
In [35]:
X.sum().sort_values()
Out[35]:
HairSpecializesIn_straightperms_True 1
BusinessParking_validated_True 1
GoodForDancing_True 1
Ambience_intimate_True 2
GoodForKids_True 2
CoatCheck_True 2
Ambience_romantic_formal 3
DietaryRestrictions_vegan_True 3
DietaryRestrictions_dairy-free_True 4
ByAppointmentOnly_True 4
Music_live_True 4
RestaurantsReservations_True 5
HairSpecializesIn_africanamerican_True 6
Open24Hours_True 7
BestNights_thursday_yes_corkage 7
BestNights_saturday_True 7
BusinessAcceptsBitcoin_True 8
Ambience_trendy_True 8
RestaurantsTakeOut_True 8
BestNights_monday_True 10
BestNights_tuesday_True 11
Music_jukebox_True 11
BusinessParking_lot_paid 12
DietaryRestrictions_kosher_True 12
RestaurantsTableService_True 15
HairSpecializesIn_coloring_True 15
HairSpecializesIn_kids_4 17
BestNights_wednesday_True 17
Ambience_touristy_True 18
GoodForMeal_breakfast_True 22
...
RestaurantsCounterService_True 64
GoodForMeal_latenight_True 69
GoodForMeal_dinner_True 93
HairSpecializesIn_kids_3 99
BYOBCorkage_True 123
Music_video_True 133
AgesAllowed_True 147
Ambience_hipster_loud 153
GoodForMeal_lunch_True 455
HairSpecializesIn_curly_True 532
Ambience_hipster_quiet 615
Ambience_classy_full_bar 647
BusinessParking_lot_free 713
HappyHour_True 936
RestaurantsPriceRange2_True 1061
Ambience_divey_True 1065
BusinessAcceptsCreditCards_True 1085
Ambience_classy_none 1117
Music_no_music_True 1144
OutdoorSeating_True 1193
BYOB_True 1291
Smoking_True 1324
BusinessParking_garage_True 1357
HairSpecializesIn_kids_2 1389
Corkage_True 1665
GoodForMeal_dessert_True 1759
Ambience_upscale_True 2248
BestNights_sunday_True 2313
DietaryRestrictions_soy-free_True 2347
DietaryRestrictions_halal_True 2516
dtype: int64
In [51]:
def good_bad(x):
if x >= 4.0:
return 1
else:
return 0
y = y.apply(good_bad)
In [52]:
y
Out[52]:
q_KQbgnaYDlPx8EHTydcBQ 0
2px99IppAcnxR238eq_8_w 1
TkXbFJtFCdM_WTLkHa6Erw 0
dYmm5468BdWxWgksXpy2TQ 0
OS5IyYEXgYV9_Z0iS4MkOw 0
01aNlDhbMObjc9OdAHuNpQ 1
KeQ1cK564cL5C_hBTFrqnA 1
ZcMRFFuvFt5gH2hdnpyigg 0
1uPQTz5XZSWr0ti7kUpaDw 1
oWDLyENvjBa5LZdRGGOrow 0
3PhF188adiC8ZutKRP9gyQ 0
rb4d1_OtpVD91J5NDp4Y3w 1
cXdQ3H0GqcIzRgrvBEaBxw 1
-GDfRHphXjccSWhliFHnzg 0
ls3r7doDW0p65cwMWxEIdQ 0
VzUo-RURV3VnfNItAYM8yg 1
98gLNkYYJCg4f_uJ7o2wBg 0
3GfdCuI0YCc5U3rLLLPHUw 1
N7wlv3nrhDPuDDY8yX6QvQ 0
H_eO04NZAQIDcbtFQ4BUag 1
f5xm2RiwLv0gbmXU4BkrGA 1
7sMxjqfidcS1VTqdXBWhnA 0
NiviplzqF1RtwLvYnL79fQ 1
0BEaW7yAystJ2OG-S0jv-Q 0
2xbxXWeu3tpZEJdGTTGbLg 1
q0oPX1DXW86QytTvvrD9MA 1
0C6uuZBD7xKS-uucgwoyQw 1
N3zuaqGESF5iZsi_md9c1Q 1
KUI1j3pYVVQ6ddLqTS0GWw 0
qMJCVx6-ZYvZQHlfrZ_cug 1
..
R8Mw0iFenJ3nwXuZb7Ql4A 0
u6JJojkbqVlVI4NqKsL54Q 1
5XX1w4WbwSlrACQ5vKuLDw 1
B6xVgae6x4Xm5wVNrxaHiw 0
DIxgItsSI9QwX9H8lVptYg 1
_5XClj4E5VCIsEscbrrPKg 1
2reHgsuAWofHfocfJ4Xs5w 1
fojiPcgWlM-FJTxZlz4Idg 0
-yzBFD877La-RP739VxF7w 1
bj6Km1RvgD6oyppCe5yDog 0
D5tehjlw9ZIoUzhOEblFqA 0
Gg0sdZWtkk9yEgpS5W_PIA 1
VbY4yD6UmA9jKKxHscrxvg 1
mofOjB6flg-eAWOFbOkHfQ 0
aXnWWp6Reaep4-0iKWlYHQ 0
pFg0ZuTVVZMZpANZXI0LtA 0
bM-rAr7gxsxxjD-FlRcIJA 1
phTLTkqSvylLkmz5wAwhTw 0
XdfYrFt4tkHn3TZWtBPDWw 1
2YBO1LEKIgyle0uX50u15Q 0
QpOAv584eb-ecG4TLEm95g 0
JPfi__QJAaRzmfh5aOyFEw 1
osvcl7Fh_J5VwUikh5Vskg 0
9Z_6rRy7Tl_C6HIgm7y6FA 1
uLUl_dMl6a9m774LGReDVA 0
tquWyW3Mm5ka8LJtzodALg 0
9iJMGMsTK-q6W5MB1_Ny3Q 1
w5CSi-An5meLnxjKSFn0wQ 1
Q3UkgxNNInsPcUFhsQFcrg 1
OgwN65jZebPRIPSmNpRP7A 1
Name: stars, dtype: int64
In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
In [76]:
def RF_model(X, y, test_size, random_state_split, random_state_model, n_estimators):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state_split)
rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state_model)
rf = rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
return roc_auc_score(y_test, y_predict)
In [86]:
RF_model(X, y, test_size=0.33, random_state_split=1, random_state_model=1, n_estimators=100)
Out[86]:
0.57762115757407917
Content source: djfan/yelp-challenge
Similar notebooks: