In [14]:
# Import the libraries we will be using
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

#from dstools import data_tools   #if plot decision tree surface

import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 8

In [13]:
df = pd.read_pickle('../data_processeing/Yelp_Cuisine_Chinese.pkl')
print df.shape
df.head(2)


(27314, 98)
Out[13]:
address attributes business_id categories city hours is_open latitude longitude name ... RestaurantsDelivery RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg 979 Bloor Street W [{u'Alcohol': u'none'}, {u'Ambience': {u'roman... EDqCEAGXVGCH4FJXgqtjqg [Restaurants, Pizza, Chicken Wings, Italian] Toronto [Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ... 1 43.661054 -79.429089 Pizza Pizza ... False False NaN NaN NaN NaN False NaN NaN 1
GDnbt3isfhd57T1QqU6flg 11072 No Frank Lloyd Wright [{u'Alcohol': u'none'}, {u'Ambience': {u'roman... GDnbt3isfhd57T1QqU6flg [Tex-Mex, Mexican, Fast Food, Restaurants] Scottsdale [Monday 10:0-22:0, Tuesday 10:0-22:0, Wednesda... 1 33.586710 -111.835410 Taco Bell ... False False NaN NaN NaN NaN False NaN NaN 1

2 rows × 98 columns


In [29]:
#df.review_count.unique()

flaten attributes


In [15]:
df2 = df.copy()

In [31]:
df_att = df2.loc[ :, u'AcceptsInsurance':'WiFi'] # include 'WiFi'
att_type = {}
col_b = []
col_m = []
for i in range(df_att.shape[1]):
    temp = df_att.groupby(df_att.iloc[:,i]).size().keys()
    print temp.name, ': ', temp.values
    if set(temp.values) == set([False, True]) or set(temp.values).issubset(set([False, True])):
        att_type[temp.name] = 'b'
        col_b.append(temp.name)
    else:
        att_type[temp.name] = 'm'
        col_m.append(temp.name)

print len(col_b), len(col_m)
col_all = col_b + col_m


AcceptsInsurance :  [False True]
AgesAllowed :  [False True]
Alcohol :  [False True]
Ambience_casual :  [False True]
Ambience_classy :  ['beer_and_wine' 'full_bar' 'none']
Ambience_divey :  [False True]
Ambience_hipster :  ['average' 'loud' 'quiet' 'very_loud']
Ambience_intimate :  [False True]
Ambience_romantic :  ['casual' 'dressy' 'formal']
Ambience_touristy :  [False True]
Ambience_trendy :  [False True]
Ambience_upscale :  [False True]
BYOB :  [False True]
BYOBCorkage :  [False True]
BestNights_friday :  [False]
BestNights_monday :  [False True]
BestNights_saturday :  [False True]
BestNights_sunday :  [False True]
BestNights_thursday :  [False 'yes_corkage' 'yes_free']
BestNights_tuesday :  [False True]
BestNights_wednesday :  [False True]
BikeParking :  [False True]
BusinessAcceptsBitcoin :  [False True]
BusinessAcceptsCreditCards :  [False True]
BusinessParking_garage :  [False True]
BusinessParking_lot :  [False 'free' 'paid']
BusinessParking_street :  [False True]
BusinessParking_valet :  [False True]
BusinessParking_validated :  [False True]
ByAppointmentOnly :  [False True]
Caters :  [False True]
CoatCheck :  [False True]
Corkage :  [False True]
DietaryRestrictions_dairy-free :  [False True]
DietaryRestrictions_gluten-free :  [False True]
DietaryRestrictions_halal :  [False True]
DietaryRestrictions_kosher :  [False True]
DietaryRestrictions_soy-free :  [False True]
DietaryRestrictions_vegan :  [False True]
DietaryRestrictions_vegetarian :  [False True]
DogsAllowed :  [False True]
DriveThru :  [False True]
GoodForDancing :  [False True]
GoodForKids :  [False True]
GoodForMeal_breakfast :  [False True]
GoodForMeal_brunch :  [False True]
GoodForMeal_dessert :  [False True]
GoodForMeal_dinner :  [False True]
GoodForMeal_latenight :  [False True]
GoodForMeal_lunch :  [False True]
HairSpecializesIn_africanamerican :  [False True]
HairSpecializesIn_asian :  [False True]
HairSpecializesIn_coloring :  [False True]
HairSpecializesIn_curly :  [False True]
HairSpecializesIn_extensions :  [False True]
HairSpecializesIn_kids :  [1 2 3 4]
HairSpecializesIn_perms :  ['18plus' '19plus' '21plus' 'allages']
HairSpecializesIn_straightperms :  [False True]
HappyHour :  [False True]
HasTV :  [False True]
Music_background_music :  [False True]
Music_dj :  [False True]
Music_jukebox :  [False True]
Music_karaoke :  [False True]
Music_live :  [False True]
Music_no_music :  [False True]
Music_video :  [False True]
NoiseLevel :  [False True]
Open24Hours :  [False True]
OutdoorSeating :  [False True]
RestaurantsAttire :  [False True]
RestaurantsCounterService :  [False True]
RestaurantsDelivery :  [False True]
RestaurantsGoodForGroups :  [False True]
RestaurantsPriceRange2 :  [False True]
RestaurantsReservations :  [False True 'outdoor']
RestaurantsTableService :  [False True]
RestaurantsTakeOut :  [False True]
Smoking :  [False True]
WheelchairAccessible :  [False True]
WiFi :  [False True]
73 8

In [35]:
### join label with all the expanded"attributes" and "review count"
df_att_b = df2.loc[:,col_b].join(df[['review_count','cuisine_Chinese']])
df_att_b.head(1)


Out[35]:
AcceptsInsurance AgesAllowed Alcohol Ambience_casual Ambience_divey Ambience_intimate Ambience_touristy Ambience_trendy Ambience_upscale BYOB ... RestaurantsDelivery RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi review_count cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg NaN False NaN NaN True NaN NaN NaN True False ... False False NaN NaN NaN False NaN NaN 7 1

1 rows × 75 columns


In [36]:
df_att_m = df2.loc[:,col_m].join(df[['review_count','cuisine_Chinese']])
df_att_m.head(1)


Out[36]:
Ambience_classy Ambience_hipster Ambience_romantic BestNights_thursday BusinessParking_lot HairSpecializesIn_kids HairSpecializesIn_perms RestaurantsReservations review_count cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg none quiet casual NaN free 1 NaN NaN 7 1

In [37]:
df_att_all = df_att.join(df[['review_count','cuisine_Chinese']])
df_att_all.head(1)


Out[37]:
AcceptsInsurance AgesAllowed Alcohol Ambience_casual Ambience_classy Ambience_divey Ambience_hipster Ambience_intimate Ambience_romantic Ambience_touristy ... RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi review_count cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg NaN False NaN NaN none True quiet NaN casual NaN ... False NaN NaN NaN NaN False NaN NaN 7 1

1 rows × 83 columns


In [38]:
df_att_all.shape


Out[38]:
(27314, 83)

In [39]:
df_att_all_filled = df_att_all.fillna(value='none')
df_att_all_filled.head(3)


Out[39]:
AcceptsInsurance AgesAllowed Alcohol Ambience_casual Ambience_classy Ambience_divey Ambience_hipster Ambience_intimate Ambience_romantic Ambience_touristy ... RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi review_count cuisine_Chinese
EDqCEAGXVGCH4FJXgqtjqg none False none none none True quiet none casual none ... False none none none none False none none 7 1
GDnbt3isfhd57T1QqU6flg none False none none none False quiet none casual none ... False none none none none False none none 9 1
a1Ba6XeIOP48e64YFD0dMw none none none none none none none none none none ... none none none none none none none none 3 0

3 rows × 83 columns


In [45]:
## transformed certain columns to BINARY varibales #####
df_chin_bi = pd.get_dummies(df_att_all_filled, \
                            prefix=col_all, prefix_sep='_', dummy_na=False, \
                            columns=col_all, sparse=False, drop_first=False)

In [56]:
## change column order
df_chin_bi = df_chin_bi.loc[:,['cuisine_Chinese','review_count'] + list(df_chin_bi.columns.values[2:])]

In [57]:
df_chin_bi.head(2)


Out[57]:
cuisine_Chinese review_count AcceptsInsurance_False AcceptsInsurance_True AcceptsInsurance_none AgesAllowed_False AgesAllowed_True AgesAllowed_none Alcohol_False Alcohol_True ... HairSpecializesIn_kids_none HairSpecializesIn_perms_18plus HairSpecializesIn_perms_19plus HairSpecializesIn_perms_21plus HairSpecializesIn_perms_allages HairSpecializesIn_perms_none RestaurantsReservations_False RestaurantsReservations_True RestaurantsReservations_none RestaurantsReservations_outdoor
EDqCEAGXVGCH4FJXgqtjqg 1 7 0 0 1 1 0 0 0 0 ... 0 0 0 0 0 1 0 0 1 0
GDnbt3isfhd57T1QqU6flg 1 9 0 0 1 1 0 0 0 0 ... 0 0 0 0 0 1 0 0 1 0

2 rows × 254 columns


In [58]:
df_chin_bi.shape


Out[58]:
(27314, 254)

In [59]:
# Split training(labeled) and test(unlabled) 
df_chin_train = df_chin_bi[df_chin_bi['cuisine_Chinese'] != 0]
df_chin_test = df_chin_bi[df_chin_bi['cuisine_Chinese'] == 0]

In [ ]:
## Separate X(feature) and Y(traget) for training set
X = df_chin_train.ix[:, 1:]
Y = df_chin_train['cuisine_Chinese'].apply(lambda n: n-1)

Logistic Regression

Parameter tuning


In [61]:
# panalty and C value tuning
penalties = ['l1', 'l2'] 
C_values = sorted([np.exp(a*5) for a in np.linspace(-8,4,20)])
C_values


Out[61]:
[4.2483542552915889e-18,
 9.9925650127909073e-17,
 2.3503537966609349e-15,
 5.5282732335564566e-14,
 1.30030657462187e-12,
 3.0584544514583164e-11,
 7.1937986119660656e-10,
 1.6920552289032579e-08,
 3.9798874726580644e-07,
 9.3611036001982535e-06,
 0.00022018275947666106,
 0.0051789243705977379,
 0.12181361383661964,
 2.8651811561836724,
 67.391999951340196,
 1585.128970864379,
 37283.859450495467,
 876954.62077529205,
 20626872.27754036,
 485165195.40979028]

In [80]:
# pick train test split ratio as 0.7

# LR GridSearch 1
X_train, X_vali, Y_train, Y_vali = train_test_split(
    X, Y, train_size=0.7, random_state=90) #Use random_state to fix samples

# tuning hyper parameter
tuned_parameters_LR = {'C': C_values,
                       'penalty': penalties} 

lr = LogisticRegression()
gr_lr = GridSearchCV(lr, param_grid=tuned_parameters_LR, cv=5)
gr_lr.fit(X_train, Y_train)

gr_lr_auc_scores = cross_val_score(gr_lr, X, Y, scoring="roc_auc", cv=5)

print ( "Logistic regression training size(0.8): Mean AUC %.4f\n" % (np.mean(gr_lr_auc_scores)))

print "Best parameters set found:"
print gr_lr.best_params_
print"Gridsearch Scores: "
print gr_lr.score(X_vali, Y_vali)

means = gr_lr.cv_results_['mean_test_score']
stds = gr_lr.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gr_lr.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))


Logistic regression training size(0.8): Mean AUC 0.7488

Best parameters set found:
{'penalty': 'l1', 'C': 2.8651811561836724}
Gridsearch Scores: 
0.864946084497
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 4.2483542552915889e-18}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 4.2483542552915889e-18}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 9.9925650127909073e-17}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 9.9925650127909073e-17}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 2.3503537966609349e-15}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 2.3503537966609349e-15}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 5.5282732335564566e-14}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 5.5282732335564566e-14}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 1.30030657462187e-12}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 1.30030657462187e-12}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 3.0584544514583164e-11}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 3.0584544514583164e-11}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 7.1937986119660656e-10}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 7.1937986119660656e-10}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 1.6920552289032579e-08}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 1.6920552289032579e-08}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 3.9798874726580644e-07}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 3.9798874726580644e-07}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 9.3611036001982535e-06}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 9.3611036001982535e-06}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 0.00022018275947666106}
0.855 (+/-0.000) for {'penalty': 'l2', 'C': 0.00022018275947666106}
0.855 (+/-0.000) for {'penalty': 'l1', 'C': 0.0051789243705977379}
0.855 (+/-0.001) for {'penalty': 'l2', 'C': 0.0051789243705977379}
0.856 (+/-0.004) for {'penalty': 'l1', 'C': 0.12181361383661964}
0.857 (+/-0.001) for {'penalty': 'l2', 'C': 0.12181361383661964}
0.857 (+/-0.002) for {'penalty': 'l1', 'C': 2.8651811561836724}
0.857 (+/-0.002) for {'penalty': 'l2', 'C': 2.8651811561836724}
0.857 (+/-0.002) for {'penalty': 'l1', 'C': 67.391999951340196}
0.856 (+/-0.002) for {'penalty': 'l2', 'C': 67.391999951340196}
0.857 (+/-0.002) for {'penalty': 'l1', 'C': 1585.128970864379}
0.856 (+/-0.002) for {'penalty': 'l2', 'C': 1585.128970864379}
0.857 (+/-0.002) for {'penalty': 'l1', 'C': 37283.859450495467}
0.857 (+/-0.002) for {'penalty': 'l2', 'C': 37283.859450495467}
0.857 (+/-0.002) for {'penalty': 'l1', 'C': 876954.62077529205}
0.857 (+/-0.002) for {'penalty': 'l2', 'C': 876954.62077529205}
0.857 (+/-0.002) for {'penalty': 'l1', 'C': 20626872.27754036}
0.857 (+/-0.002) for {'penalty': 'l2', 'C': 20626872.27754036}
0.857 (+/-0.002) for {'penalty': 'l1', 'C': 485165195.40979028}
0.857 (+/-0.002) for {'penalty': 'l2', 'C': 485165195.40979028}

In [81]:
p_opt = gr_lr.best_params_.values()[0]
c_opt = gr_lr.best_params_.values()[1]

Decision Tree Classifier


In [131]:
mss_values = sorted([int(a) for a in np.linspace(50, 550, 20)])
print mss_values
msl_values = sorted([int(a) for a in np.linspace(10, 600, 20)])
print msl_values


[50, 76, 102, 128, 155, 181, 207, 234, 260, 286, 313, 339, 365, 392, 418, 444, 471, 497, 523, 550]
[10, 41, 72, 103, 134, 165, 196, 227, 258, 289, 320, 351, 382, 413, 444, 475, 506, 537, 568, 600]

In [133]:
# tuning hyperparameter
tuned_parameters_DT = {'min_samples_split': mss_values,
                       'min_samples_leaf':msl_values
                       }

dt = DecisionTreeClassifier(criterion='entropy')
gr_dt = GridSearchCV(dt, param_grid=tuned_parameters_DT, cv=5)
gr_dt.fit(X_train, Y_train)

gr_dt_auc_scores = cross_val_score(gr_dt, X, Y, scoring="roc_auc", cv=5)

print ( "Decision Tree with training size(0.8): Mean AUC %.4f\n" % (np.mean(gr_dt_auc_scores)))

print "Best parameters set found:"
print gr_dt.best_params_
print"Gridsearch Scores: "
print gr_dt.score(X_vali, Y_vali)

means = gr_dt.cv_results_['mean_test_score']
stds = gr_dt.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gr_dt.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))


Decision Tree with training size(0.8): Mean AUC 0.7336

Best parameters set found:
{'min_samples_split': 128, 'min_samples_leaf': 10}
Gridsearch Scores: 
0.861057097401
0.847 (+/-0.008) for {'min_samples_split': 50, 'min_samples_leaf': 10}
0.852 (+/-0.009) for {'min_samples_split': 76, 'min_samples_leaf': 10}
0.854 (+/-0.006) for {'min_samples_split': 102, 'min_samples_leaf': 10}
0.857 (+/-0.003) for {'min_samples_split': 128, 'min_samples_leaf': 10}
0.854 (+/-0.003) for {'min_samples_split': 155, 'min_samples_leaf': 10}
0.854 (+/-0.003) for {'min_samples_split': 181, 'min_samples_leaf': 10}
0.854 (+/-0.003) for {'min_samples_split': 207, 'min_samples_leaf': 10}
0.853 (+/-0.002) for {'min_samples_split': 234, 'min_samples_leaf': 10}
0.853 (+/-0.004) for {'min_samples_split': 260, 'min_samples_leaf': 10}
0.851 (+/-0.003) for {'min_samples_split': 286, 'min_samples_leaf': 10}
0.851 (+/-0.003) for {'min_samples_split': 313, 'min_samples_leaf': 10}
0.851 (+/-0.003) for {'min_samples_split': 339, 'min_samples_leaf': 10}
0.851 (+/-0.003) for {'min_samples_split': 365, 'min_samples_leaf': 10}
0.851 (+/-0.003) for {'min_samples_split': 392, 'min_samples_leaf': 10}
0.851 (+/-0.003) for {'min_samples_split': 418, 'min_samples_leaf': 10}
0.851 (+/-0.003) for {'min_samples_split': 444, 'min_samples_leaf': 10}
0.851 (+/-0.004) for {'min_samples_split': 471, 'min_samples_leaf': 10}
0.851 (+/-0.003) for {'min_samples_split': 497, 'min_samples_leaf': 10}
0.852 (+/-0.004) for {'min_samples_split': 523, 'min_samples_leaf': 10}
0.854 (+/-0.003) for {'min_samples_split': 550, 'min_samples_leaf': 10}
0.855 (+/-0.006) for {'min_samples_split': 50, 'min_samples_leaf': 41}
0.855 (+/-0.005) for {'min_samples_split': 76, 'min_samples_leaf': 41}
0.854 (+/-0.004) for {'min_samples_split': 102, 'min_samples_leaf': 41}
0.856 (+/-0.007) for {'min_samples_split': 128, 'min_samples_leaf': 41}
0.853 (+/-0.008) for {'min_samples_split': 155, 'min_samples_leaf': 41}
0.855 (+/-0.004) for {'min_samples_split': 181, 'min_samples_leaf': 41}
0.855 (+/-0.003) for {'min_samples_split': 207, 'min_samples_leaf': 41}
0.854 (+/-0.002) for {'min_samples_split': 234, 'min_samples_leaf': 41}
0.853 (+/-0.004) for {'min_samples_split': 260, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 286, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 313, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 339, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 365, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 392, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 418, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 444, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 471, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 497, 'min_samples_leaf': 41}
0.852 (+/-0.004) for {'min_samples_split': 523, 'min_samples_leaf': 41}
0.854 (+/-0.002) for {'min_samples_split': 550, 'min_samples_leaf': 41}
0.856 (+/-0.003) for {'min_samples_split': 50, 'min_samples_leaf': 72}
0.856 (+/-0.003) for {'min_samples_split': 76, 'min_samples_leaf': 72}
0.856 (+/-0.003) for {'min_samples_split': 102, 'min_samples_leaf': 72}
0.856 (+/-0.003) for {'min_samples_split': 128, 'min_samples_leaf': 72}
0.856 (+/-0.003) for {'min_samples_split': 155, 'min_samples_leaf': 72}
0.856 (+/-0.003) for {'min_samples_split': 181, 'min_samples_leaf': 72}
0.855 (+/-0.003) for {'min_samples_split': 207, 'min_samples_leaf': 72}
0.854 (+/-0.002) for {'min_samples_split': 234, 'min_samples_leaf': 72}
0.853 (+/-0.004) for {'min_samples_split': 260, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 286, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 313, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 339, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 365, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 392, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 418, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 444, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 471, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 497, 'min_samples_leaf': 72}
0.852 (+/-0.004) for {'min_samples_split': 523, 'min_samples_leaf': 72}
0.854 (+/-0.002) for {'min_samples_split': 550, 'min_samples_leaf': 72}
0.857 (+/-0.005) for {'min_samples_split': 50, 'min_samples_leaf': 103}
0.857 (+/-0.005) for {'min_samples_split': 76, 'min_samples_leaf': 103}
0.857 (+/-0.005) for {'min_samples_split': 102, 'min_samples_leaf': 103}
0.857 (+/-0.005) for {'min_samples_split': 128, 'min_samples_leaf': 103}
0.857 (+/-0.005) for {'min_samples_split': 155, 'min_samples_leaf': 103}
0.857 (+/-0.005) for {'min_samples_split': 181, 'min_samples_leaf': 103}
0.857 (+/-0.005) for {'min_samples_split': 207, 'min_samples_leaf': 103}
0.855 (+/-0.001) for {'min_samples_split': 234, 'min_samples_leaf': 103}
0.854 (+/-0.004) for {'min_samples_split': 260, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 286, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 313, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 339, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 365, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 392, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 418, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 444, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 471, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 497, 'min_samples_leaf': 103}
0.852 (+/-0.005) for {'min_samples_split': 523, 'min_samples_leaf': 103}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 103}
0.853 (+/-0.005) for {'min_samples_split': 50, 'min_samples_leaf': 134}
0.853 (+/-0.005) for {'min_samples_split': 76, 'min_samples_leaf': 134}
0.853 (+/-0.005) for {'min_samples_split': 102, 'min_samples_leaf': 134}
0.853 (+/-0.005) for {'min_samples_split': 128, 'min_samples_leaf': 134}
0.853 (+/-0.005) for {'min_samples_split': 155, 'min_samples_leaf': 134}
0.853 (+/-0.005) for {'min_samples_split': 181, 'min_samples_leaf': 134}
0.853 (+/-0.005) for {'min_samples_split': 207, 'min_samples_leaf': 134}
0.853 (+/-0.005) for {'min_samples_split': 234, 'min_samples_leaf': 134}
0.853 (+/-0.005) for {'min_samples_split': 260, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 286, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 313, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 339, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 365, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 392, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 418, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 444, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 471, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 497, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 523, 'min_samples_leaf': 134}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 134}
0.852 (+/-0.005) for {'min_samples_split': 50, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 76, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 102, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 128, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 155, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 181, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 207, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 234, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 260, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 286, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 313, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 339, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 365, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 392, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 418, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 444, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 471, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 497, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 523, 'min_samples_leaf': 165}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 165}
0.852 (+/-0.005) for {'min_samples_split': 50, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 76, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 102, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 128, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 155, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 181, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 207, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 234, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 260, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 286, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 313, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 339, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 365, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 392, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 418, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 444, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 471, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 497, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 523, 'min_samples_leaf': 196}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 196}
0.852 (+/-0.005) for {'min_samples_split': 50, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 76, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 102, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 128, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 155, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 181, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 207, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 234, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 260, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 286, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 313, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 339, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 365, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 392, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 418, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 444, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 471, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 497, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 523, 'min_samples_leaf': 227}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 227}
0.852 (+/-0.005) for {'min_samples_split': 50, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 76, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 102, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 128, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 155, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 181, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 207, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 234, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 260, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 286, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 313, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 339, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 365, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 392, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 418, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 444, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 471, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 497, 'min_samples_leaf': 258}
0.852 (+/-0.005) for {'min_samples_split': 523, 'min_samples_leaf': 258}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 258}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 289}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 320}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 351}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 382}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 413}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 444}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 475}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 506}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 537}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 568}
0.855 (+/-0.000) for {'min_samples_split': 50, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 76, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 102, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 128, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 155, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 181, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 207, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 234, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 260, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 286, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 313, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 339, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 365, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 392, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 418, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 444, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 471, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 497, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 523, 'min_samples_leaf': 600}
0.855 (+/-0.000) for {'min_samples_split': 550, 'min_samples_leaf': 600}

In [137]:
mss_opt = gr_dt.best_params_.values()[0]
msl_opt = gr_dt.best_params_.values()[1]

Compare Models: LR & DT


In [141]:
models = []
labels = []

# fit LR model
lr_opt = LogisticRegression(C=c_opt, penalty=p_opt, random_state=99)
lr_opt.fit(X_train, Y_train)
models.append(lr_opt)
labels.append('Logistic Regression')

# fit DT model
dt_opt = DecisionTreeClassifier(criterion='entropy', random_state=99, \
                                min_samples_split=mss_opt, min_samples_leaf=msl_opt)
dt_opt.fit(X_train, Y_train)
models.append(dt_opt)
labels.append('Decision Tree')

In [142]:
models


Out[142]:
[LogisticRegression(C=2.8651811561836724, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=99,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=10,
             min_samples_split=128, min_weight_fraction_leaf=0.0,
             presort=False, random_state=99, splitter='best')]

In [147]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 8, 8

#### Plot One model - train & validation sets ####
def plot_ROC(model, Y_train, X_train, Y_vali, X_vali, label):
    print ("AUC on the %s Train data = %.5f" % \
           (label, metrics.roc_auc_score(model.predict(X_train), Y_train)))
    print ("AUC on the %s Validation data = %.5f\n" % \
           (label, metrics.roc_auc_score(model.predict(X_vali), Y_vali)))

    #fpr, tpr, thresholds = metrics.roc_curve(Y_train, model.predict_proba(X_train)[:,1])    
    #plt.plot(fpr, tpr, label='{} Train set'.format(label))
    
    fpr, tpr, thresholds = metrics.roc_curve(Y_vali, model.predict_proba(X_vali)[:,1])    
    plt.plot(fpr, tpr, label='{} Validation set'.format(label))
    
    plt.xlabel("false-positive rate", size=18)
    plt.ylabel("true-positibe rate", size=18)
    plt.title("ROC Curve of {} Model".format(label), size=20)
    plt.legend(loc='best')

    
##### more than one model - validation sets #####
def plot_ROCs(models, Y_trains, X_trains, Y_valis, X_valis, labels):  
    for model, Y_train, X_train, Y_vali, X_vali, label in \
    zip(models, Y_trains, X_trains, Y_valis, X_valis, labels):
        
        print ("AUC on the %s Train data = %.5f" % \
               (label, metrics.roc_auc_score(model.predict(X_train), Y_train)))
        print ("AUC on the %s Validation data = %.5f\n" % \
               (label, metrics.roc_auc_score(model.predict(X_vali), Y_vali)))
        
        fpr, tpr, thresholds = metrics.roc_curve(Y_vali, model.predict_proba(X_vali)[:,1])
        plt.plot(fpr, tpr, label=label)
    
    plt.xlabel("fpr", size=18)
    plt.ylabel("tpr", size=18)
    plt.title("ROC Curves for Models and Validation sets", size=20)
    plt.legend(loc='best')

In [148]:
plot_ROC(lr_opt, Y_train, X_train, Y_vali, X_vali, label='LR')


AUC on the LR Train data = 0.72826
AUC on the LR Validation data = 0.76676


In [149]:
# Plot AUC for DT and LR optimal models
plot_ROCs(models, [Y_train]*2, [X_train]*2, [Y_vali]*2, [X_vali]*2, labels)


AUC on the Logistic Regression Train data = 0.72826
AUC on the Logistic Regression Validation data = 0.76676

AUC on the Decision Tree Train data = 0.73065
AUC on the Decision Tree Validation data = 0.68256


In [ ]: