In [6]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data


--2017-12-31 21:46:29--  https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data
Resolving archive.ics.uci.edu... 128.195.10.249
Connecting to archive.ics.uci.edu|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23279 (23K) [text/plain]
Saving to: ‘pima-indians-diabetes.data.1’

pima-indians-diabet 100%[===================>]  22.73K   143KB/s    in 0.2s    

2017-12-31 21:46:30 (143 KB/s) - ‘pima-indians-diabetes.data.1’ saved [23279/23279]


In [9]:
ls


pima-indians-diabetes.data  xgboost-tutorial.ipynb

In [35]:
from numpy import loadtxt
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

%matplotlib inline

In [10]:
# load data
dataset = loadtxt('pima-indians-diabetes.data', delimiter=',')

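The file has no header row; loadtxt reads it as a plain 768x9 numeric array. If you prefer named columns, a pandas equivalent looks like this. The short column names are the conventional ones from the UCI documentation, not anything the file itself provides, and df is not used in the rest of this notebook.

In [ ]:
import pandas as pd

# Conventional short names per the UCI documentation (assumption: the file
# ships without a header, in exactly this column order)
columns = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv('pima-indians-diabetes.data', header=None, names=columns)
df.head()
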
In [11]:
X = dataset[:, 0:8]
Y = dataset[:, 8]

In [14]:
X.shape, Y.shape


Out[14]:
((768, 8), (768,))

In [15]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

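With a two-class medical target, a plain random split can drift from the overall class ratio. train_test_split accepts a stratify argument for this; here is a minimal variant of the cell above (the *_s names are hypothetical and unused below; the rest of the notebook keeps the unstratified split):

In [ ]:
# Stratified variant: preserves the 0/1 class ratio in both partitions.
# Same seed and test size as above; only stratify=Y is new.
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed, stratify=Y)
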
In [16]:
model = XGBClassifier()

In [17]:
model


Out[17]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [18]:
model.fit(X_train, y_train)


Out[18]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

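fit() as called above always runs the full 100 boosting rounds. The scikit-learn wrapper of this xgboost generation also accepts an evaluation set and an early-stopping budget directly in fit(); a sketch under that assumption (es_model is a hypothetical name, and newer xgboost releases moved eval_metric and early_stopping_rounds to the constructor):

In [ ]:
# Sketch: watch log loss on the held-out set and stop once it fails to
# improve for 10 consecutive rounds (fit-time arguments as in xgboost 0.x)
es_model = XGBClassifier()
es_model.fit(X_train, y_train,
             eval_metric='logloss',
             eval_set=[(X_test, y_test)],
             early_stopping_rounds=10,
             verbose=False)
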
In [20]:
y_pred = model.predict(X_test)

In [24]:
# predict() already returns hard class labels (0.0/1.0 floats here),
# so round() just casts them to ints
predictions = [round(value) for value in y_pred]

In [29]:
accuracy = accuracy_score(y_test, predictions)

In [30]:
print('Accuracy: %.2f%%' % (accuracy * 100.0))


Accuracy: 77.95%

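The round() step is equivalent to thresholding the positive-class probability at 0.5, which predict() already does internally. When a different operating point is needed, predict_proba makes the threshold explicit; a minimal sketch (manual_predictions is a hypothetical name):

In [ ]:
# Column 1 of predict_proba() is P(class=1); thresholding at 0.5
# reproduces predict()'s labels (up to ties at exactly 0.5)
proba = model.predict_proba(X_test)[:, 1]
manual_predictions = (proba >= 0.5).astype(int)
accuracy_score(y_test, manual_predictions)
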
In [31]:
print(model.feature_importances_)


[ 0.07094595  0.1858108   0.08952703  0.08445946  0.07263514  0.16047297
  0.12837838  0.20777027]

In [37]:
plot_importance(model)


Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1cae0550>

[figure: horizontal bar chart of per-feature F scores from plot_importance]

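feature_importances_ is an unlabeled array in column order, and plot_importance only shows generic f0..f7 labels. Pairing the scores with the conventional UCI column names (an assumption, as above) makes the ranking readable:

In [ ]:
# Pair importances with the (assumed) UCI column names, highest first
feature_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
for name, score in sorted(zip(feature_names, model.feature_importances_),
                          key=lambda pair: pair[1], reverse=True):
    print('%-5s %.4f' % (name, score))
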
In [38]:
model.learning_rate


Out[38]:
0.1

In [40]:
dir(model)


Out[40]:
['_Booster',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_estimator_type',
 '_features_count',
 '_get_param_names',
 '_le',
 'apply',
 'base_score',
 'booster',
 'classes_',
 'colsample_bylevel',
 'colsample_bytree',
 'evals_result',
 'feature_importances_',
 'fit',
 'gamma',
 'get_params',
 'get_xgb_params',
 'learning_rate',
 'max_delta_step',
 'max_depth',
 'min_child_weight',
 'missing',
 'n_classes_',
 'n_estimators',
 'nthread',
 'objective',
 'predict',
 'predict_proba',
 'reg_alpha',
 'reg_lambda',
 'scale_pos_weight',
 'score',
 'seed',
 'set_params',
 'silent',
 'subsample']

In [41]:
model.n_estimators


Out[41]:
100

In [43]:
model.n_classes_


Out[43]:
2

In [44]:
model.subsample


Out[44]:
1

In [45]:
model.max_depth


Out[45]:
3

In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [47]:
n_estimators = [50, 100, 150, 200]
max_depth = [2, 4, 6, 8]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

In [48]:
param_grid


Out[48]:
{'max_depth': [2, 4, 6, 8], 'n_estimators': [50, 100, 150, 200]}

In [51]:
X = dataset[:, 0:8]
Y = dataset[:, 8]

In [52]:
model = XGBClassifier()
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring='neg_log_loss', n_jobs=1, cv=kfold, verbose=1)
result = grid_search.fit(X, Y)


Fitting 10 folds for each of 16 candidates, totalling 160 fits
[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:   30.3s finished

In [54]:
result.best_score_


Out[54]:
-0.4743699228699067

In [55]:
result.best_params_


Out[55]:
{'max_depth': 2, 'n_estimators': 100}

In [56]:
means = result.cv_results_['mean_test_score']

In [57]:
means


Out[57]:
array([-0.47896612, -0.47436992, -0.47833176, -0.48773776, -0.49183161,
       -0.50878154, -0.53464562, -0.56524915, -0.5141909 , -0.55201343,
       -0.59413648, -0.64052303, -0.54021524, -0.59921839, -0.64791661,
       -0.68629109])

In [58]:
std = result.cv_results_['std_test_score']

In [59]:
std


Out[59]:
array([ 0.03729732,  0.04089869,  0.04684618,  0.05188218,  0.05475398,
        0.06035662,  0.07157062,  0.07967163,  0.06392399,  0.07491469,
        0.07858727,  0.08881017,  0.07977898,  0.09148149,  0.10389971,
        0.11529137])

In [60]:
params = result.cv_results_['params']

In [61]:
params


Out[61]:
[{'max_depth': 2, 'n_estimators': 50},
 {'max_depth': 2, 'n_estimators': 100},
 {'max_depth': 2, 'n_estimators': 150},
 {'max_depth': 2, 'n_estimators': 200},
 {'max_depth': 4, 'n_estimators': 50},
 {'max_depth': 4, 'n_estimators': 100},
 {'max_depth': 4, 'n_estimators': 150},
 {'max_depth': 4, 'n_estimators': 200},
 {'max_depth': 6, 'n_estimators': 50},
 {'max_depth': 6, 'n_estimators': 100},
 {'max_depth': 6, 'n_estimators': 150},
 {'max_depth': 6, 'n_estimators': 200},
 {'max_depth': 8, 'n_estimators': 50},
 {'max_depth': 8, 'n_estimators': 100},
 {'max_depth': 8, 'n_estimators': 150},
 {'max_depth': 8, 'n_estimators': 200}]

In [63]:
for mean, stdev, param in zip(means, std, params):
    print('%f (%f) with: %r' % (mean, stdev, param))


-0.478966 (0.037297) with: {'max_depth': 2, 'n_estimators': 50}
-0.474370 (0.040899) with: {'max_depth': 2, 'n_estimators': 100}
-0.478332 (0.046846) with: {'max_depth': 2, 'n_estimators': 150}
-0.487738 (0.051882) with: {'max_depth': 2, 'n_estimators': 200}
-0.491832 (0.054754) with: {'max_depth': 4, 'n_estimators': 50}
-0.508782 (0.060357) with: {'max_depth': 4, 'n_estimators': 100}
-0.534646 (0.071571) with: {'max_depth': 4, 'n_estimators': 150}
-0.565249 (0.079672) with: {'max_depth': 4, 'n_estimators': 200}
-0.514191 (0.063924) with: {'max_depth': 6, 'n_estimators': 50}
-0.552013 (0.074915) with: {'max_depth': 6, 'n_estimators': 100}
-0.594136 (0.078587) with: {'max_depth': 6, 'n_estimators': 150}
-0.640523 (0.088810) with: {'max_depth': 6, 'n_estimators': 200}
-0.540215 (0.079779) with: {'max_depth': 8, 'n_estimators': 50}
-0.599218 (0.091481) with: {'max_depth': 8, 'n_estimators': 100}
-0.647917 (0.103900) with: {'max_depth': 8, 'n_estimators': 150}
-0.686291 (0.115291) with: {'max_depth': 8, 'n_estimators': 200}

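Scores here are negated log loss (scoring='neg_log_loss'), so values closer to zero are better: the shallow max_depth=2 models win, and deeper trees get worse as more rounds are added. GridSearchCV refits the best configuration on the full dataset by default (refit=True), so result.best_estimator_ is ready to use. A sketch that grabs it and plots one log-loss curve per max_depth, to make the table above easier to scan:

In [ ]:
# best_estimator_ is the winning model, refit on all of X, Y (refit=True default)
best_model = result.best_estimator_

# One CV log-loss curve per max_depth value
for depth in max_depth:
    scores = [m for m, p in zip(means, params) if p['max_depth'] == depth]
    plt.plot(n_estimators, scores, marker='o', label='max_depth=%d' % depth)
plt.xlabel('n_estimators')
plt.ylabel('mean CV score (neg. log loss)')
plt.legend()
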
In [ ]: