In [10]:
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np

In [19]:
# Read the Otto training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = read_csv(filepath_or_buffer='../data/otto/train.csv')

In [20]:
# (rows, columns) of the loaded frame, as a quick sanity check on the read.
data.shape


Out[20]:
(61878, 95)

In [21]:
# Preview the leading rows to see the column layout.
data.head(n=5)


Out[21]:
id feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 ... feat_85 feat_86 feat_87 feat_88 feat_89 feat_90 feat_91 feat_92 feat_93 target
0 1 1 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 Class_1
1 2 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 Class_1
2 3 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 Class_1
3 4 1 0 0 1 6 1 5 0 0 ... 0 1 2 0 0 0 0 0 0 Class_1
4 5 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 1 0 0 0 Class_1

5 rows × 95 columns


In [11]:
# Enumerate the distinct labels found in the target column.
np.unique(data['target'])


Out[11]:
array(['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9'], dtype=object)

In [22]:
# Drop down to the frame's underlying NumPy array so columns can be sliced by position.
# NOTE(review): with the string `target` column present this is presumably an
# object-dtype array — confirm.
dataset = data.values

In [25]:
# Split the array into the feature matrix and the label vector by position:
# column 0 is the row id, the last column is the target, and everything in
# between is a feature. Negative indices replace the hard-coded 94 so the
# split does not depend on the exact column count.
# The features are cast to a numeric dtype once here because `dataset` mixes
# integer counts with string labels, so a plain slice would stay object-typed.
X = dataset[:, 1:-1].astype(np.float32)
y = dataset[:, -1]

In [27]:
# Encode the string class labels as integers for the classifier.
# The fitted encoder is kept (rather than discarded) so that integer
# predictions can be mapped back to the original class names with
# label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(y)

In [28]:
# Inspect the integer-encoded labels.
label_encoded_y


Out[28]:
array([0, 0, 0, ..., 8, 8, 8])

In [29]:
# Classifier built with the library's default settings; n_estimators is the
# only parameter varied by the grid search in the later cells.
model = XGBClassifier()

In [33]:
# Candidate numbers of boosting rounds: 50 up to 350 in steps of 50.
n_estimators = range(50, 400, 50)
# Grid consumed by the search: a single axis keyed by the estimator parameter name.
param_grid = {'n_estimators': n_estimators}
print([count for count in n_estimators])


[50, 100, 150, 200, 250, 300, 350]

In [34]:
# 10-fold stratified cross-validation; shuffling uses a fixed seed so the
# fold assignment is repeatable.
kfold = StratifiedKFold(
    random_state=7,
    shuffle=True,
    n_splits=10,
)

In [ ]:
# Exhaustive search over param_grid, scored by negative log loss across the
# kfold splits; verbose=2 reports each individual fit as it runs.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=kfold,
    verbose=2,
)
grid_result = grid_search.fit(X, y=label_encoded_y)


Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] n_estimators=50 .................................................
[CV] .................................. n_estimators=50, total=  25.7s
[CV] n_estimators=50 .................................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.3s remaining:    0.0s
[CV] .................................. n_estimators=50, total=  25.8s
[CV] n_estimators=50 .................................................

In [ ]: