In [10]:

    
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np



In [19]:

    
# Load the training CSV into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook kernel was started from.
data = read_csv('../data/otto/train.csv')



In [20]:

    
# Row and column counts of the loaded frame, as a (rows, columns) tuple.
data.shape









    Out[20]:





(61878, 95)



In [21]:

    
# Preview the first five rows: an `id` column, the `feat_*` columns, and the
# string `target` label in the last column.
data.head()









    Out[21]:







  
    
      
      id
      feat_1
      feat_2
      feat_3
      feat_4
      feat_5
      feat_6
      feat_7
      feat_8
      feat_9
      ...
      feat_85
      feat_86
      feat_87
      feat_88
      feat_89
      feat_90
      feat_91
      feat_92
      feat_93
      target
    
  
  
    
      0
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      1
      0
      0
      0
      0
      0
      0
      0
      0
      Class_1
    
    
      1
      2
      0
      0
      0
      0
      0
      0
      0
      1
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      Class_1
    
    
      2
      3
      0
      0
      0
      0
      0
      0
      0
      1
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      Class_1
    
    
      3
      4
      1
      0
      0
      1
      6
      1
      5
      0
      0
      ...
      0
      1
      2
      0
      0
      0
      0
      0
      0
      Class_1
    
    
      4
      5
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      1
      0
      0
      0
      0
      1
      0
      0
      0
      Class_1
    
  

5 rows × 95 columns



In [11]:

    
# Sorted distinct class labels that occur in the `target` column.
np.unique(data['target'])









    Out[11]:





array(['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9'], dtype=object)



In [22]:

    
# Pull the underlying NumPy array out of the DataFrame so columns can be
# sliced by position below.
# NOTE(review): the frame mixes numeric columns with the string `target`
# column, so this array is presumably object-dtype — confirm.
dataset = data.values



In [25]:

    
# Split the raw array into features and target by position relative to the
# ends of the row instead of hard-coded indices: column 0 is `id`, the last
# column is `target`, and everything in between is a feature column. For the
# 95-column frame loaded above this selects exactly the same columns as
# 1:94 and 94, but keeps working if feature columns are added or removed.
#
# `dataset` comes from a frame that mixes integer columns with the string
# `target` column, so it is an object-dtype array. Cast the feature slice to
# a real numeric dtype so the estimator and the CV splitter work on a compact
# float matrix rather than an array of boxed Python objects.
X = dataset[:, 1:-1].astype(np.float32)
# The target stays as the original string labels; it is integer-encoded in a
# later cell.
y = dataset[:, -1]



In [27]:

    
# Convert the string class labels in `y` to integer codes. The fitted encoder
# is bound to its own name so the fit and the transform read as one step on a
# named object.
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(y)



In [28]:

    
# Display the encoded target to check that the labels are now integer codes.
label_encoded_y









    Out[28]:





array([0, 0, 0, ..., 8, 8, 8])



In [29]:

    
# Base estimator built with the library's default hyperparameters; it is the
# estimator handed to the grid search further down.
model = XGBClassifier()



In [33]:

    
# Candidate values for the number of boosted trees: 50 up to 350 in steps of
# 50 (the range stop of 400 is exclusive).
n_estimators = range(50, 400, 50)
# Grid handed to the search: a single hyperparameter mapped to its candidates.
param_grid = {'n_estimators': n_estimators}
# Echo the expanded candidate list so the grid is visible in the cell output.
print([*n_estimators])









    



[50, 100, 150, 200, 250, 300, 350]



In [34]:

    
# 10-fold stratified splitter. Shuffling is enabled with a fixed seed so the
# fold assignment is the same on every run.
kfold = StratifiedKFold(
    n_splits=10,
    random_state=7,
    shuffle=True,
)



In [ ]:

    
# Exhaustive search over `param_grid`, evaluated on the folds produced by
# `kfold` and scored with negative log loss. verbose=2 prints one line per
# individual fit.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='neg_log_loss',
    verbose=2,
)
# Run the search on the feature matrix and the integer-encoded labels; the
# value returned by fit() is kept under its own name for later inspection.
grid_result = grid_search.fit(X, label_encoded_y)









    



Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] n_estimators=50 .................................................
[CV] .................................. n_estimators=50, total=  25.7s
[CV] n_estimators=50 .................................................






    



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.3s remaining:    0.0s






    



[CV] .................................. n_estimators=50, total=  25.8s
[CV] n_estimators=50 .................................................



In [ ]:

	id	feat_1	feat_4	feat_5	feat_6	feat_7	feat_8	...	feat_85	feat_86	feat_87	feat_90	target
0	1	1	0	0	0	0	0	...	1	0	0	0	Class_1
1	2	0	0	0	0	0	1	...	0	0	0	0	Class_1
2	3	0	0	0	0	0	1	...	0	0	0	0	Class_1
3	4	1	1	6	1	5	0	...	0	1	2	0	Class_1
4	5	0	0	0	0	0	0	...	1	0	0	1	Class_1