In [2]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
In [3]:
# Load the Pima Indians diabetes dataset as a float array: 8 numeric feature
# columns followed by a binary outcome in column 8.
# NOTE(review): assumes the CSV sits next to the notebook — confirm path/provenance.
dataset = loadtxt('pima-indians-diabetes.data', delimiter=',')
In [4]:
# Split the loaded array into a feature matrix (all rows, first eight
# columns) and the target vector (the outcome column, index 8).
X, Y = dataset[:, :8], dataset[:, 8]
In [5]:
# Configuration for the train/test split below.
seed = 7          # random_state, so the split is reproducible
test_size = 0.33  # hold out one third of the rows for evaluation
In [7]:
# Reproducibly hold out `test_size` of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed)
In [8]:
# Gradient-boosted tree classifier with all-default hyperparameters.
model = XGBClassifier()
In [9]:
# Bare expression: Jupyter renders the estimator's repr (its hyperparameters).
model
Out[9]:
In [10]:
# Train on the training split; the fitted estimator is echoed as Out[10].
model.fit(X_train, y_train)
Out[10]:
In [13]:
# Predicted labels for the held-out test rows.
y_pred = model.predict(X_test)
In [20]:
# Coerce each prediction to the nearest integer class label.
predictions = list(map(round, y_pred))
In [21]:
# Fraction of test predictions that match the true labels.
accuracy = accuracy_score(y_test, predictions)
In [22]:
# Report accuracy as a percentage ('%%' escapes the literal percent sign).
print('Accuracy: %.2f%%' % (accuracy * 100.0))
In [23]:
# Re-fit, printing classification error on eval_set after each boosting round.
# NOTE(review): passing eval_metric to fit() was deprecated and later removed
# in newer xgboost releases (it moved to the XGBClassifier constructor) —
# verify against the installed xgboost version.
# NOTE(review): monitoring on the same set used for the accuracy score above
# leaks test data into training-time decisions.
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_metric='error', eval_set=eval_set, verbose=True)
Out[23]:
In [27]:
# early stopping
# NOTE(review): this cell previously re-imported loadtxt, XGBClassifier,
# train_test_split and accuracy_score — all four are already imported in the
# first cell, and re-imports scattered through a notebook are an
# anti-pattern — so the redundant import lines were removed. The data load
# and split are kept so this section remains self-contained; with the same
# file, columns and seed they reproduce the earlier split exactly.
# load data
dataset = loadtxt('pima-indians-diabetes.data', delimiter=",")
# split data into X and y
X = dataset[:, 0:8]
Y = dataset[:, 8]
# split data into train and test sets
seed = 7          # random_state, matching the earlier split
test_size = 0.33  # one third held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
In [28]:
# Fresh default model, monitored on the held-out split.
model = XGBClassifier()
eval_set = [(X_test, y_test)]
# Stop training if logloss on eval_set fails to improve for 10 consecutive
# rounds.
# NOTE(review): early_stopping_rounds and eval_metric as fit() kwargs were
# removed in xgboost >= 2.0 (they became constructor arguments) — confirm
# the installed version supports this call signature.
# NOTE(review): eval_set is the test set, so the stopping point is tuned on
# test data; a separate validation split would be cleaner.
model.fit(X_train, y_train, early_stopping_rounds=10,
eval_metric='logloss', eval_set=eval_set, verbose=True)
Out[28]:
In [30]:
# Score the early-stopped model on the held-out split and report the
# accuracy as a percentage.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print('Accuracy: %.2f%%' % (100.0 * accuracy))
In [31]:
# Per-feature importance scores from the fitted booster — one value per
# input column (X has 8 feature columns).
model.feature_importances_
Out[31]:
In [34]:
# NOTE(review): imports appear mid-notebook; conventionally they belong in
# the top import cell.
import matplotlib.pyplot as plt
from xgboost import plot_importance
# Horizontal bar chart of the fitted model's feature importances.
plot_importance(model)
plt.show()
In [38]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
In [39]:
# NOTE(review): duplicates the load/split cells at the top of the notebook
# (same file, same columns) — kept so the grid-search section is
# self-contained.
dataset = loadtxt('pima-indians-diabetes.data', delimiter=',')
X = dataset[:, 0:8]
Y = dataset[:, 8]
In [78]:
# Default classifier plus the hyperparameter grid to search:
# 6 learning rates x 7 tree depths = 42 combinations, evaluated under
# stratified 10-fold cross-validation with a fixed shuffle seed.
model = XGBClassifier()
param_grid = {
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
}
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
In [79]:
# Exhaustive search over the learning_rate/max_depth grid, scored by
# negative log-loss (higher, i.e. closer to 0, is better) under the
# stratified 10-fold CV defined above. This refits a model per fold per
# combination, so it is the expensive cell of the notebook.
grid_search = GridSearchCV(model, param_grid, scoring='neg_log_loss', cv=kfold, verbose=1)
grid_result = grid_search.fit(X, Y)
In [80]:
# NOTE(review): printing the GridSearchCV object only shows its repr
# (estimator + settings), not scores — the cells below report the results.
print(grid_result)
In [81]:
# Best mean CV score and the parameter combination that achieved it.
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
In [82]:
# Pull the per-combination summary arrays out of the fitted search object.
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
In [83]:
# One mean score per grid point (6 learning rates x 7 depths = 42).
means.shape
Out[83]:
In [84]:
# One line per grid point: mean (std) CV score and the parameter dict.
for avg_score, score_std, combo in zip(means, stds, params):
    print('%f (%f) with: %r' % (avg_score, score_std, combo))