In [ ]:
import numpy as np
from timeit import default_timer as timer
# Pre-splits the dataset into training and testing folds for all the classifiers to use.
def splitting_dataset(folder_name, dataset_name, num_folds):
from numpy import genfromtxt
from sklearn.model_selection import StratifiedKFold
# Loads the dataset.
dataset = genfromtxt(folder_name + '/' + dataset_name, delimiter=",")
# Splits the dataset into the data and the labels.
X = dataset[:, 0 : len(dataset[0]) - 1]
y = dataset[:, len(dataset[0]) - 1]
print("Splitting the whole dataset into training and testing folds...")
# Creating folds with StratifiedKFold.
skf = StratifiedKFold(n_splits = num_folds, random_state = None, shuffle = True)
skf.get_n_splits(X, y)
# Opening the dataset file for copying lines.
f_ds = open(folder_name + "/" + dataset_name)
# Creating a list from its lines.
dataset_lines = []
for line in f_ds:
dataset_lines.append(line)
    ctr = 0  # For naming the files.
# For each fold...
for train_index, test_index in skf.split(X, y):
        train_name = folder_name + '/' + 'fold' + str(ctr) + '_train.data'  # file for train instances
        test_name = folder_name + '/' + 'fold' + str(ctr) + '_test.data'  # file for test instances
f_train = open(train_name, 'w')
f_test = open(test_name, 'w')
# Selecting the training and testing instances + labels.
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
# Printing the training data (+labels) into the train file.
for i in train_index:
f_train.write(dataset_lines[i])
# Printing the testing data (+labels) into the test file.
for i in test_index:
f_test.write(dataset_lines[i])
        ctr += 1
f_train.close()
f_test.close()
print("%.2f %%" % (ctr / num_folds * 100))
f_ds.close()
print("The whole dataset has been split into folds!")
print("\n")
return
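# Example (hypothetical folder/file names): splitting_dataset('data', 'pima.csv', 10) would
# write data/fold0_train.data, data/fold0_test.data, ..., data/fold9_train.data and
# data/fold9_test.data, copying each instance's original CSV line into the appropriate fold files.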
# Performs the grid search (and, when needed, the extended grid search) for all of the classifiers.
def general_grid_search(folder_name, num_folds, missing):
# Grid search and extension for XGBoost.
parameters_xgboost = grid_search_xgboost(folder_name, num_folds, missing)
# Grid search and extension for RF.
parameters_rf = grid_search_rf(folder_name, num_folds, missing)
# Grid search and extension for GB
parameters_gb = grid_search_gb(folder_name, num_folds, missing)
return parameters_xgboost, parameters_rf, parameters_gb
# Grid search for XGBoost. Also checks whether an extension of the grid search
# is necessary and, if so, performs it by calling the extended grid-search function.
def grid_search_xgboost(folder_name, num_folds, missing):
from numpy import genfromtxt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import Imputer  # requires scikit-learn < 0.22; newer versions provide sklearn.impute.SimpleImputer instead
# Initializes the parameters' list.
#print("Initializing the parameters to test for XGBoost...")
learning_rate = [0.05, 0.1, 0.2]
max_depth = [3, 5, 6, 8]
subsample = [0.5, 0.8, 1]
gamma = [0, 0.1, 0.2, 0.3]
min_child_weight = [1, 3, 5]
#print("The parameters to test for XGBoost have been initialized!\n")
    # Initializes the lists that will hold the best parameter values found for each fold. They are returned at the end.
best_learning_rate = []
best_max_depth = []
best_subsample = []
best_gamma = []
best_min_child_weight = []
imputer = Imputer(missing_values = missing)
xgb_model = XGBClassifier(n_estimators = 200)
    # For each fold, loads the training dataset and performs the grid search on it.
for i in range(0, num_folds):
# Loads the dataset (training fold).
#print("Loading training dataset...")
dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
#print("Training dataset was loaded in!")
# Splits the dataset into the data and the labels.
#print("Splitting the dataset into data and labels...")
X = dataset[:, 0 : len(dataset[0]) - 1]
Y = dataset[:, len(dataset[0]) - 1]
X = imputer.fit_transform(X, Y)
#print("The data and labels from the dataset have been split!")
# Timed grid search.
start = timer()
param_grid = dict(learning_rate = learning_rate, max_depth = max_depth,
subsample = subsample, gamma = gamma, min_child_weight = min_child_weight)
#print("Starting the grid search...")
kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
grid_search = GridSearchCV(xgb_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
grid_result = grid_search.fit(X, Y)
#print("The grid search is over!")
end = timer()
time_grid_xgb.append(end - start)
# Summarizes results.
print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))
best_learning_rate.append(grid_result.best_params_['learning_rate'])
best_max_depth.append(grid_result.best_params_['max_depth'])
best_subsample.append(grid_result.best_params_['subsample'])
best_gamma.append(grid_result.best_params_['gamma'])
best_min_child_weight.append(grid_result.best_params_['min_child_weight'])
print("%.2f %%\n" % ((i + 1) / num_folds * 100))
print("Done with the normal grid search for XGBoost!\n")
# Grid extension.
parameters_xgboost = grid_extension_xgboost(folder_name, num_folds, missing, best_learning_rate,
best_max_depth, best_subsample, best_gamma, best_min_child_weight)
return parameters_xgboost
# Grid search for Random Forest. Also checks whether an extension of the grid search
# is necessary and, if so, performs it by calling the extended grid-search function.
def grid_search_rf(folder_name, num_folds, missing):
from numpy import genfromtxt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Imputer
# Initializes the parameters' list.
#print("Initializing the parameters to test for Random Forest...")
max_features = ["auto", "log2", None]
min_samples_leaf = [1, 25, 50, 70]
max_depth = [None, 5, 8, 10]
min_samples_split = [2, 5, 8, 10]
#print("The parameters to test for Random Forest have been initialized!\n")
    # Initializes the lists that will hold the best parameter values found for each fold. They are returned at the end.
best_max_features = []
best_min_samples_leaf = []
best_max_depth = []
best_min_samples_split = []
imputer = Imputer(missing_values = missing)
rf_model = RandomForestClassifier(n_estimators = 200)
    # For each fold, loads the training dataset and performs the grid search on it.
for i in range(0, num_folds):
# Loads the dataset (training fold).
#print("Loading training dataset...")
dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
#print("Training dataset was loaded in!")
# Splits the dataset into the data and the labels.
#print("Splitting the dataset into data and labels...")
X = dataset[:, 0 : len(dataset[0]) - 1]
Y = dataset[:, len(dataset[0]) - 1]
X = imputer.fit_transform(X, Y)
#print("The data and labels from the dataset have been split!")
# Timed grid search.
start = timer()
param_grid = dict(max_features = max_features, min_samples_leaf = min_samples_leaf,
max_depth = max_depth, min_samples_split = min_samples_split)
#print("Starting the grid search...")
kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
grid_search = GridSearchCV(rf_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
grid_result = grid_search.fit(X, Y)
#print("The grid search is over!")
end = timer()
time_grid_rf.append(end - start)
# Summarize results
print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))
best_max_features.append(grid_result.best_params_['max_features'])
best_min_samples_leaf.append(grid_result.best_params_['min_samples_leaf'])
best_max_depth.append(grid_result.best_params_['max_depth'])
best_min_samples_split.append(grid_result.best_params_['min_samples_split'])
print("%.2f %%\n" % ((i + 1) / num_folds * 100))
print("Done with the normal grid search for Random Forests!\n")
# Grid extension.
parameters_rf = grid_extension_rf(folder_name, num_folds, missing, best_max_features, best_min_samples_leaf,
best_max_depth, best_min_samples_split)
return parameters_rf
# Grid search for Gradient Boosting. Also checks whether an extension of the grid search
# is necessary and, if so, performs it by calling the extended grid-search function.
def grid_search_gb(folder_name, num_folds, missing):
from numpy import genfromtxt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Imputer
# Initializes the parameters' list.
#print("Initializing the parameters to test for Gradient Boosting...")
learning_rate = [0.05, 0.1, 0.2]
max_depth = [3, 5, 6, 8]
subsample = [0.5, 0.8, 1]
max_features = ["auto", "log2", None]
min_samples_split = [2, 5, 8, 10]
#print("The parameters to test for Gradient Boosting have been initialized!\n")
    # Initializes the lists that will hold the best parameter values found for each fold. They are returned at the end.
best_learning_rate = []
best_max_depth = []
best_subsample = []
best_max_features = []
best_min_samples_split = []
imputer = Imputer(missing_values = missing)
gb_model = GradientBoostingClassifier(n_estimators = 200)
    # For each fold, loads the training dataset and performs the grid search on it.
for i in range(0, num_folds):
# Loads the dataset (training fold).
#print("Loading training dataset...")
dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
#print("Training dataset was loaded in!")
# Splits the dataset into the data and the labels.
#print("Splitting the dataset into data and labels...")
X = dataset[:, 0 : len(dataset[0]) - 1]
Y = dataset[:, len(dataset[0]) - 1]
X = imputer.fit_transform(X, Y)
#print("The data and labels from the dataset have been split!")
# Timed grid search.
start = timer()
param_grid = dict(learning_rate = learning_rate, max_depth = max_depth, subsample = subsample,
max_features = max_features, min_samples_split = min_samples_split)
#print("Starting the grid search...")
kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
grid_search = GridSearchCV(gb_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
grid_result = grid_search.fit(X, Y)
#print("The grid search is over!")
end = timer()
time_grid_gb.append(end - start)
# Summarize results
print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))
best_learning_rate.append(grid_result.best_params_['learning_rate'])
best_max_depth.append(grid_result.best_params_['max_depth'])
best_subsample.append(grid_result.best_params_['subsample'])
best_max_features.append(grid_result.best_params_['max_features'])
best_min_samples_split.append(grid_result.best_params_['min_samples_split'])
print("%.2f %%\n" % ((i + 1) / num_folds * 100))
print("Done with the normal grid search for Gradient Boosting!\n")
# Grid extension.
parameters_gb = grid_extension_gb(folder_name, num_folds, missing, best_learning_rate, best_max_depth,
best_subsample, best_max_features, best_min_samples_split)
return parameters_gb
# Checks whether the grid extension is needed for XGBoost and performs it if that is the case.
# In either case, it also groups all the parameters into one 2D array to avoid passing too
# many separate parameters around.
def grid_extension_xgboost(folder_name, num_folds, missing, best_learning_rate,
best_max_depth, best_subsample, best_gamma,
best_min_child_weight):
from numpy import genfromtxt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Imputer
    need = False  # Flag that will be used to decide whether the extension is needed.
# Checks whether the grid extension is needed.
if (best_learning_rate.count(0.05) >= num_folds / 2 or best_learning_rate.count(0.2) >= num_folds / 2
or best_max_depth.count(3) >= num_folds / 2 or best_max_depth.count(8) >= num_folds / 2
or best_subsample.count(0.5) >= num_folds / 2 or best_subsample.count(1) >= num_folds / 2
or best_gamma.count(0) >= num_folds / 2 or best_gamma.count(0.3) >= num_folds / 2
or best_min_child_weight.count(1) >= num_folds / 2 or best_min_child_weight.count(5) >= num_folds / 2):
need = True
# If the extension is needed...
if need == True:
#print("\nStarting the grid expansion...")
imputer = Imputer(missing_values = missing)
# For each fold, load the training dataset in and perform the grid search on it.
for i in range(0, num_folds):
# Load the dataset (training fold)
#print("Loading training dataset...")
dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
#print("Training dataset was loaded in!")
# Split the dataset into the data and the labels
#print("Splitting the dataset into data and labels...")
X = dataset[:, 0 : len(dataset[0]) - 1]
Y = dataset[:, len(dataset[0]) - 1]
X = imputer.fit_transform(X, Y)
#print("The data and labels from the dataset have been split!")
# Initializes the model with the best found parameter for each fold (individually).
# The parameters on the extreme of the grid will be added to the parameters' dictionary.
xgb_model = XGBClassifier(n_estimators = 200, learning_rate = best_learning_rate[i], max_depth = best_max_depth[i],
subsample = best_subsample[i], gamma = best_gamma[i],
min_child_weight = best_min_child_weight[i])
param_grid = dict()
if best_learning_rate[i] == 0.05 or best_learning_rate[i] == 0.2:
if best_learning_rate[i] == 0.05:
learning_rate = [0.01, 0.03, 0.05, 0.07]
else:
learning_rate = [0.15, 0.2, 0.25, 0.3]
param_grid['learning_rate'] = learning_rate
bool_lr = True
else:
bool_lr = False
if best_max_depth[i] == 3 or best_max_depth[i] == 8:
if best_max_depth[i] == 3:
max_depth = [1, 2, 3, 4]
else:
max_depth = [7, 8, 9, 10]
param_grid['max_depth'] = max_depth
bool_md = True
else:
bool_md = False
if best_subsample[i] == 0.5 or best_subsample[i] == 1:
if best_subsample[i] == 0.5:
subsample = [0.4, 0.5, 0.6]
else:
subsample = [0.9, 0.95, 1]
param_grid['subsample'] = subsample
bool_s = True
else:
bool_s = False
if best_gamma[i] == 0 or best_gamma[i] == 0.3:
if best_gamma[i] == 0:
gamma = [0, 0.03, 0.05]
else:
gamma = [0.25, 0.3, 0.4]
param_grid['gamma'] = gamma
bool_g = True
else:
bool_g = False
if best_min_child_weight[i] == 1 or best_min_child_weight[i] == 5:
if best_min_child_weight[i] == 1:
min_child_weight = [0, 1, 2]
else:
min_child_weight = [5, 6, 7, 8]
param_grid['min_child_weight'] = min_child_weight
bool_mcw = True
else:
bool_mcw = False
# Timed extended grid search.
start = timer()
            #print("Starting the (extended) grid search...")
kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
grid_search = GridSearchCV(xgb_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
grid_result = grid_search.fit(X, Y)
            #print("The (extended) grid search is over!")
end = timer()
time_grid_xgb[i] += (end - start)
# Summarizes results.
print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))
# Replaces the old parameters on the extreme of the grid by the newly found values.
if bool_lr == True:
best_learning_rate[i] = grid_result.best_params_['learning_rate']
if bool_md == True:
best_max_depth[i] = grid_result.best_params_['max_depth']
if bool_s == True:
best_subsample[i] = grid_result.best_params_['subsample']
if bool_g == True:
best_gamma[i] = grid_result.best_params_['gamma']
if bool_mcw == True:
best_min_child_weight[i] = grid_result.best_params_['min_child_weight']
print("%.2f %%\n" % ((i + 1) / num_folds * 100))
print("Done with the grid expansion for XGBoost!\n")
parameters_xgboost = [best_learning_rate, best_max_depth, best_subsample, best_gamma, best_min_child_weight]
return parameters_xgboost
# Checks whether the grid extension is needed for Random Forest and performs it if that is the case.
# In either case, it also groups all the parameters into one 2D array to avoid passing too
# many separate parameters around.
def grid_extension_rf(folder_name, num_folds, missing, best_max_features, best_min_samples_leaf,
best_max_depth, best_min_samples_split):
from numpy import genfromtxt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Imputer
    need = False  # Flag that will be used to decide whether the extension is needed.
    # Checks whether the grid extension is needed. The lower edge of the min_samples_split grid is 2
    # (the smallest value scikit-learn accepts), so that is the value checked here.
    if (best_min_samples_leaf.count(1) >= num_folds / 2 or best_min_samples_leaf.count(70) >= num_folds / 2
        or best_max_depth.count(10) >= num_folds / 2 or best_min_samples_split.count(2) >= num_folds / 2
        or best_min_samples_split.count(10) >= num_folds / 2):
need = True
# If the extension is needed...
if need == True:
#print("\nStarting the grid expansion...")
imputer = Imputer(missing_values = missing)
        # For each fold, loads the training dataset and performs the grid search on it.
for i in range(0, num_folds):
# Loads the dataset (training fold).
#print("Loading training dataset...")
dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
#print("Training dataset was loaded in!")
# Splits the dataset into the data and the labels.
#print("Splitting the dataset into data and labels...")
X = dataset[:, 0 : len(dataset[0]) - 1]
Y = dataset[:, len(dataset[0]) - 1]
X = imputer.fit_transform(X, Y)
#print("The data and labels from the dataset have been split!")
# Initializes the model with the best found parameter for each fold (individually).
# The parameters on the extreme of the grid will be added to the parameters' dictionary.
rf_model = RandomForestClassifier(n_estimators = 200, max_features = best_max_features[i],
min_samples_leaf = best_min_samples_leaf[i], max_depth = best_max_depth[i],
min_samples_split = best_min_samples_split[i])
param_grid = dict()
if best_min_samples_leaf[i] == 1 or best_min_samples_leaf[i] == 70:
if best_min_samples_leaf[i] == 1:
min_samples_leaf = [1, 5, 10, 15]
else:
min_samples_leaf = [60, 70, 80]
param_grid['min_samples_leaf'] = min_samples_leaf
bool_msl = True
else:
bool_msl = False
if best_max_depth[i] == 10:
max_depth = [9, 10, 15, 20]
param_grid['max_depth'] = max_depth
bool_md = True
else:
bool_md = False
            if best_min_samples_split[i] == 2 or best_min_samples_split[i] == 10:
                if best_min_samples_split[i] == 2:
                    min_samples_split = [2, 3, 4]
                else:
                    min_samples_split = [9, 10, 11, 15]
                param_grid['min_samples_split'] = min_samples_split
bool_mss = True
else:
bool_mss = False
# Timed extended grid search.
start = timer()
            #print("Starting the (extended) grid search...")
kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
grid_search = GridSearchCV(rf_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
grid_result = grid_search.fit(X, Y)
            #print("The (extended) grid search is over!")
end = timer()
time_grid_rf[i] += (end - start)
# Summarizes results.
print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))
# Replaces the old parameters on the extreme of the grid by the newly found values.
if bool_msl == True:
best_min_samples_leaf[i] = grid_result.best_params_['min_samples_leaf']
if bool_md == True:
best_max_depth[i] = grid_result.best_params_['max_depth']
if bool_mss == True:
best_min_samples_split[i] = grid_result.best_params_['min_samples_split']
print("%.2f %%\n" % ((i + 1) / num_folds * 100))
print("Done with the grid expansion for Random Forests!\n")
parameters_rf = [best_max_features, best_min_samples_leaf, best_max_depth, best_min_samples_split]
return parameters_rf
# Checks whether the grid extension is needed for Gradient Boosting and performs it if that is the case.
# In either case, it also groups all the parameters into one 2D array to avoid passing too
# many separate parameters around.
def grid_extension_gb(folder_name, num_folds, missing, best_learning_rate,
best_max_depth, best_subsample, best_max_features,
best_min_samples_split):
from numpy import genfromtxt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Imputer
    need = False  # Flag that will be used to decide whether the extension is needed.
    # Checks whether the grid extension is needed. As for Random Forest, the lower edge of the
    # min_samples_split grid is 2 (the smallest value scikit-learn accepts).
    if (best_learning_rate.count(0.05) >= num_folds / 2 or best_learning_rate.count(0.2) >= num_folds / 2
        or best_max_depth.count(3) >= num_folds / 2 or best_max_depth.count(8) >= num_folds / 2
        or best_subsample.count(0.5) >= num_folds / 2 or best_subsample.count(1) >= num_folds / 2
        or best_min_samples_split.count(2) >= num_folds / 2 or best_min_samples_split.count(10) >= num_folds / 2):
need = True
# If the extension is needed...
if need == True:
#print("\nStarting the grid expansion...")
imputer = Imputer(missing_values = missing)
# For each fold, load the training dataset in and perform the grid search on it.
for i in range(0, num_folds):
# Load the dataset (training fold)
#print("Loading training dataset...")
dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
#print("Training dataset was loaded in!")
# Split the dataset into the data and the labels
#print("Splitting the dataset into data and labels...")
X = dataset[:, 0 : len(dataset[0]) - 1]
Y = dataset[:, len(dataset[0]) - 1]
X = imputer.fit_transform(X, Y)
#print("The data and labels from the dataset have been split!")
# Initializes the model with the best found parameter for each fold (individually).
# The parameters on the extreme of the grid will be added to the parameters' dictionary.
gb_model = GradientBoostingClassifier(n_estimators = 200, learning_rate = best_learning_rate[i],
max_depth = best_max_depth[i], subsample = best_subsample[i],
max_features = best_max_features[i], min_samples_split = best_min_samples_split[i])
param_grid = dict()
if best_learning_rate[i] == 0.05 or best_learning_rate[i] == 0.2:
if best_learning_rate[i] == 0.05:
learning_rate = [0.01, 0.03, 0.05, 0.07]
else:
learning_rate = [0.15, 0.2, 0.25, 0.3]
param_grid['learning_rate'] = learning_rate
bool_lr = True
else:
bool_lr = False
if best_max_depth[i] == 3 or best_max_depth[i] == 8:
if best_max_depth[i] == 3:
max_depth = [1, 2, 3, 4]
else:
max_depth = [7, 8, 9, 10]
param_grid['max_depth'] = max_depth
bool_md = True
else:
bool_md = False
if best_subsample[i] == 0.5 or best_subsample[i] == 1:
if best_subsample[i] == 0.5:
subsample = [0.4, 0.5, 0.6]
else:
subsample = [0.9, 0.95, 1]
param_grid['subsample'] = subsample
bool_s = True
else:
bool_s = False
            if best_min_samples_split[i] == 2 or best_min_samples_split[i] == 10:
                if best_min_samples_split[i] == 2:
                    min_samples_split = [2, 3, 4]
                else:
                    min_samples_split = [9, 10, 11, 15]
                param_grid['min_samples_split'] = min_samples_split
bool_mss = True
else:
bool_mss = False
# Timed extended grid search.
start = timer()
            #print("Starting the (extended) grid search...")
kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
grid_search = GridSearchCV(gb_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
grid_result = grid_search.fit(X, Y)
            #print("The (extended) grid search is over!")
end = timer()
time_grid_gb[i] += (end - start)
# Summarizes results.
print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))
# Replaces the old parameters on the extreme of the grid by the newly found values.
if bool_lr == True:
best_learning_rate[i] = grid_result.best_params_['learning_rate']
if bool_md == True:
best_max_depth[i] = grid_result.best_params_['max_depth']
if bool_s == True:
best_subsample[i] = grid_result.best_params_['subsample']
if bool_mss == True:
best_min_samples_split[i] = grid_result.best_params_['min_samples_split']
print("%.2f %%\n" % ((i + 1) / num_folds * 100))
print("Done with the grid expansion for Gradient Boosting!\n")
parameters_gb = [best_learning_rate, best_max_depth, best_subsample, best_max_features, best_min_samples_split]
return parameters_gb
# Trains all the models with the final parameters and tests them.
def train_and_test(folder_name, num_folds, missing, parameters_xgboost, parameters_rf, parameters_gb):
from numpy import genfromtxt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
# Initializes the default models.
print("Initializing all the default models...")
xgb_d_model = XGBClassifier(n_estimators = 200)
rf_d_model = RandomForestClassifier(n_estimators = 200)
gb_d_model = GradientBoostingClassifier(n_estimators = 200)
print("All the default models have been initialized!")
# Will store the accuracy for each fold. An average will then be computed.
xgb_d_results = []
xgb_results = []
rf_d_results = []
rf_results = []
gb_d_results = []
gb_results = []
imputer = Imputer(missing_values = missing)
depth_rf_d = []
depth_rf = []
# For each fold, loads the training/testing dataset in and fits the models before testing.
for i in range(0, num_folds):
# Loads the dataset (training fold).
#print("Loading the training set...")
dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
#print("Training set was loaded in!")
# Split the dataset into the data and the labels
#print("Splitting the dataset into data and labels...")
X = dataset[:, 0 : len(dataset[0]) - 1]
Y = dataset[:, len(dataset[0]) - 1]
X = imputer.fit_transform(X, Y)
#print("The data and labels from the dataset have been split!")
print("Training all the default models over the fold with the best parameters...")
start = timer()
xgb_d_model.fit(X, Y)
end = timer()
time_fit_xgb_d.append(end - start)
start = timer()
rf_d_model.fit(X, Y)
end = timer()
time_fit_rf_d.append(end - start)
start = timer()
gb_d_model.fit(X, Y)
end = timer()
time_fit_gb_d.append(end - start)
print("All the default models have been trained!")
print("Initializing all the tuned models and training them over the fold with the best parameters...")
xgb_model = XGBClassifier(n_estimators = 200, learning_rate = parameters_xgboost[0][i],
max_depth = parameters_xgboost[1][i], subsample = parameters_xgboost[2][i],
gamma = parameters_xgboost[3][i], min_child_weight = parameters_xgboost[4][i])
rf_model = RandomForestClassifier(n_estimators = 200, max_features = parameters_rf[0][i],
min_samples_leaf = parameters_rf[1][i], max_depth = parameters_rf[2][i],
min_samples_split = parameters_rf[3][i])
gb_model = GradientBoostingClassifier(n_estimators = 200, learning_rate = parameters_gb[0][i],
max_depth = parameters_gb[1][i], subsample = parameters_gb[2][i],
max_features = parameters_gb[3][i], min_samples_split = parameters_gb[4][i])
start = timer()
xgb_model.fit(X, Y)
end = timer()
time_fit_xgb.append(end - start)
start = timer()
rf_model.fit(X, Y)
end = timer()
time_fit_rf.append(end - start)
start = timer()
gb_model.fit(X, Y)
end = timer()
time_fit_gb.append(end - start)
print("All the tuned models have been initialized and trained!")
# Loads the dataset (testing fold).
#print("Loading testing dataset...")
testing = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_test.data', delimiter=",")
#print("Testing dataset was loaded in!")
# Splits the testing fold into the data and the labels.
#print("Splitting the testing set into data and labels...")
        X_test = testing[:, 0 : len(testing[0]) - 1]
        Y_test = testing[:, len(testing[0]) - 1]
        X_test = imputer.fit_transform(X_test, Y_test)  # Note: re-fits the imputer on the test fold; imputer.transform(X_test) would reuse the training-fold statistics.
#print("The data and labels from the testing set have been split!\n")
# Makes predictions for test data.
xgb_d_Y_pred = xgb_d_model.predict(X_test)
xgb_d_predictions = [round(value) for value in xgb_d_Y_pred]
xgb_Y_pred = xgb_model.predict(X_test)
xgb_predictions = [round(value) for value in xgb_Y_pred]
rf_d_Y_pred = rf_d_model.predict(X_test)
rf_d_predictions = [round(value) for value in rf_d_Y_pred]
rf_Y_pred = rf_model.predict(X_test)
rf_predictions = [round(value) for value in rf_Y_pred]
gb_d_Y_pred = gb_d_model.predict(X_test)
gb_d_predictions = [round(value) for value in gb_d_Y_pred]
gb_Y_pred = gb_model.predict(X_test)
gb_predictions = [round(value) for value in gb_Y_pred]
# Evaluates predictions.
xgb_d_accuracy = accuracy_score(Y_test, xgb_d_predictions)
xgb_accuracy = accuracy_score(Y_test, xgb_predictions)
rf_d_accuracy = accuracy_score(Y_test, rf_d_predictions)
rf_accuracy = accuracy_score(Y_test, rf_predictions)
gb_d_accuracy = accuracy_score(Y_test, gb_d_predictions)
gb_accuracy = accuracy_score(Y_test, gb_predictions)
# Saves the predictions results.
xgb_d_results.append(xgb_d_accuracy)
xgb_results.append(xgb_accuracy)
rf_d_results.append(rf_d_accuracy)
rf_results.append(rf_accuracy)
gb_d_results.append(gb_d_accuracy)
gb_results.append(gb_accuracy)
#print("Default XGBoost accuracy %.2f%%" % (xgb_d_accuracy * 100.0))
#print("XGBoost accuracy: %.2f%%" % (xgb_accuracy * 100.0))
#print("Default Random Forests accuracy: %.2f%%" % (rf_d_accuracy * 100.0))
#print("Random Forests accuracy: %.2f%%" % (rf_accuracy * 100.0))
#print("Default Gradient Boosting accuracy: %.2f%%" % (gb_d_accuracy * 100.0))
#print("Gradient Boosting accuracy: %.2f%% \n" % (gb_accuracy * 100.0))
print("%.2f %%\n" % ((i + 1) / num_folds * 100))
# Saving the depth of the trees for random forests.
tmp = [estimator.tree_.max_depth for estimator in rf_d_model.estimators_]
depth_rf_d.append(sum(tmp) / len(tmp))
tmp = [estimator.tree_.max_depth for estimator in rf_model.estimators_]
depth_rf.append(sum(tmp) / len(tmp))
return xgb_d_results, xgb_results, rf_d_results, rf_results, gb_d_results, gb_results, depth_rf_d, depth_rf
# Finds the most frequent parameters of a classifier out of its parameters array.
def find_most_frequent_parameters(parameters):
n_parameters = len(parameters) # Number of different parameters
most_frequent_parameters = [] # 1D array containing the most frequent parameters.
for i in range(0, n_parameters):
for j in range(0, len(parameters[i])):
            if parameters[i][j] is None:
parameters[i][j] = -1
(values, counts) = np.unique(parameters[i], return_counts = True)
ind = np.argmax(counts)
most_frequent_parameters.append(values[ind]) # Stores the most frequent value for parameter i
if most_frequent_parameters[i] == -1:
most_frequent_parameters[i] = None
return most_frequent_parameters
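# Rough illustration (hypothetical values): with parameters = [[3, 3, 5], [None, 8, None]],
# the None entries are first mapped to -1 (in place, so the input lists are modified) so that
# np.unique can count them, and the returned per-parameter modes would be [3, None].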
# Prints the final accuracy results.
def print_results(num_folds, xgb_d_results, xgb_results, rf_d_results, rf_results, gb_d_results, gb_results):
print("\t\t XGB d.\t\t XGB t.\t\t RF d.\t\t RF t.\t\t GB d.\t\t GB t.")
for i in range(0, num_folds):
print("Fold ", i + 1, "\t %.2f%%" % (xgb_d_results[i] * 100.0), "\t %.2f%%" % (xgb_results[i] * 100.0),
"\t %.2f%%" % (rf_d_results[i] * 100), "\t %.2f%%" % (rf_results[i] * 100),
"\t %.2f%%" % (gb_d_results[i] * 100), "\t %.2f%%" % (gb_results[i] * 100))
print("Average", "\t %.2f%%" % (sum(xgb_d_results) / float(len(xgb_d_results)) * 100.0),
"\t %.2f%%" % (sum(xgb_results) / float(len(xgb_results)) * 100.0),
"\t %.2f%%" % (sum(rf_d_results) / float(len(rf_d_results)) * 100.0),
"\t %.2f%%" % (sum(rf_results) / float(len(rf_results)) * 100.0),
"\t %.2f%%" % (sum(gb_d_results) / float(len(gb_d_results)) * 100.0),
"\t %.2f%%" % (sum(gb_results) / float(len(gb_results)) * 100.0))
print("Std dev", "\t %.2f" % (np.std(xgb_d_results)), "\t\t %.2f" % (np.std(xgb_results)),
"\t\t %.2f" % (np.std(rf_d_results)), "\t\t %.2f" % (np.std(rf_results)), "\t\t %.2f" % (np.std(gb_d_results)),
"\t\t %.2f" % (np.std(gb_results)))
print("\n")
print("\n")
return
# Prints the time for finding the optimum parameters.
def print_timing(num_folds):
print("\t\tXGB d.\t XGB t.\t\t RF d.\t RF t.\t\t GB d.\t GB t.")
for i in range(0, num_folds):
print("Fold ", i + 1, "\t %.2f" % time_fit_xgb_d[i], "\t %.2f" % time_grid_xgb[i], "+ %.2f" % time_fit_xgb[i],
"\t %.2f" % time_fit_rf_d[i], "\t %.2f" % time_grid_rf[i], "+ %.2f" % time_fit_rf[i],
"\t %.2f" % time_fit_gb_d[i], "\t %.2f" % time_grid_gb[i], "+ %.2f" % time_fit_gb[i])
print("Average", "\t %.2f" % (sum(time_fit_xgb_d) / float(len(time_fit_xgb_d))),
"\t %.2f" % (sum(time_grid_xgb) / float(len(time_grid_xgb))), "+ %.2f" % (sum(time_fit_xgb) / float(len(time_fit_xgb))),
"\t %.2f" % (sum(time_fit_rf_d) / float(len(time_fit_rf_d))),
"\t %.2f" % (sum(time_grid_rf) / float(len(time_grid_rf))), "+ %.2f" % (sum(time_fit_rf) / float(len(time_fit_rf))),
"\t %.2f" % (sum(time_fit_gb_d) / float(len(time_fit_gb_d))),
"\t %.2f" % (sum(time_grid_gb) / float(len(time_grid_gb))), "+ %.2f" % (sum(time_fit_gb) / float(len(time_fit_gb))))
print("\n")
print("\n")
return
# Prints the average depth of each model.
def print_depth(parameters_xgboost, parameters_gb, depth_rf_d, depth_rf):
print("Depth of default XGBoost: ", 3) # Default value.
print("Depth of tuned XGBoost: ", sum(parameters_xgboost[1]) / len(parameters_xgboost[1])) # Average max_depth value.
print("Depth of default Random Forest: ", sum(depth_rf_d) / len(depth_rf_d)) # Since default value is "None", mean of all the depths.
print("Depth of tuned Random Forest: ", sum(depth_rf) / len(depth_rf)) # Average value between the "None" and the "fixed" depths.
print("Depth of default Gradient Boosting:", 3) # Default value.
print("Depth of tuned Gradient Boosting: ", sum(parameters_gb[1]) / len(parameters_gb[1])) # Average max_depth value.
print("\n")
print("\n")
return
# Prints the mode of the parameters.
def print_mode(parameters_xgboost, parameters_rf, parameters_gb):
# XGBoost
most_frequent_xgboost = find_most_frequent_parameters(parameters_xgboost)
print("XGBoost most frequent parameters")
print("Learning rate: ", most_frequent_xgboost[0])
print("Depth: ", most_frequent_xgboost[1])
print("Subsample: ", most_frequent_xgboost[2])
print("Gamma: ", most_frequent_xgboost[3])
print("Min child weight: ", most_frequent_xgboost[4])
# Random Forests
most_frequent_rf = find_most_frequent_parameters(parameters_rf)
print("\n")
print("Random Forest most frequent parameters")
print("Max features: ", most_frequent_rf[0])
print("Min samples leaf: ", most_frequent_rf[1])
print("Max depth: ", most_frequent_rf[2])
print("Min samples split: ", most_frequent_rf[3])
# Gradient Boosting
most_frequent_gb = find_most_frequent_parameters(parameters_gb)
print("\n")
print("Gradient Boosting most frequent parameters")
print("Learning rate: ", most_frequent_gb[0])
print("Depth: ", most_frequent_gb[1])
print("Subsample: ", most_frequent_gb[2])
print("Max features: ", most_frequent_gb[3])
print("Min samples split: ", most_frequent_gb[4])
return
# Compare the three classifiers.
def compare_classifiers(folder_name, dataset_name, num_folds, missing):
# Initializing all the variables for the time measurements.
global time_grid_xgb, time_grid_rf, time_grid_gb
global time_fit_xgb_d, time_fit_xgb
global time_fit_rf_d, time_fit_rf
global time_fit_gb_d, time_fit_gb
time_grid_xgb = []
time_grid_rf = []
time_grid_gb = []
time_fit_xgb_d = []
time_fit_xgb = []
time_fit_rf_d = []
time_fit_rf = []
time_fit_gb_d = []
time_fit_gb = []
# Splitting the dataset
splitting_dataset(folder_name, dataset_name, num_folds)
# Learning the parameters over the basic grid for XGBoost, RF and GB
parameters_xgboost, parameters_rf, parameters_gb = general_grid_search(folder_name, num_folds, missing)
# Training and testing with the best parameters of each fold for each classifier.
xgb_d_results, xgb_results, rf_d_results, rf_results, gb_d_results, gb_results, depth_rf_d, depth_rf = train_and_test(folder_name, num_folds, missing, parameters_xgboost, parameters_rf, parameters_gb)
# Printing stuff (accuracy, most frequent parameters, depth, std).
print_results(num_folds, xgb_d_results, xgb_results, rf_d_results, rf_results, gb_d_results, gb_results)
print_timing(num_folds)
print_depth(parameters_xgboost, parameters_gb, depth_rf_d, depth_rf)
print_mode(parameters_xgboost, parameters_rf, parameters_gb)
return
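# Example usage (hypothetical paths and missing-value marker; adjust to your own data before
# running). The dataset is expected to be a CSV file with the class label in the last column:
# compare_classifiers('datasets/pima', 'pima-indians-diabetes.data', 10, np.nan)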