In [ ]:
import numpy as np
from timeit import default_timer as timer

# Pre-splits the dataset into training and testing folds for all the classifiers to use.
def splitting_dataset(folder_name, dataset_name, num_folds):
    from numpy import genfromtxt
    from sklearn.model_selection import StratifiedKFold

    # Loads the dataset.
    dataset = genfromtxt(folder_name + '/' + dataset_name, delimiter=",")
    # Splits the dataset into the data and the labels.
    X = dataset[:, 0 : len(dataset[0]) - 1]
    y = dataset[:, len(dataset[0]) - 1]

    print("Splitting the whole dataset into training and testing folds...")

    # Creating folds with StratifiedKFold.
    skf = StratifiedKFold(n_splits = num_folds, random_state = None, shuffle = True)
    skf.get_n_splits(X, y)
    # Opening the dataset file for copying lines.
    f_ds = open(folder_name + "/" + dataset_name)
    # Creating a list from its lines.
    dataset_lines = []
    for line in f_ds:
        dataset_lines.append(line)

    ctr = 0 # Counter used for naming the fold files.

    # For each fold...
    for train_index, test_index in skf.split(X, y):
        train_name = folder_name + '/fold' + str(ctr) + '_train.data' # File for the training instances.
        test_name = folder_name + '/fold' + str(ctr) + '_test.data' # File for the testing instances.
        f_train = open(train_name, 'w')
        f_test = open(test_name, 'w')

        # Selecting the training and testing instances + labels.
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Printing the training data (+labels) into the train file.
        for i in train_index:
            f_train.write(dataset_lines[i])

        # Printing the testing data (+labels) into the test file.
        for i in test_index:
            f_test.write(dataset_lines[i])

        ctr += 1
        f_train.close()
        f_test.close()

        print("%.2f %%" % (ctr / num_folds * 100))

    f_ds.close()
    print("The whole dataset has been split into folds!") 
    print("\n")
    
    return
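
# Illustrative usage of splitting_dataset (the folder and file names below are placeholders, not
# part of the original notebook): assuming 'data/my_dataset.csv' is a comma-separated file whose
# last column holds the class label, the call below would create data/fold0_train.data,
# data/fold0_test.data, ..., data/fold9_train.data, data/fold9_test.data.
#
#   splitting_dataset('data', 'my_dataset.csv', num_folds = 10)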



# Performs the grid search for all of the classifiers, including the extended ones. 
def general_grid_search(folder_name, num_folds, missing):
    # Grid search and extension for XGBoost.
    parameters_xgboost = grid_search_xgboost(folder_name, num_folds, missing)
    # Grid search and extension for RF.
    parameters_rf = grid_search_rf(folder_name, num_folds, missing)
    # Grid search and extension for GB
    parameters_gb = grid_search_gb(folder_name, num_folds, missing)
    
    return parameters_xgboost, parameters_rf, parameters_gb



# Grid search for XGBoost. Also checks whether the grid search needs to be extended and,
# if so, performs the extension by calling the extended grid search function.
def grid_search_xgboost(folder_name, num_folds, missing):
    from numpy import genfromtxt
    from xgboost import XGBClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import Imputer
    
    # Initializes the lists of parameter values to test.
    #print("Initializing the parameters to test for XGBoost...") 
    learning_rate = [0.05, 0.1, 0.2]
    max_depth = [3, 5, 6, 8]
    subsample = [0.5, 0.8, 1]
    gamma = [0, 0.1, 0.2, 0.3]
    min_child_weight = [1, 3, 5]
    #print("The parameters to test for XGBoost have been initialized!\n")
    
    # Initializes the lists that will hold the best parameters for each fold; they are returned at the end.
    best_learning_rate = []
    best_max_depth = []
    best_subsample = []
    best_gamma = []
    best_min_child_weight = []
    
    imputer = Imputer(missing_values = missing)    
    xgb_model = XGBClassifier(n_estimators = 200)
    
    # For each fold, loads the training data and performs the grid search on it.
    for i in range(0, num_folds):
        # Loads the dataset (training fold).
        #print("Loading training dataset...") 
        dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
        #print("Training dataset was loaded in!")

        # Splits the dataset into the data and the labels.
        #print("Splitting the dataset into data and labels...")
        X = dataset[:, 0 : len(dataset[0]) - 1]
        Y = dataset[:, len(dataset[0]) - 1]
        X = imputer.fit_transform(X, Y)
        #print("The data and labels from the dataset have been split!")

        # Timed grid search.
        start = timer()
        param_grid = dict(learning_rate = learning_rate, max_depth = max_depth, 
                          subsample = subsample, gamma = gamma, min_child_weight = min_child_weight)
        #print("Starting the grid search...")
        kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
        grid_search = GridSearchCV(xgb_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
        grid_result = grid_search.fit(X, Y)
        #print("The grid search is over!")
        end = timer()
        time_grid_xgb.append(end - start)        

        # Summarizes results. 
        print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))

        best_learning_rate.append(grid_result.best_params_['learning_rate'])
        best_max_depth.append(grid_result.best_params_['max_depth'])
        best_subsample.append(grid_result.best_params_['subsample'])
        best_gamma.append(grid_result.best_params_['gamma'])
        best_min_child_weight.append(grid_result.best_params_['min_child_weight'])
        
        print("%.2f %%\n" % ((i + 1) / num_folds * 100))
        
    print("Done with the normal grid search for XGBoost!\n")
    
    # Grid extension.
    parameters_xgboost = grid_extension_xgboost(folder_name, num_folds, missing, best_learning_rate, 
                                                best_max_depth, best_subsample, best_gamma, best_min_child_weight)
    
    return parameters_xgboost



# Grid search for Random Forest. Also checks whether the grid search needs to be extended and,
# if so, performs the extension by calling the extended grid search function.
def grid_search_rf(folder_name, num_folds, missing):
    from numpy import genfromtxt
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import Imputer
    
    # Initializes the lists of parameter values to test.
    #print("Initializing the parameters to test for Random Forest...") 
    max_features = ["auto", "log2", None]
    min_samples_leaf = [1, 25, 50, 70]
    max_depth = [None, 5, 8, 10]
    min_samples_split = [2, 5, 8, 10]
    #print("The parameters to test for Random Forest have been initialized!\n")
    
    # Initializes the lists that will hold the best parameters for each fold; they are returned at the end.
    best_max_features = []
    best_min_samples_leaf = []
    best_max_depth = []
    best_min_samples_split = []
    
    imputer = Imputer(missing_values = missing)    
    rf_model = RandomForestClassifier(n_estimators = 200)    
    
    # For each fold, loads the training data and performs the grid search on it.
    for i in range(0, num_folds):
        # Loads the dataset (training fold).
        #print("Loading training dataset...") 
        dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
        #print("Training dataset was loaded in!")

        # Splits the dataset into the data and the labels.
        #print("Splitting the dataset into data and labels...")
        X = dataset[:, 0 : len(dataset[0]) - 1]
        Y = dataset[:, len(dataset[0]) - 1]
        X = imputer.fit_transform(X, Y)
        #print("The data and labels from the dataset have been split!")

        # Timed grid search.
        start = timer()
        param_grid = dict(max_features = max_features, min_samples_leaf = min_samples_leaf, 
                          max_depth = max_depth, min_samples_split = min_samples_split)
        #print("Starting the grid search...")
        kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
        grid_search = GridSearchCV(rf_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
        grid_result = grid_search.fit(X, Y)
        #print("The grid search is over!")
        end = timer()
        time_grid_rf.append(end - start)        

        # Summarize results 
        print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))

        best_max_features.append(grid_result.best_params_['max_features'])        
        best_min_samples_leaf.append(grid_result.best_params_['min_samples_leaf'])
        best_max_depth.append(grid_result.best_params_['max_depth'])
        best_min_samples_split.append(grid_result.best_params_['min_samples_split'])
                
        print("%.2f %%\n" % ((i + 1) / num_folds * 100))
        
    print("Done with the normal grid search for Random Forests!\n")
    
    # Grid extension.
    parameters_rf = grid_extension_rf(folder_name, num_folds, missing, best_max_features, best_min_samples_leaf,
                                                best_max_depth, best_min_samples_split)
    
    return parameters_rf

    
    
# Grid search for Gradient Boosting. Also checks whether the grid search needs to be extended and,
# if so, performs the extension by calling the extended grid search function.
def grid_search_gb(folder_name, num_folds, missing):
    from numpy import genfromtxt
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import Imputer
    
    # Initializes the lists of parameter values to test.
    #print("Initializing the parameters to test for Gradient Boosting...") 
    learning_rate = [0.05, 0.1, 0.2]
    max_depth = [3, 5, 6, 8]
    subsample = [0.5, 0.8, 1]
    max_features = ["auto", "log2", None]
    min_samples_split = [2, 5, 8, 10]
    #print("The parameters to test for Gradient Boosting have been initialized!\n")
    
    # Initializes the lists that will hold the best parameters for each fold; they are returned at the end.
    best_learning_rate = []
    best_max_depth = []
    best_subsample = []
    best_max_features = []
    best_min_samples_split = []
    
    imputer = Imputer(missing_values = missing)    
    gb_model = GradientBoostingClassifier(n_estimators = 200)    
    
    # For each fold, loads the training data and performs the grid search on it.
    for i in range(0, num_folds):
        # Loads the dataset (training fold).
        #print("Loading training dataset...") 
        dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
        #print("Training dataset was loaded in!")

        # Splits the dataset into the data and the labels.
        #print("Splitting the dataset into data and labels...")
        X = dataset[:, 0 : len(dataset[0]) - 1]
        Y = dataset[:, len(dataset[0]) - 1]
        X = imputer.fit_transform(X, Y)
        #print("The data and labels from the dataset have been split!")

        # Timed grid search.
        start = timer()
        param_grid = dict(learning_rate = learning_rate, max_depth = max_depth, subsample = subsample, 
                          max_features = max_features, min_samples_split = min_samples_split)
        #print("Starting the grid search...")
        kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
        grid_search = GridSearchCV(gb_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
        grid_result = grid_search.fit(X, Y)
        #print("The grid search is over!")
        end = timer()
        time_grid_gb.append(end - start)        

        # Summarize results 
        print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))

        best_learning_rate.append(grid_result.best_params_['learning_rate'])
        best_max_depth.append(grid_result.best_params_['max_depth'])
        best_subsample.append(grid_result.best_params_['subsample'])
        best_max_features.append(grid_result.best_params_['max_features'])  
        best_min_samples_split.append(grid_result.best_params_['min_samples_split'])
                
        print("%.2f %%\n" % ((i + 1) / num_folds * 100))
        
    print("Done with the normal grid search for Gradient Boosting!\n")
    
    # Grid extension.
    parameters_gb = grid_extension_gb(folder_name, num_folds, missing, best_learning_rate, best_max_depth,
                                      best_subsample, best_max_features, best_min_samples_split)
    
    return parameters_gb
    
    
    
# Checks whether the grid extension is needed for XGBoost and performs it if so.
# In either case, groups all the parameters into one 2D list to avoid passing too
# many separate parameters around.
def grid_extension_xgboost(folder_name, num_folds, missing, best_learning_rate, 
                             best_max_depth, best_subsample, best_gamma, 
                             best_min_child_weight):    
    from numpy import genfromtxt
    from xgboost import XGBClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import Imputer
    
    need = False # Boolean flag indicating whether the grid extension is needed.
    
    # Checks whether the grid extension is needed, i.e. whether a best value sits on a boundary
    # of the search grid for at least half of the folds.
    if (best_learning_rate.count(0.05) >= num_folds / 2 or best_learning_rate.count(0.2) >= num_folds / 2 
        or best_max_depth.count(3) >= num_folds / 2 or best_max_depth.count(8) >= num_folds / 2 
        or best_subsample.count(0.5) >= num_folds / 2 or best_subsample.count(1) >= num_folds / 2 
        or best_gamma.count(0) >= num_folds / 2 or best_gamma.count(0.3) >= num_folds / 2
        or best_min_child_weight.count(1) >= num_folds / 2 or best_min_child_weight.count(5) >= num_folds / 2):    
        need = True
   
    # If the extension is needed...
    if need:
        #print("\nStarting the grid expansion...")   

        imputer = Imputer(missing_values = missing)

        # For each fold, load the training dataset in and perform the grid search on it.
        for i in range(0, num_folds):
            # Load the dataset (training fold)
            #print("Loading training dataset...") 
            dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
            #print("Training dataset was loaded in!")

            # Split the dataset into the data and the labels
            #print("Splitting the dataset into data and labels...")
            X = dataset[:, 0 : len(dataset[0]) - 1]
            Y = dataset[:, len(dataset[0]) - 1]
            X = imputer.fit_transform(X, Y)
            #print("The data and labels from the dataset have been split!")

            # Initializes the model with the best parameters found for this fold.
            # Any parameter whose best value lies on the edge of the grid is added to the parameter dictionary below and re-searched.
            xgb_model = XGBClassifier(n_estimators = 200, learning_rate = best_learning_rate[i], max_depth = best_max_depth[i],
                                     subsample = best_subsample[i], gamma = best_gamma[i], 
                                      min_child_weight = best_min_child_weight[i])

            param_grid = dict()

            if best_learning_rate[i] == 0.05 or best_learning_rate[i] == 0.2:
                if best_learning_rate[i] == 0.05:
                    learning_rate = [0.01, 0.03, 0.05, 0.07]
                else:
                    learning_rate = [0.15, 0.2, 0.25, 0.3]
                param_grid['learning_rate'] = learning_rate
                bool_lr = True
            else:
                bool_lr = False

            if best_max_depth[i] == 3 or best_max_depth[i] == 8:
                if best_max_depth[i] == 3:
                    max_depth = [1, 2, 3, 4]
                else:
                    max_depth = [7, 8, 9, 10]
                param_grid['max_depth'] = max_depth
                bool_md = True
            else:
                bool_md = False

            if best_subsample[i] == 0.5 or best_subsample[i] == 1:
                if best_subsample[i] == 0.5:
                    subsample = [0.4, 0.5, 0.6]
                else:
                    subsample = [0.9, 0.95, 1]
                param_grid['subsample'] = subsample
                bool_s = True
            else:
                bool_s = False

            if best_gamma[i] == 0 or best_gamma[i] == 0.3:
                if best_gamma[i] == 0:
                    gamma = [0, 0.03, 0.05]
                else:
                    gamma = [0.25, 0.3, 0.4]
                param_grid['gamma'] = gamma
                bool_g = True
            else:
                bool_g = False

            if best_min_child_weight[i] == 1 or best_min_child_weight[i] == 5:
                if best_min_child_weight[i] == 1:
                    min_child_weight = [0, 1, 2]
                else:
                    min_child_weight = [5, 6, 7, 8] 
                param_grid['min_child_weight'] = min_child_weight
                bool_mcw = True
            else:
                bool_mcw = False

            # Timed extended grid search.
            start = timer()
            #print("Starting the (expended) grid search...")
            kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
            grid_search = GridSearchCV(xgb_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
            grid_result = grid_search.fit(X, Y)
            #print("The (expended) grid search is over!")
            end = timer()
            time_grid_xgb[i] += (end - start)

            # Summarizes results.
            print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))

            # Replaces the old parameters on the edge of the grid with the newly found values.
            if bool_lr:
                best_learning_rate[i] = grid_result.best_params_['learning_rate']
            if bool_md:
                best_max_depth[i] = grid_result.best_params_['max_depth']
            if bool_s:
                best_subsample[i] = grid_result.best_params_['subsample']
            if bool_g:
                best_gamma[i] = grid_result.best_params_['gamma']
            if bool_mcw:
                best_min_child_weight[i] = grid_result.best_params_['min_child_weight']

            print("%.2f %%\n" % ((i + 1) / num_folds * 100))

        print("Done with the grid expansion for XGBoost!\n")  
        
    parameters_xgboost = [best_learning_rate, best_max_depth, best_subsample, best_gamma, best_min_child_weight]
    
    return parameters_xgboost 
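
# Note on the returned structure: parameters_xgboost is a list of five per-fold lists, in the order
# [learning_rate, max_depth, subsample, gamma, min_child_weight], so parameters_xgboost[p][i] is the
# best value of hyper-parameter p on fold i. With 3 folds it could look like this
# (made-up values, for illustration only):
#
#   [[0.1, 0.05, 0.1], [5, 3, 6], [0.8, 1, 0.8], [0.1, 0, 0.2], [1, 3, 1]]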



# Checks whether the grid extension is needed for Random Forest and performs it if so.
# In either case, groups all the parameters into one 2D list to avoid passing too
# many separate parameters around.
def grid_extension_rf(folder_name, num_folds, missing, best_max_features, best_min_samples_leaf,
                                                best_max_depth, best_min_samples_split):    
    from numpy import genfromtxt
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import Imputer
    
    need = False # Boolean flag indicating whether the grid extension is needed.
    
    # Checks whether the grid extension is needed, i.e. whether a best value sits on a boundary
    # of the search grid for at least half of the folds (2 and 10 are the min_samples_split boundaries).
    if (best_min_samples_leaf.count(1) >= num_folds / 2 or best_min_samples_leaf.count(70) >= num_folds / 2 
        or best_max_depth.count(10) >= num_folds / 2 or best_min_samples_split.count(2) >= num_folds / 2 
        or best_min_samples_split.count(10) >= num_folds / 2):
        need = True
        
    # If the extension is needed...
    if need:
        #print("\nStarting the grid expansion...")   

        imputer = Imputer(missing_values = missing)
        # For each fold, loads the training data and performs the grid search on it.
        for i in range(0, num_folds):
            # Loads the dataset (training fold).
            #print("Loading training dataset...") 
            dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
            #print("Training dataset was loaded in!")

            # Splits the dataset into the data and the labels.
            #print("Splitting the dataset into data and labels...")
            X = dataset[:, 0 : len(dataset[0]) - 1]
            Y = dataset[:, len(dataset[0]) - 1]
            X = imputer.fit_transform(X, Y)
            #print("The data and labels from the dataset have been split!")

            # Initializes the model with the best parameters found for this fold.
            # Any parameter whose best value lies on the edge of the grid is added to the parameter dictionary below and re-searched.
            rf_model = RandomForestClassifier(n_estimators = 200, max_features = best_max_features[i],
                                              min_samples_leaf = best_min_samples_leaf[i], max_depth = best_max_depth[i],
                                             min_samples_split = best_min_samples_split[i])
    
            param_grid = dict()
        
            if best_min_samples_leaf[i] == 1 or best_min_samples_leaf[i] == 70:
                if best_min_samples_leaf[i] == 1:
                    min_samples_leaf = [1, 5, 10, 15]
                else:
                    min_samples_leaf = [60, 70, 80]
                param_grid['min_samples_leaf'] = min_samples_leaf
                bool_msl = True
            else:
                bool_msl = False
            
            if best_max_depth[i] == 10:
                max_depth = [9, 10, 15, 20]
                param_grid['max_depth'] = max_depth
                bool_md = True
            else:
                bool_md = False
                
            if best_min_samples_split[i] == 2 or best_min_samples_split[i] == 10:
                if best_min_samples_split[i] == 2:
                    min_samples_split = [2, 3, 4] # 2 is the smallest value scikit-learn allows.
                else:
                    min_samples_split = [9, 10, 11, 15]
                param_grid['min_samples_split'] = min_samples_split
                bool_mss = True
            else:
                bool_mss = False

            # Timed extended grid search.
            start = timer()
            #print("Starting the (expended) grid search...")
            kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
            grid_search = GridSearchCV(rf_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
            grid_result = grid_search.fit(X, Y)
            #print("The (expended) grid search is over!")
            end = timer()
            time_grid_rf[i] += (end - start)

            # Summarizes results.
            print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))

            # Replaces the old parameters on the edge of the grid with the newly found values.
            if bool_msl:
                best_min_samples_leaf[i] = grid_result.best_params_['min_samples_leaf']
            if bool_md:
                best_max_depth[i] = grid_result.best_params_['max_depth']
            if bool_mss:
                best_min_samples_split[i] = grid_result.best_params_['min_samples_split']

            print("%.2f %%\n" % ((i + 1) / num_folds * 100))

        print("Done with the grid expansion for Random Forests!\n")  
        
    parameters_rf = [best_max_features, best_min_samples_leaf, best_max_depth, best_min_samples_split] 
    
    return parameters_rf



# Checks whether the grid extension is needed for Gradient Boosting and performs it if so.
# In either case, groups all the parameters into one 2D list to avoid passing too
# many separate parameters around.
def grid_extension_gb(folder_name, num_folds, missing, best_learning_rate, 
                             best_max_depth, best_subsample, best_max_features, 
                             best_min_samples_split):    
    from numpy import genfromtxt
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import Imputer
    
    need = False # Boolean flag indicating whether the grid extension is needed.
    
    # Checks whether the grid extension is needed, i.e. whether a best value sits on a boundary
    # of the search grid for at least half of the folds (2 and 10 are the min_samples_split boundaries).
    if (best_learning_rate.count(0.05) >= num_folds / 2 or best_learning_rate.count(0.2) >= num_folds / 2 
        or best_max_depth.count(3) >= num_folds / 2 or best_max_depth.count(8) >= num_folds / 2 
        or best_subsample.count(0.5) >= num_folds / 2 or best_subsample.count(1) >= num_folds / 2 
        or best_min_samples_split.count(2) >= num_folds / 2 or best_min_samples_split.count(10) >= num_folds / 2):
        need = True
   
    # If the extension is needed...
    if need:
        #print("\nStarting the grid expansion...")   

        imputer = Imputer(missing_values = missing)

        # For each fold, load the training dataset in and perform the grid search on it.
        for i in range(0, num_folds):
            # Load the dataset (training fold)
            #print("Loading training dataset...") 
            dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
            #print("Training dataset was loaded in!")

            # Split the dataset into the data and the labels
            #print("Splitting the dataset into data and labels...")
            X = dataset[:, 0 : len(dataset[0]) - 1]
            Y = dataset[:, len(dataset[0]) - 1]
            X = imputer.fit_transform(X, Y)
            #print("The data and labels from the dataset have been split!")

            # Initializes the model with the best parameters found for this fold.
            # Any parameter whose best value lies on the edge of the grid is added to the parameter dictionary below and re-searched.
            gb_model = GradientBoostingClassifier(n_estimators = 200, learning_rate = best_learning_rate[i], 
                                                  max_depth = best_max_depth[i], subsample = best_subsample[i], 
                                                  max_features = best_max_features[i], min_samples_split = best_min_samples_split[i])

            param_grid = dict()

            if best_learning_rate[i] == 0.05 or best_learning_rate[i] == 0.2:
                if best_learning_rate[i] == 0.05:
                    learning_rate = [0.01, 0.03, 0.05, 0.07]
                else:
                    learning_rate = [0.15, 0.2, 0.25, 0.3]
                param_grid['learning_rate'] = learning_rate
                bool_lr = True
            else:
                bool_lr = False

            if best_max_depth[i] == 3 or best_max_depth[i] == 8:
                if best_max_depth[i] == 3:
                    max_depth = [1, 2, 3, 4]
                else:
                    max_depth = [7, 8, 9, 10]
                param_grid['max_depth'] = max_depth
                bool_md = True
            else:
                bool_md = False

            if best_subsample[i] == 0.5 or best_subsample[i] == 1:
                if best_subsample[i] == 0.5:
                    subsample = [0.4, 0.5, 0.6]
                else:
                    subsample = [0.9, 0.95, 1]
                param_grid['subsample'] = subsample
                bool_s = True
            else:
                bool_s = False

            if best_min_samples_split[i] == 2 or best_min_samples_split[i] == 10:
                if best_min_samples_split[i] == 2:
                    min_samples_split = [2, 3, 4] # 2 is the smallest value scikit-learn allows.
                else:
                    min_samples_split = [9, 10, 11, 15]
                param_grid['min_samples_split'] = min_samples_split
                bool_mss = True
            else:
                bool_mss = False

            # Timed extended grid search.
            start = timer()
            #print("Starting the (expended) grid search...")
            kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 7)
            grid_search = GridSearchCV(gb_model, param_grid, scoring = "neg_log_loss", n_jobs = -1, cv = kfold, verbose = 1)
            grid_result = grid_search.fit(X, Y)
            #print("The (expended) grid search is over!")
            end = timer()
            time_grid_gb[i] += (end - start)

            # Summarizes results.
            print("Best: %f using %s \n" % (grid_result.best_score_, grid_result.best_params_))

            # Replaces the old parameters on the edge of the grid with the newly found values.
            if bool_lr:
                best_learning_rate[i] = grid_result.best_params_['learning_rate']
            if bool_md:
                best_max_depth[i] = grid_result.best_params_['max_depth']
            if bool_s:
                best_subsample[i] = grid_result.best_params_['subsample']
            if bool_mss:
                best_min_samples_split[i] = grid_result.best_params_['min_samples_split']

            print("%.2f %%\n" % ((i + 1) / num_folds * 100))

        print("Done with the grid expansion for Gradient Boosting!\n")  
        
    parameters_gb = [best_learning_rate, best_max_depth, best_subsample, best_max_features, best_min_samples_split]
    
    return parameters_gb 



# Trains all the models with the final parameters and tests them.
def train_and_test(folder_name, num_folds, missing, parameters_xgboost, parameters_rf, parameters_gb):
    from numpy import genfromtxt
    from xgboost import XGBClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import Imputer
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import StratifiedKFold
    
    # Initializes the default models.
    print("Initializing all the default models...")
    xgb_d_model = XGBClassifier(n_estimators = 200)
    rf_d_model = RandomForestClassifier(n_estimators = 200)
    gb_d_model = GradientBoostingClassifier(n_estimators = 200)
    print("All the default models have been initialized!")
    
    # Will store the accuracy for each fold. An average will then be computed.
    xgb_d_results = []
    xgb_results = []
    rf_d_results = []
    rf_results = []
    gb_d_results = []
    gb_results = []
    
    imputer = Imputer(missing_values = missing)
    
    depth_rf_d = []
    depth_rf = []
    
    # For each fold, loads the training/testing dataset in and fits the models before testing.
    for i in range(0, num_folds):
        # Loads the dataset (training fold).
        #print("Loading the training set...")
        dataset = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_train.data', delimiter=",")
        #print("Training set was loaded in!")
        
        # Split the dataset into the data and the labels
        #print("Splitting the dataset into data and labels...")
        X = dataset[:, 0 : len(dataset[0]) - 1]
        Y = dataset[:, len(dataset[0]) - 1]        
        X = imputer.fit_transform(X, Y)
        #print("The data and labels from the dataset have been split!")

        print("Training all the default models over the fold with the best parameters...")
        start = timer()
        xgb_d_model.fit(X, Y)
        end = timer()
        time_fit_xgb_d.append(end - start) 
        
        start = timer()
        rf_d_model.fit(X, Y)
        end = timer()
        time_fit_rf_d.append(end - start)
        
        start = timer()
        gb_d_model.fit(X, Y)
        end = timer()
        time_fit_gb_d.append(end - start)
        print("All the default models have been trained!")
        
        print("Initializing all the tuned models and training them over the fold with the best parameters...")
        xgb_model = XGBClassifier(n_estimators = 200, learning_rate = parameters_xgboost[0][i], 
                                  max_depth = parameters_xgboost[1][i], subsample = parameters_xgboost[2][i],
                                  gamma = parameters_xgboost[3][i], min_child_weight = parameters_xgboost[4][i])
        
        rf_model = RandomForestClassifier(n_estimators = 200, max_features = parameters_rf[0][i], 
                                          min_samples_leaf = parameters_rf[1][i], max_depth = parameters_rf[2][i],
                                          min_samples_split = parameters_rf[3][i])
        
        gb_model = GradientBoostingClassifier(n_estimators = 200, learning_rate = parameters_gb[0][i], 
                                              max_depth = parameters_gb[1][i], subsample = parameters_gb[2][i],
                                              max_features = parameters_gb[3][i], min_samples_split = parameters_gb[4][i])
        
        start = timer()
        xgb_model.fit(X, Y)
        end = timer()
        time_fit_xgb.append(end - start)
        
        start = timer()
        rf_model.fit(X, Y)
        end = timer()
        time_fit_rf.append(end - start)
        
        start = timer()
        gb_model.fit(X, Y)
        end = timer()
        time_fit_gb.append(end - start)
        print("All the tuned models have been initialized and trained!")
        
        # Loads the dataset (testing fold).
        #print("Loading testing dataset...") 
        testing = genfromtxt(folder_name + '/' + 'fold' + str(i) + '_test.data', delimiter=",")
        #print("Testing dataset was loaded in!")

        # Splits the testing fold into the data and the labels.
        #print("Splitting the testing set into data and labels...")
        X_test = testing[:, 0 : len(testing[0]) - 1]
        Y_test = testing[:, len(testing[0]) - 1]
        X_test = imputer.transform(X_test) # Reuses the imputation statistics fitted on the training fold.
        #print("The data and labels from the testing set have been split!\n")
        
        # Makes predictions for test data.
        xgb_d_Y_pred = xgb_d_model.predict(X_test)
        xgb_d_predictions = [round(value) for value in xgb_d_Y_pred]
        xgb_Y_pred = xgb_model.predict(X_test)
        xgb_predictions = [round(value) for value in xgb_Y_pred]
        rf_d_Y_pred = rf_d_model.predict(X_test)
        rf_d_predictions = [round(value) for value in rf_d_Y_pred]
        rf_Y_pred = rf_model.predict(X_test)
        rf_predictions = [round(value) for value in rf_Y_pred]
        gb_d_Y_pred = gb_d_model.predict(X_test)
        gb_d_predictions = [round(value) for value in gb_d_Y_pred]
        gb_Y_pred = gb_model.predict(X_test)
        gb_predictions = [round(value) for value in gb_Y_pred]

        # Evaluates predictions.
        xgb_d_accuracy = accuracy_score(Y_test, xgb_d_predictions)
        xgb_accuracy = accuracy_score(Y_test, xgb_predictions)
        rf_d_accuracy = accuracy_score(Y_test, rf_d_predictions)
        rf_accuracy = accuracy_score(Y_test, rf_predictions)
        gb_d_accuracy = accuracy_score(Y_test, gb_d_predictions)
        gb_accuracy = accuracy_score(Y_test, gb_predictions)

        # Saves the predictions results.
        xgb_d_results.append(xgb_d_accuracy)
        xgb_results.append(xgb_accuracy)
        rf_d_results.append(rf_d_accuracy)
        rf_results.append(rf_accuracy)
        gb_d_results.append(gb_d_accuracy)
        gb_results.append(gb_accuracy)
        
        #print("Default XGBoost accuracy %.2f%%" % (xgb_d_accuracy * 100.0))
        #print("XGBoost accuracy: %.2f%%" % (xgb_accuracy * 100.0))
        #print("Default Random Forests accuracy: %.2f%%" % (rf_d_accuracy * 100.0))
        #print("Random Forests accuracy: %.2f%%" % (rf_accuracy * 100.0))
        #print("Default Gradient Boosting accuracy: %.2f%%" % (gb_d_accuracy * 100.0))
        #print("Gradient Boosting accuracy: %.2f%% \n" % (gb_accuracy * 100.0))

        print("%.2f %%\n" % ((i + 1) / num_folds * 100))
        
        # Saving the depth of the trees for random forests.  
        tmp = [estimator.tree_.max_depth for estimator in rf_d_model.estimators_]
        depth_rf_d.append(sum(tmp) / len(tmp)) 
        tmp = [estimator.tree_.max_depth for estimator in rf_model.estimators_]
        depth_rf.append(sum(tmp) / len(tmp))
     
    return xgb_d_results, xgb_results, rf_d_results, rf_results, gb_d_results, gb_results, depth_rf_d, depth_rf
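
# The eight returned lists each have one entry per fold, in this order: default XGBoost accuracy,
# tuned XGBoost accuracy, default RF accuracy, tuned RF accuracy, default GB accuracy, tuned GB
# accuracy, average tree depth of the default RF, average tree depth of the tuned RF.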



# Finds the most frequent parameters of a classifier out of its parameters array. 
def find_most_frequent_parameters(parameters):
    n_parameters = len(parameters) # Number of different parameters
    most_frequent_parameters = [] # 1D array containing the most frequent parameters.
    
    for i in range(0, n_parameters):
        for j in range(0, len(parameters[i])):
            if parameters[i][j] is None:
                parameters[i][j] = -1
        
        (values, counts) = np.unique(parameters[i], return_counts = True)
        ind = np.argmax(counts)
        most_frequent_parameters.append(values[ind]) # Stores the most frequent value for parameter i
        
        if most_frequent_parameters[i] == -1:
            most_frequent_parameters[i] = None
    
    return most_frequent_parameters
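
# Quick sanity check of find_most_frequent_parameters on made-up values (illustrative only, not
# real grid-search results). Each inner list holds one hyper-parameter's best value per fold;
# the mode of each list is returned, with None mapped to -1 internally so np.unique can handle it
# (and mapped back in the result).
#
#   find_most_frequent_parameters([[0.1, 0.1, 0.05], [3, 5, 3], [None, None, 8]])
#   # -> values equivalent to [0.1, 3, None]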


    
# Prints the final accuracy results.
def print_results(num_folds, xgb_d_results, xgb_results, rf_d_results, rf_results, gb_d_results, gb_results):
    print("\t\t XGB d.\t\t XGB t.\t\t RF d.\t\t RF t.\t\t GB d.\t\t GB t.")
    for i in range(0, num_folds):
        print("Fold ", i + 1, "\t %.2f%%" % (xgb_d_results[i] * 100.0), "\t %.2f%%" % (xgb_results[i] * 100.0), 
              "\t %.2f%%" % (rf_d_results[i] * 100), "\t %.2f%%" % (rf_results[i] * 100), 
              "\t %.2f%%" % (gb_d_results[i] * 100), "\t %.2f%%" % (gb_results[i] * 100))
    print("Average", "\t %.2f%%" % (sum(xgb_d_results) / float(len(xgb_d_results)) * 100.0), 
          "\t %.2f%%" % (sum(xgb_results) / float(len(xgb_results)) * 100.0), 
          "\t %.2f%%" % (sum(rf_d_results) / float(len(rf_d_results)) * 100.0), 
          "\t %.2f%%" % (sum(rf_results) / float(len(rf_results)) * 100.0), 
          "\t %.2f%%" % (sum(gb_d_results) / float(len(gb_d_results)) * 100.0), 
          "\t %.2f%%" % (sum(gb_results) / float(len(gb_results)) * 100.0))
    
    print("Std dev", "\t %.2f" % (np.std(xgb_d_results)), "\t\t %.2f" % (np.std(xgb_results)),
          "\t\t %.2f" % (np.std(rf_d_results)), "\t\t %.2f" % (np.std(rf_results)), "\t\t %.2f" % (np.std(gb_d_results)),
          "\t\t %.2f" % (np.std(gb_results))) 
    print("\n")
    print("\n")
    return



# Prints the time for finding the optimum parameters.
def print_timing(num_folds):
    print("\t\tXGB d.\t XGB t.\t\t RF d.\t RF t.\t\t GB d.\t GB t.")
    for i in range(0, num_folds):
        print("Fold ", i + 1, "\t %.2f" % time_fit_xgb_d[i], "\t %.2f" % time_grid_xgb[i], "+ %.2f" % time_fit_xgb[i], 
              "\t %.2f" % time_fit_rf_d[i], "\t %.2f" % time_grid_rf[i], "+ %.2f" % time_fit_rf[i],
              "\t %.2f" % time_fit_gb_d[i], "\t %.2f" % time_grid_gb[i], "+ %.2f" % time_fit_gb[i]) 
        
    print("Average", "\t %.2f" % (sum(time_fit_xgb_d) / float(len(time_fit_xgb_d))), 
          "\t %.2f" % (sum(time_grid_xgb) / float(len(time_grid_xgb))), "+ %.2f" % (sum(time_fit_xgb) / float(len(time_fit_xgb))),
          "\t %.2f" % (sum(time_fit_rf_d) / float(len(time_fit_rf_d))), 
          "\t %.2f" % (sum(time_grid_rf) / float(len(time_grid_rf))), "+ %.2f" % (sum(time_fit_rf) / float(len(time_fit_rf))),
          "\t %.2f" % (sum(time_fit_gb_d) / float(len(time_fit_gb_d))), 
          "\t %.2f" % (sum(time_grid_gb) / float(len(time_grid_gb))), "+ %.2f" % (sum(time_fit_gb) / float(len(time_fit_gb))))         
    print("\n")
    print("\n")
    return



# Prints the average depth of each model. 
def print_depth(parameters_xgboost, parameters_gb, depth_rf_d, depth_rf):
    print("Depth of default XGBoost: ", 3) # Default value.
    print("Depth of tuned XGBoost: ", sum(parameters_xgboost[1]) / len(parameters_xgboost[1])) # Average max_depth value.
    print("Depth of default Random Forest: ", sum(depth_rf_d) / len(depth_rf_d)) # Since default value is "None", mean of all the depths.
    print("Depth of tuned Random Forest: ", sum(depth_rf) / len(depth_rf)) # Average value between the "None" and the "fixed" depths.           
    print("Depth of default Gradient Boosting:", 3) # Default value.
    print("Depth of tuned Gradient Boosting: ", sum(parameters_gb[1]) / len(parameters_gb[1])) # Average max_depth value.
    print("\n")
    print("\n")
    return
    
    
    
# Prints the mode of the parameters.
def print_mode(parameters_xgboost, parameters_rf, parameters_gb):
    # XGBoost
    most_frequent_xgboost = find_most_frequent_parameters(parameters_xgboost)
    print("XGBoost most frequent parameters")
    print("Learning rate: ", most_frequent_xgboost[0])
    print("Depth: ", most_frequent_xgboost[1])
    print("Subsample: ", most_frequent_xgboost[2])
    print("Gamma: ", most_frequent_xgboost[3])
    print("Min child weight: ", most_frequent_xgboost[4])
    
    # Random Forests
    most_frequent_rf = find_most_frequent_parameters(parameters_rf) 
    print("\n")
    print("Random Forest most frequent parameters")
    print("Max features: ", most_frequent_rf[0])
    print("Min samples leaf: ", most_frequent_rf[1])
    print("Max depth: ", most_frequent_rf[2])
    print("Min samples split: ", most_frequent_rf[3])    
    
    # Gradient Boosting
    most_frequent_gb = find_most_frequent_parameters(parameters_gb)
    print("\n")
    print("Gradient Boosting most frequent parameters")
    print("Learning rate: ", most_frequent_gb[0])
    print("Depth: ", most_frequent_gb[1])
    print("Subsample: ", most_frequent_gb[2])
    print("Max features: ", most_frequent_gb[3])
    print("Min samples split: ", most_frequent_gb[4])
    
    return    
    
 
    
# Compares the three classifiers (XGBoost, Random Forest, Gradient Boosting) over the given dataset.
def compare_classifiers(folder_name, dataset_name, num_folds, missing):
    # Initializing all the variables for the time measurements.
    global time_grid_xgb, time_grid_rf, time_grid_gb
    global time_fit_xgb_d, time_fit_xgb
    global time_fit_rf_d, time_fit_rf
    global time_fit_gb_d, time_fit_gb
    time_grid_xgb = []
    time_grid_rf = []
    time_grid_gb = []
    time_fit_xgb_d = []
    time_fit_xgb = []
    time_fit_rf_d = []
    time_fit_rf = []
    time_fit_gb_d = []
    time_fit_gb = []
    
    # Splitting the dataset
    splitting_dataset(folder_name, dataset_name, num_folds)    
    # Learning the parameters over the basic grid for XGBoost, RF and GB
    parameters_xgboost, parameters_rf, parameters_gb = general_grid_search(folder_name, num_folds, missing)    
    # Training and testing with the best parameters of each fold for each classifier.
    xgb_d_results, xgb_results, rf_d_results, rf_results, gb_d_results, gb_results, depth_rf_d, depth_rf = train_and_test(folder_name, num_folds, missing, parameters_xgboost, parameters_rf, parameters_gb)
    # Printing stuff (accuracy, most frequent parameters, depth, std).
    print_results(num_folds, xgb_d_results, xgb_results, rf_d_results, rf_results, gb_d_results, gb_results)
    print_timing(num_folds)
    print_depth(parameters_xgboost, parameters_gb, depth_rf_d, depth_rf)
    print_mode(parameters_xgboost, parameters_rf, parameters_gb)

    return
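
# Example invocation (illustrative: 'data' and 'my_dataset.csv' are placeholder names, not part of
# the original notebook, and the value passed as 'missing' depends on how missing entries are
# encoded in the file). Flip RUN_EXAMPLE to True to run the full comparison.
RUN_EXAMPLE = False
if RUN_EXAMPLE:
    compare_classifiers('data', 'my_dataset.csv', num_folds = 10, missing = 0)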