In [1]:
import pandas as pd
import numpy as np
import operator
import itertools

import tensorflow as tf

from time import time
from scipy.stats import norm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

Import data


In [2]:
def load_data():
    full_data = pd.read_csv("Data/X.csv")
    train_y = pd.read_csv("Data/y_train.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    
    # Move ID column to the beginning
    id_column = full_data["id"]
    full_data.drop(labels=["id"], axis=1, inplace=True)
    full_data.insert(0, "id", id_column)
    
    # Add the target value column to the training part
    # in full_data
    split = 98000
    y_id_dict = train_y.set_index("id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].map(y_id_dict)
    
    # Split into training and testing data
    # (copies, so the later in-place scaling does not trigger SettingWithCopyWarning)
    train, test = full_data[:split].copy(), full_data[split:].copy()
    return train, test

train, test = load_data()

Set parameters


In [3]:
random_seed = 8

Preprocessing


In [4]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

# Standardising the data can make training faster and reduce
# the chance of getting stuck in local optima
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])

Instance pruning

To get rid of noisy instances, we prune the proportion of instances that are farthest from the predicted median in each bag, retrain the model, and repeat until the decrease in RMSE falls below a chosen threshold.

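As a quick illustration of a single pruning pass, the toy cell below (not part of the original pipeline; the frame contents and the 25% threshold are made up for demonstration) ranks each instance by its squared distance from its bag's median prediction and drops the top fraction. The full loop in the later cells also retrains the model after each pass.


In [ ]:
toy = pd.DataFrame({"id":    [0, 0, 0, 0, 1, 1, 1, 1],
                    "y_hat": [1.0, 1.1, 0.9, 5.0, 2.0, 2.1, 1.9, -3.0]})
outliers_threshold = 0.25  # prune the quarter of each bag farthest from its median

# Squared distance of each prediction from its bag's median
toy["y_med"] = toy["id"].map(toy.groupby("id")["y_hat"].median().to_dict())
toy["score"] = (toy["y_hat"] - toy["y_med"]) ** 2
# Relative rank (0 to 1) of each instance within its bag
toy["rank"] = (toy.groupby("id")["score"].rank()
               / toy["id"].map(toy.groupby("id")["score"].count().to_dict()))
# Keep only the instances below the pruning threshold
toy_pruned = toy[toy["rank"] <= 1 - outliers_threshold].reset_index(drop=True)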

In [5]:
# Weight each instance's prediction by the Gaussian pdf
# computed from its bag's empirical mean and standard deviation,
# then aggregate the weighted predictions per bag
def gaussian_weight(data):
    cols = list(data.columns)
    X = data.copy()
    
    # Empirical mean and standard deviation of the predictions in each bag
    y_mean_dict = X.groupby("id")["y_hat"].mean().to_dict()
    y_std_dict = X.groupby("id")["y_hat"].std().to_dict()
    X["y_hat_mean"] = X["id"].map(y_mean_dict)
    X["y_hat_std"] = X["id"].map(y_std_dict)
    # Gaussian pdf of each prediction under its bag's distribution,
    # normalised so that the weights sum to one within each bag
    X["pdf"] = norm.pdf(X["y_hat"], X["y_hat_mean"],
                        X["y_hat_std"])
    y_pdf_sum_dict = X.groupby("id")["pdf"].sum().to_dict()
    X["pdf_sum"] = X["id"].map(y_pdf_sum_dict)
    X["pdf"] /= X["pdf_sum"]
    X["y_hat_weighted"] = X["y_hat"] * X["pdf"]
    
    # Weighted aggregate prediction for each bag
    y_weighted_dict = X.groupby("id")["y_hat_weighted"].sum().to_dict()
    X["y_hat_weighted_sum"] = X["id"].map(y_weighted_dict)
    
    return X[cols + ["y_hat_weighted_sum"]]

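As a quick sanity check (illustration only; the demo frame below is made up), the weighted aggregate for a bag is pulled towards the dense cluster of instance predictions and away from outliers:


In [ ]:
# A single bag whose instance predictions cluster around 1 with one outlier at 5;
# the Gaussian-weighted aggregate sits closer to the cluster than the plain mean (2.0)
demo = pd.DataFrame({"id":    [0, 0, 0, 0],
                     "y_hat": [1.0, 1.1, 0.9, 5.0]})
gaussian_weight(demo)["y_hat_weighted_sum"].iloc[0]
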
In [7]:
cols_dnn = cols_orig

models_weights = {"dnn_1": 1}
models_cols = {"dnn_1": cols_dnn}

# DNNRegressor default learning rate
# for Adagrad optimiser
learning_rate = 0.1

# Only show tensorflow's errors
tf.logging.set_verbosity(tf.logging.ERROR)

# Scoring function for the hyperopt hyperparameter tuning.
def scoring_function(parameters):
    print("Training the model with parameters: ")
    print(parameters)
    average_RMSE = 0.0
    n_splits = 5
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    nb_fold = 0
    for train_index, validation_index in kf.split(train):
        nb_fold += 1
        train_fold, validation_fold = train.loc[train_index], train.loc[validation_index]
        
        # Columns used in DNNRegressor
        feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
        # Initialise the pruning iteration counter
        pruning_count = 0
        # Directory in which to save the model checkpoints
        model_dir = ("./log_"
                     + str(parameters["steps"]) + "_"
                     + str(parameters["nb_neurons_1"]) + "_"
                     + str(nb_fold) + "_"
                     + str(pruning_count)
                    )
        model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                  hidden_units=[parameters["nb_neurons_1"]],
                                                  optimizer=tf.train.ProximalAdagradOptimizer(
                                                      learning_rate=learning_rate,
                                                      l2_regularization_strength=parameters["l2_reg"]),
                                                  dropout=parameters["dropout"],
                                                  model_dir=model_dir)

        def input_fn(data_set):
            feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
            labels = tf.constant(data_set["y"].values)
            return feature_cols, labels
        
        model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])

        # Initialise train predictions
        train_pred = train_fold[["id"]].assign(y_hat=0)
        temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
        # .predict() returns an iterator; convert to an array
        y_hat = np.array(list(itertools.islice(temp, 0, None)))
        train_pred["y_hat"] = y_hat

        # Use median value by id
        y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

        RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train_fold["y"]))
        print("Pruning {0} RMSE: {1}".format(pruning_count, RMSE))
        
        # Iteratively prune outliers until the
        # decrease in the RMSE falls below a threshold
        RMSE_decreasing = True
        while RMSE_decreasing:
            pruning_count += 1
            
            train_pred["y_med"] = train_pred["id"].map(y_hat_med)
            # Distance from the median for each bag
            train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
            # Rank of each instance by bag
            train_pred["rank"] = train_pred.groupby("id")["score"].rank()
            bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
            train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
            # Divide rank by the number of instances in each bag
            # to obtain a relative rank from 0 to 1
            train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

            # Remove outliers: the instances ranked above 
            # (1 - parameters["outliers_threshold"])
            outliers_index = train_pred["rank"] > (1 - parameters["outliers_threshold"])
            train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)
            
            # Train new model on the pruned data
            model_dir = ("./log_"
                         + str(parameters["steps"]) + "_"
                         + str(parameters["nb_neurons_1"]) + "_"
                         + str(nb_fold) + "_"
                         + str(pruning_count)
                        )
            model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                      hidden_units=[parameters["nb_neurons_1"]],
                                                      optimizer=tf.train.ProximalAdagradOptimizer(
                                                          learning_rate=learning_rate,
                                                          l2_regularization_strength=parameters["l2_reg"]),
                                                      dropout=parameters["dropout"],
                                                      model_dir=model_dir)

            model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])
            
            # Compute new RMSE
            train_pred = train_fold[["id"]].assign(y_hat=0)
            temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            train_pred["y_hat"] = y_hat

            # Use median value by id
            y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

            new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med), train_fold["y"]))
            print("Pruning {0} RMSE: {1}".format(pruning_count, new_RMSE))
            
            if ((RMSE - new_RMSE) > parameters["gain_threshold"]):
                RMSE = new_RMSE
            else:
                RMSE_decreasing = False
        
        models = {"dnn_1": model_dnn}
        
        # Compute RMSE on validation set
        validation_pred = validation_fold[["id", "y"]].assign(y_hat=0)
        for i, m in models.items():
            temp = m.predict(input_fn=lambda: input_fn(validation_fold))
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            validation_pred["y_hat"] += models_weights[i] * y_hat

        validation_pred = gaussian_weight(validation_pred)
        RMSE = np.sqrt(mean_squared_error(validation_pred["y_hat_weighted_sum"], validation_pred["y"]))
        
        average_RMSE += RMSE
        print("Validation fold {0} RMSE: {1}".format(nb_fold, RMSE))

    average_RMSE /= n_splits

    print("Cross-validation score: {0}\n".format(average_RMSE))
    
    return {"loss": average_RMSE, "status": STATUS_OK}

In [ ]:
t0 = time()

# Grid to pick parameters from.
parameters_grid = {"steps"             : hp.choice("steps", np.arange(1000, 2000, 100, dtype=int)),
                   "nb_neurons_1"      : hp.choice("nb_neurons_1", np.arange(8, 12, 1, dtype=int)),
                   "outliers_threshold": hp.quniform("outliers_threshold", 0.05, 0.051, 0.01),
                   "gain_threshold"    : hp.quniform("gain_threshold", 0.005, 0.0051, 0.001),
                   "dropout": hp.quniform("dropout", 0.2, 0.4, 0.1),
                   "l2_reg": hp.quniform("l2_reg", 0.001, 0.01, 0.001)
                  }
# Record the information about the cross-validation.
trials = Trials()

best = fmin(scoring_function, parameters_grid, algo=tpe.suggest, max_evals=1, 
            trials=trials)

computing_time = time() - t0

In [ ]:
# Save the best parameters as a csv.
best_parameters = pd.DataFrame({key: [value] for key, value in
                                space_eval(parameters_grid, best).items()})
# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("Parameters/best_parameters_6.csv", encoding="utf-8", index=False)

best_parameters

Training models


In [9]:
cols_dnn = cols_orig
models_weights = {"dnn_1": 1.0}
models_cols = {"dnn_1": cols_dnn}
best_parameters = pd.read_csv("Parameters/best_parameters_6.csv", encoding="utf-8")

feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]

# Initialise the pruning iteration counter
pruning_count = 0
model_dir = "./log_submit_6_" + str(pruning_count)
model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                          hidden_units=[best_parameters["nb_neurons_1"][0]],
                                          model_dir=model_dir)

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels

model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])

# Initialise train predictions
train_pred = train[["id"]].assign(y_hat=0)
temp = model_dnn.predict(input_fn=lambda: input_fn(train))
y_hat = np.array(list(itertools.islice(temp, 0, None)))
train_pred["y_hat"] = y_hat

# Use median value by id
y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train["y"]))
print("Pruning {0} RMSE: {1}".format(pruning_count, RMSE))
# Prune outliers
RMSE_decreasing = True
while RMSE_decreasing:
    pruning_count += 1
    
    train_pred["y_med"] = train_pred["id"].map(y_hat_med)
    # Distance from the median for each bag
    train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
    # Rank of each instance by bag
    train_pred["rank"] = train_pred.groupby("id")["score"].rank()
    bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
    train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
    train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

    # Remove outliers
    outliers_index = train_pred["rank"] > (1 - best_parameters["outliers_threshold"][0])
    train = train.loc[~outliers_index, :].reset_index(drop=True)
    
    model_dir = "./log_submit_6_" + str(pruning_count)
    model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                              hidden_units=[best_parameters["nb_neurons_1"][0]],
                                              model_dir=model_dir)

    model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])

    # Compute new RMSE
    train_pred = train[["id"]].assign(y_hat=0)
    temp = model_dnn.predict(input_fn=lambda: input_fn(train))
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat

    # Use median value by id
    y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

    new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med), train["y"]))
    print("Pruning {0} RMSE: {1}".format(pruning_count, new_RMSE))

    if ((RMSE - new_RMSE) > best_parameters["gain_threshold"][0]):
        RMSE = new_RMSE
    else:
        RMSE_decreasing = False

models = {"dnn_1": model_dnn}


Pruning 0 RMSE: 0.682746000532
Pruning 1 RMSE: 0.674742291619
Pruning 2 RMSE: 0.677821951825

Predicting on test set


In [10]:
test_pred = test[["id"]].assign(y_hat=0).reset_index(drop=True)
for i, m in models.items():
    temp = m.predict(input_fn=lambda: input_fn(test))
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    test_pred["y_hat"] += models_weights[i] * y_hat

# Use gaussian weights
test_pred = gaussian_weight(test_pred)
y_hat_by_bag = test_pred.groupby("id").median()["y_hat_weighted_sum"]

In [11]:
kaggle_pred = pd.DataFrame({"Id": y_hat_by_bag.index, "y": y_hat_by_bag.values})
kaggle_pred.to_csv("Predictions/Prediction_6.csv", encoding="utf-8", index=False)
