In [1]:
import pandas as pd
import numpy as np
import operator
import itertools
import tensorflow as tf
from time import time
from scipy.stats import norm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval
In [2]:
def load_data():
    full_data = pd.read_csv("Data/X.csv")
    train_y = pd.read_csv("Data/y_train.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    # Move ID column to the beginning
    id_column = full_data["id"]
    full_data.drop(labels=["id"], axis=1, inplace=True)
    full_data.insert(0, "id", id_column)
    # Add the target value column to the training part
    # in full_data
    split = 98000
    y_id_dict = train_y.set_index("id")["y"].to_dict()
    full_data.loc[:(split - 1), "y"] = full_data.loc[:(split - 1), "id"].map(y_id_dict)
    # Split into training and testing data
    train, test = full_data[:split], full_data[split:]
    return (train, test)
train, test = load_data()
In [3]:
random_seed = 8
In [4]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]
# Standardising the data can make training faster and reduce
# the chance of getting stuck in local optima
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])
To get rid of noisy instances, we prune, in each bag, the proportion of instances that lie farthest from the computed median, and repeat until the decrease in the RMSE falls below a chosen threshold.
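As a minimal sketch of this pruning rule on a single toy bag (hypothetical values, reusing the pandas import from the first cell; column names mirror those used in the training code further down): instances are ranked by their squared distance to the bag median, the rank is normalised to a relative rank in (0, 1], and the top fraction of each bag is dropped.

# Toy bag of predictions (hypothetical values)
toy = pd.DataFrame({"id": [0, 0, 0, 0], "y_hat": [1.0, 1.1, 0.9, 5.0]})
outliers_threshold = 0.25  # prune roughly the top 25% of each bag

y_med = toy.groupby("id")["y_hat"].transform("median")
toy["score"] = (toy["y_hat"] - y_med) ** 2                    # squared distance to the bag median
toy["rank"] = toy.groupby("id")["score"].rank()               # rank within the bag
toy["rank"] /= toy.groupby("id")["score"].transform("count")  # relative rank in (0, 1]
pruned = toy[toy["rank"] <= (1 - outliers_threshold)]         # keeps y_hat = 1.0, 1.1, 0.9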
In [5]:
# Weight each instance by the gaussian pdf
# obtained with the empirical mean and variance
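# In each bag i, instance j with prediction y_hat_ij gets the weight
#   w_ij = pdf(y_hat_ij; mean_i, std_i) / sum_k pdf(y_hat_ik; mean_i, std_i),
# where mean_i and std_i are the bag's empirical mean and standard deviation,
# and the bag-level prediction is sum_j w_ij * y_hat_ij.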
def gaussian_weight(data):
    cols = list(data.columns)
    X = data.copy()
    y_mean_dict = X.groupby("id")["y_hat"].mean().to_dict()
    y_std_dict = X.groupby("id")["y_hat"].std().to_dict()
    X["y_hat_mean"] = X["id"].map(y_mean_dict)
    X["y_hat_std"] = X["id"].map(y_std_dict)
    X["pdf"] = norm.pdf(X["y_hat"], X["y_hat_mean"], X["y_hat_std"])
    y_pdf_sum_dict = X.groupby("id")["pdf"].sum().to_dict()
    X["pdf_sum"] = X["id"].map(y_pdf_sum_dict)
    X["pdf"] /= X["pdf_sum"]
    X["y_hat_weighted"] = X["y_hat"] * X["pdf"]
    y_weighted_dict = X.groupby("id")["y_hat_weighted"].sum().to_dict()
    X["y_hat_weighted_sum"] = X["id"].map(y_weighted_dict)
    return X[cols + ["y_hat_weighted_sum"]]
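As a quick sanity check (hypothetical values, not part of the pipeline), gaussian_weight can be applied to a single toy bag: the outlying prediction gets a small weight, so the bag-level prediction is pulled towards the two nearby predictions compared with the plain mean.

# Toy bag (hypothetical values)
toy_pred = pd.DataFrame({"id": [7, 7, 7], "y_hat": [0.9, 1.1, 3.0]})
gaussian_weight(toy_pred)
# y_hat_weighted_sum is roughly 1.47 for every row of the bag,
# below the plain mean of y_hat (roughly 1.67)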
In [7]:
cols_dnn = cols_orig
models_weights = {"dnn_1": 1}
models_cols = {"dnn_1": cols_dnn}
# DNNRegressor default learning rate
# for Adagrad optimiser
learning_rate = 0.1
# Only show tensorflow's errors
tf.logging.set_verbosity(tf.logging.ERROR)
# Scoring function used in the hyperopt hyperparameter tuning.
def scoring_function(parameters):
    print("Training the model with parameters: ")
    print(parameters)
    average_RMSE = 0.0
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    nb_fold = 0
    for train_index, validation_index in kf.split(train):
        nb_fold += 1
        train_fold, validation_fold = train.loc[train_index], train.loc[validation_index]
        # Columns used in DNNRegressor
        feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        # Initialise the number of pruning iterations
        pruning_count = 0
        # Directory in which to save the training phase
        model_dir = ("./log_"
                     + str(parameters["steps"]) + "_"
                     + str(parameters["nb_neurons_1"]) + "_"
                     + str(nb_fold) + "_"
                     + str(pruning_count))
        model_dnn = tf.contrib.learn.DNNRegressor(
            feature_columns=feature_cols,
            hidden_units=[parameters["nb_neurons_1"]],
            optimizer=tf.train.ProximalAdagradOptimizer(
                learning_rate=learning_rate,
                l2_regularization_strength=parameters["l2_reg"]),
            dropout=parameters["dropout"],
            model_dir=model_dir)

        def input_fn(data_set):
            feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
            labels = tf.constant(data_set["y"].values)
            return feature_cols, labels

        model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])
        # Initialise train predictions
        train_pred = train_fold[["id"]].assign(y_hat=0)
        temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
        # .predict() returns an iterator; convert it to an array
        y_hat = np.array(list(itertools.islice(temp, 0, None)))
        train_pred["y_hat"] = y_hat
        # Use the median value by id
        y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()
        RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values,
                                          train_fold["y"]))
        print("Pruning {0} RMSE: {1}".format(pruning_count, RMSE))
        # Recursively prune outliers until the
        # decrease in the RMSE is below a threshold
        RMSE_decreasing = True
        while RMSE_decreasing:
            pruning_count += 1
            train_pred["y_med"] = train_pred["id"].map(y_hat_med)
            # Distance from the median within each bag
            train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
            # Rank of each instance within its bag
            train_pred["rank"] = train_pred.groupby("id")["score"].rank()
            bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
            train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
            # Divide the rank by the number of instances in each bag
            # to obtain a relative rank from 0 to 1
            train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]
            # Remove outliers: the instances ranked above
            # (1 - parameters["outliers_threshold"])
            outliers_index = train_pred["rank"] > (1 - parameters["outliers_threshold"])
            train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)
            # Train a new model on the pruned data
            model_dir = ("./log_"
                         + str(parameters["steps"]) + "_"
                         + str(parameters["nb_neurons_1"]) + "_"
                         + str(nb_fold) + "_"
                         + str(pruning_count))
            model_dnn = tf.contrib.learn.DNNRegressor(
                feature_columns=feature_cols,
                hidden_units=[parameters["nb_neurons_1"]],
                optimizer=tf.train.ProximalAdagradOptimizer(
                    learning_rate=learning_rate,
                    l2_regularization_strength=parameters["l2_reg"]),
                dropout=parameters["dropout"],
                model_dir=model_dir)
            model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])
            # Compute the new RMSE
            train_pred = train_fold[["id"]].assign(y_hat=0)
            temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            train_pred["y_hat"] = y_hat
            # Use the median value by id
            y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()
            new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med),
                                                  train_fold["y"]))
            print("Pruning {0} RMSE: {1}".format(pruning_count, new_RMSE))
            if (RMSE - new_RMSE) > parameters["gain_threshold"]:
                RMSE = new_RMSE
            else:
                RMSE_decreasing = False
        models = {"dnn_1": model_dnn}
        # Compute the RMSE on the validation set
        validation_pred = validation_fold[["id", "y"]].assign(y_hat=0)
        for i, m in models.items():
            temp = m.predict(input_fn=lambda: input_fn(validation_fold))
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            validation_pred["y_hat"] += models_weights[i] * y_hat
        validation_pred = gaussian_weight(validation_pred)
        RMSE = np.sqrt(mean_squared_error(validation_pred["y_hat_weighted_sum"],
                                          validation_pred["y"]))
        average_RMSE += RMSE
        print("Validation fold {0} RMSE: {1}".format(nb_fold, RMSE))
    average_RMSE /= n_splits
    print("Cross-validation score: {0}\n".format(average_RMSE))
    return {"loss": average_RMSE, "status": STATUS_OK}
In [ ]:
t0 = time()
# Grid to pick parameters from.
parameters_grid = {"steps" : hp.choice("steps", np.arange(1000, 2000, 100, dtype=int)),
"nb_neurons_1" : hp.choice("nb_neurons_1", np.arange(8, 12, 1, dtype=int)),
"outliers_threshold": hp.quniform("outliers_threshold", 0.05, 0.051, 0.01),
"gain_threshold" : hp.quniform("gain_threshold", 0.005, 0.0051, 0.001),
"dropout": hp.quniform("dropout", 0.2, 0.4, 0.1),
"l2_reg": hp.quniform("l2_reg", 0.001, 0.01, 0.001)
}
# Record the information about the cross-validation.
trials = Trials()
best = fmin(scoring_function, parameters_grid, algo=tpe.suggest, max_evals=1,
trials=trials)
computing_time = time() - t0
In [ ]:
# Save the best parameters as a csv.
best_parameters = pd.DataFrame({key: [value] for (key, value) in
                                space_eval(parameters_grid, best).items()})
# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("Parameters/best_parameters_6.csv", encoding="utf-8", index=False)
best_parameters
In [9]:
cols_dnn = cols_orig
models_weights = {"dnn_1": 1.0}
models_cols = {"dnn_1": cols_dnn}
best_parameters = pd.read_csv("Parameters/best_parameters_6.csv", encoding="utf-8")
feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
# Initialise the number of pruning iterations
pruning_count = 0
model_dir = "./log_submit_6_" + str(pruning_count)
model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                          hidden_units=[best_parameters["nb_neurons_1"][0]],
                                          model_dir=model_dir)

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels
model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])
# Initialise train predictions
train_pred = train[["id"]].assign(y_hat=0)
temp = model_dnn.predict(input_fn=lambda: input_fn(train))
y_hat = np.array(list(itertools.islice(temp, 0, None)))
train_pred["y_hat"] = y_hat
# Use median value by id
y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()
RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train["y"]))
print("Pruning {0} RMSE: {1}".format(pruning_count, RMSE))
# Prune outliers
RMSE_decreasing = True
while RMSE_decreasing:
    pruning_count += 1
    train_pred["y_med"] = train_pred["id"].map(y_hat_med)
    # Distance from the median for each bag
    train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
    # Rank of each instance by bag
    train_pred["rank"] = train_pred.groupby("id")["score"].rank()
    bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
    train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
    train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]
    # Remove outliers
    outliers_index = train_pred["rank"] > (1 - best_parameters["outliers_threshold"][0])
    train = train.loc[~outliers_index, :].reset_index(drop=True)
    model_dir = "./log_submit_6_" + str(pruning_count)
    model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                              hidden_units=[best_parameters["nb_neurons_1"][0]],
                                              model_dir=model_dir)
    model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])
    # Compute the new RMSE
    train_pred = train[["id"]].assign(y_hat=0)
    temp = model_dnn.predict(input_fn=lambda: input_fn(train))
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat
    # Use the median value by id
    y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()
    new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med), train["y"]))
    print("Pruning {0} RMSE: {1}".format(pruning_count, new_RMSE))
    if (RMSE - new_RMSE) > best_parameters["gain_threshold"][0]:
        RMSE = new_RMSE
    else:
        RMSE_decreasing = False
models = {"dnn_1": model_dnn}
In [10]:
test_pred = test[["id"]].assign(y_hat=0).reset_index(drop=True)
for i, m in models.items():
    temp = m.predict(input_fn=lambda: input_fn(test))
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    test_pred["y_hat"] += models_weights[i] * y_hat
# Use gaussian weights
test_pred = gaussian_weight(test_pred)
y_hat_by_bag = test_pred.groupby("id").median()["y_hat_weighted_sum"]
In [11]:
kaggle_pred = pd.DataFrame({"Id": y_hat_by_bag.index, "y": y_hat_by_bag.values})
kaggle_pred.to_csv("Predictions/Prediction_6.csv", encoding="utf-8", index=False)
In [ ]: