https://kelvins.esa.int/mars-express-power-challenge/
The Mars Express Power Challenge focuses on the difficult problem of predicting thermal power consumption. Three full Martian years of Mars Express telemetry are made available, and you are challenged to predict the thermal subsystem's power consumption over the following Martian year.
Luís F. Simões
(Kelvins username: luis)
{ LinkedIn, ResearchGate, twitter, github }
2016-07-31
In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')
// https://github.com/kmahelona/ipython_notebook_goodies
In [2]:
import numpy as np
In [3]:
import matplotlib.pylab as plt
%matplotlib inline
plt.rcParams['savefig.dpi'] = 100
import seaborn as sns
In [4]:
import pandas as pd
In [5]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
In [6]:
from tqdm import tqdm, trange
import sys
In [7]:
from echo_state_networks import *
In [8]:
# https://github.com/rasbt/watermark
%load_ext watermark
%watermark -v -m -p numpy,scipy,matplotlib,seaborn,pandas,scikit-learn,tqdm
The credit for most of the code in this section goes to Alexander Bauer, who very kindly released a baseline script for the competition on GitHub.
Specifically, it includes the code from his rf_baseline.py file.
The sections Adding DMOP files and Adding FTL files (and all blunders they may contain), however, were added by me.
In [9]:
path_to_data = 'data'
In [10]:
# Function to convert the UTC timestamp to datetime
def convert_time(df):
    df['ut_ms'] = pd.to_datetime(df['ut_ms'], unit='ms')
    return df
In [11]:
# Function to resample the dataframe to hourly mean
def resample_1H(df):
    df = df.set_index('ut_ms')
    df = df.resample('1H').mean()
    return df
In [12]:
# Function to read a csv file and resample it to hourly means
def parse_ts(filename, dropna=True):
    df = pd.read_csv(path_to_data + '/' + filename)
    df = convert_time(df)
    df = resample_1H(df)
    if dropna:
        df = df.dropna()
    return df
In [13]:
# Function to read the ltdata files
def parse_ltdata(filename):
    df = pd.read_csv(path_to_data + '/' + filename)
    df = convert_time(df)
    df = df.set_index('ut_ms')
    return df
In [14]:
# Load the power files
pow_train1 = parse_ts('/train_set/power--2008-08-22_2010-07-10.csv')
pow_train2 = parse_ts('/train_set/power--2010-07-10_2012-05-27.csv')
pow_train3 = parse_ts('/train_set/power--2012-05-27_2014-04-14.csv')
# Load the test sample submission file as template for prediction
pow_test = parse_ts('power-prediction-sample-2014-04-14_2016-03-01.csv', False)
# Concatenate the files
power_all = pd.concat([pow_train1, pow_train2, pow_train3, pow_test])
In [15]:
# Same for the saaf files
saaf_train1 = parse_ts('/train_set/context--2008-08-22_2010-07-10--saaf.csv')
saaf_train2 = parse_ts('/train_set/context--2010-07-10_2012-05-27--saaf.csv')
saaf_train3 = parse_ts('/train_set/context--2012-05-27_2014-04-14--saaf.csv')
saaf_test = parse_ts('/test_set/context--2014-04-14_2016-03-01--saaf.csv')
saaf_all = pd.concat([saaf_train1, saaf_train2, saaf_train3, saaf_test])
In [16]:
# Load the ltdata files
ltdata_train1 = parse_ltdata('/train_set/context--2008-08-22_2010-07-10--ltdata.csv')
ltdata_train2 = parse_ltdata('/train_set/context--2010-07-10_2012-05-27--ltdata.csv')
ltdata_train3 = parse_ltdata('/train_set/context--2012-05-27_2014-04-14--ltdata.csv')
ltdata_test = parse_ltdata('/test_set/context--2014-04-14_2016-03-01--ltdata.csv')
ltdata_all = pd.concat([ltdata_train1, ltdata_train2, ltdata_train3, ltdata_test])
In [17]:
# Extract the columns that need to be predicted
power_cols = list(power_all.columns)
print(' '.join(power_cols))
In [18]:
# Now let's join everything together
df = power_all
In [19]:
# Make sure that saaf has the same sampling as the power, fill gaps with nearest value
saaf_all = saaf_all.reindex(df.index, method='nearest')
ltdata_all = ltdata_all.reindex(df.index, method='nearest')
df = df.join(saaf_all)
df = df.join(ltdata_all)
In [20]:
saaf_all.shape, ltdata_all.shape
Out[20]:
In [21]:
df.shape
Out[21]:
In [22]:
#df
In [23]:
def parse_dmop(filename):
    df = pd.read_csv(path_to_data + '/' + filename)
    df = convert_time(df)
    subsystems = sorted({s[1:4] for s in df['subsystem'] if s[0] == 'A'})
    df_expansion = [
        [when] + [(1 if cmd[1:4] == syst else 0) for syst in subsystems]
        for (when, cmd) in df.values
        if cmd[0] == 'A']
    df = pd.DataFrame(df_expansion, columns=['ut_ms'] + subsystems)
    df = df.set_index('ut_ms')
    # get one row per hour, containing the boolean values indicating whether each
    # subsystem was activated in that hour;
    # hours not represented in the file have all columns set to 0.
    # df = df.resample('1H').max().fillna(0)
    # cells represent the number of times instructions were issued to that subsystem in that hour
    df = df.resample('1H').sum().fillna(0)
    return df
In [24]:
# Load the dmop files
dmop_train1 = parse_dmop('/train_set/context--2008-08-22_2010-07-10--dmop.csv')
dmop_train2 = parse_dmop('/train_set/context--2010-07-10_2012-05-27--dmop.csv')
dmop_train3 = parse_dmop('/train_set/context--2012-05-27_2014-04-14--dmop.csv')
dmop_test = parse_dmop('/test_set/context--2014-04-14_2016-03-01--dmop.csv')
dmop_all = pd.concat([dmop_train1, dmop_train2, dmop_train3, dmop_test]).fillna(0)
In [25]:
#dmop_all
In [26]:
dmop_all.describe()
Out[26]:
In [27]:
dmop_all = dmop_all.reindex(df.index).fillna(0)
df = df.join(dmop_all)
In [28]:
dmop_all.shape
Out[28]:
In [29]:
df.shape
Out[29]:
In [30]:
from pandas import Timestamp

def parse_ftl(filename):
    df = pd.read_csv(path_to_data + '/' + filename)
    df['utb_ms'] = pd.to_datetime(df['utb_ms'], unit='ms')
    df['ute_ms'] = pd.to_datetime(df['ute_ms'], unit='ms')
    return df

def parse_ftl_all(filenames, hour_indices):
    ftl_all = pd.concat([parse_ftl(f) for f in filenames])
    types = sorted(set(ftl_all['type']))
    ftl_df = pd.DataFrame(index=hour_indices, columns=['flagcomms'] + types).fillna(0)
    # hour indices of discarded events, due to non-occurrence in `hour_indices`
    ix_err = []
    for (t_start, t_end, p_type, comms) in ftl_all.values:
        floor_beg = Timestamp(t_start).floor('1h')
        floor_end = Timestamp(t_end).floor('1h')
        # assign with .loc[row, col] (rather than chained .loc[row][col]) so the
        # writes reliably reach `ftl_df`; the explicit membership test replaces
        # the KeyError that chained indexing raised, since .loc assignment on a
        # missing label would silently add new rows instead of failing
        if floor_beg in ftl_df.index and floor_end in ftl_df.index:
            ftl_df.loc[floor_beg, 'flagcomms'] = ftl_df.loc[floor_end, 'flagcomms'] = int(comms)
            ftl_df.loc[floor_beg, p_type] = ftl_df.loc[floor_end, p_type] = 1
        else:
            ix_err.append((floor_beg, floor_end))
    print('Warning: discarded %d FTL events' % len(ix_err))
    return ftl_all, ftl_df
In [31]:
ftl_fnames = [
'/train_set/context--2008-08-22_2010-07-10--ftl.csv',
'/train_set/context--2010-07-10_2012-05-27--ftl.csv',
'/train_set/context--2012-05-27_2014-04-14--ftl.csv',
'/test_set/context--2014-04-14_2016-03-01--ftl.csv',
]
%time ftl_all, ftl_df = parse_ftl_all(filenames=ftl_fnames, hour_indices=df.index)
In [32]:
# count number of events per type
#ftl_df.sum(axis=0).sort_values()
# determine which event types occur more than 500 times
min_occ = (ftl_df.sum(axis=0) > 500)
# get their column names
min_occ_cols = min_occ.index[min_occ.values]
# prune the rarely occurring event types out of `ftl_df`
ftl_df_sel = ftl_df[min_occ_cols]
In [33]:
#ftl_df_sel
In [34]:
ftl_df_sel.describe()
Out[34]:
In [35]:
df = df.join(ftl_df_sel)
In [36]:
ftl_df.shape, ftl_df_sel.shape
Out[36]:
In [37]:
df.shape
Out[37]:
In [38]:
#df
Total count of the number of times each pointing type occurs in the data
In [39]:
ftl_df.sum(axis=0).sort_values()
Out[39]:
Inspecting the durations of pointing events
In [40]:
t_span = ftl_all['ute_ms'] - ftl_all['utb_ms']
t_span.describe()
Out[40]:
In [41]:
t_span_secs = (t_span.values * 1e-9).astype(np.int64)
In [42]:
# number of events lasting more than 2 hours
(t_span_secs > 2*60*60).sum()
Out[42]:
In [43]:
# fraction of all pointing events lasting more than 2 hours
_ / len(t_span_secs)
Out[43]:
~10% of the pointing events last more than 2h. The longest event lasts 32 days.
Now we formulate the prediction problem X -> Y.
Y is the matrix that we want to predict; X is everything else.
In [44]:
Y = df[power_cols]
X = df.drop(power_cols, axis=1)
In [45]:
#X
In [46]:
#Y
In [47]:
#X.describe()
In [48]:
#Y.describe()
In [49]:
# Defining the evaluation metric
def RMSE(val, pred):
    diff = (val - pred) ** 2
    rmse = np.mean(diff.values) ** 0.5
    return rmse
In [50]:
# Splitting the dataset into train and test data
trainset = ~Y[power_cols[0]].isnull()
X_train, Y_train = X[trainset], Y[trainset]
X_test, Y_test = X[~trainset], Y[~trainset]
In [51]:
# Splitting the trainset further for cross-validation
# (trains on two years; leaves the 3rd for validation)
cv_split = X_train.index < '2012-05-27'
X_train_cv, Y_train_cv = X_train[cv_split], Y_train[cv_split]
X_val_cv, Y_val_cv = X_train[~cv_split], Y_train[~cv_split]
In [52]:
X_train_cv.describe()
Out[52]:
In [53]:
Y_train_cv.describe()
Out[53]:
Here comes the machine learning
In [54]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, min_samples_leaf=5)
%time rf.fit(X_train_cv, Y_train_cv)
%time Y_val_cv_hat = rf.predict(X_val_cv)
Showing the local prediction error and feature importances
In [55]:
rmse = RMSE(Y_val_cv, Y_val_cv_hat)
print("Local prediction error: {}\n".format(rmse))
print("Feature importances:")
for importance, feature in sorted(zip(rf.feature_importances_, X_train.columns), key=lambda x: x[0], reverse=True):
    print(feature, importance)
In [ ]:
# Now we do the training and prediction for the testset
#%time rf.fit(X_train, Y_train)
#%time Y_test_hat = rf.predict(X_test)
In [ ]:
#Preparing the submission file:
#Converting the prediction matrix to a dataframe
#Y_test_hat = pd.DataFrame(Y_test_hat, index=X_test.index, columns=power_cols)
# We need to convert the parsed datetime back to utc timestamp
#Y_test_hat['ut_ms'] = (Y_test_hat.index.astype(np.int64) * 1e-6).astype(int)
# Writing the submission file as csv
#Y_test_hat[['ut_ms'] + power_cols].to_csv('rf_baseline.csv', index=False)
In [ ]:
# Goto https://kelvins.esa.int/mars-express-power-challenge/ , create an account and submit the file!
In [56]:
# renaming dataset variables
X_all, y_all = X, Y
Data Scaling
Scaling of X will use the full set of data (train & test).
In [57]:
_x = X_all.as_matrix()
sc_min = _x.min(axis=0)
sc_max = _x.max(axis=0)
sc_avg = _x.mean(axis=0)
sc_std = _x.std(axis=0)
In [58]:
def scale(X):
    return (X - sc_min) / (sc_max - sc_min)
    # return 2 * (X - sc_min) / (sc_max - sc_min) - 1

def scale2(X):
    return (X - sc_avg) / sc_std
Aliases: training on X -> y, testing on Xt -> yt
In [59]:
X, y = X_train_cv.as_matrix(), Y_train_cv.as_matrix()
Xt, yt = X_val_cv.as_matrix(), Y_val_cv.as_matrix()
In [60]:
X.shape, y.shape
Out[60]:
In [61]:
Xt.shape, yt.shape
Out[61]:
In [62]:
X_date = X_train_cv.index
Xt_date = X_val_cv.index
Mean y state in the "train_cv" set
In [63]:
y_mean = y.mean(axis=0)
In [66]:
# re-defining the evaluation metric (to work over numpy matrices)
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))
In [67]:
# RMSE per instant (hour) in the dataset
def RMSE_per_instant(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2, axis=1) ** 0.5
In [65]:
def moving_average(a, n=3):
    """
    >>> moving_average(np.arange(10), n=3)
    array([ 1., 2., 3., 4., 5., 6., 7., 8.])
    """
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n
# http://stackoverflow.com/a/14314054
In [68]:
from collections import namedtuple

err_range = namedtuple('rmse', ['range', 'period'])

def error_evolution(y_true, y_pred, dates=None):
    # assign each instant the RMSE of a 1-month window centered on it
    err_x_t = RMSE_per_instant(y_true=y_true, y_pred=y_pred)
    # number of instants to aggregate (30 days * 24 hours)
    nr_insts_agg = 30 * 24
    rmse_per_month = moving_average(err_x_t ** 2, n=nr_insts_agg) ** 0.5
    # these dates may be slightly off, but for plotting purposes they're close enough
    if dates is None:
        rmse_per_month_dates = None
    else:
        h = int(nr_insts_agg / 2)
        rmse_per_month_dates = dates[h : h + len(rmse_per_month)]
    # measure RMSE incrementally, up to each of 1000 evenly-spaced
    # instants along the full time span
    rmse_so_far = [
        (i if dates is None else dates[int(i - 1)],
         RMSE(y_true=y_true[:int(i)], y_pred=y_pred[:int(i)]))
        for i in np.linspace(1, len(y_true), 1000)]
    return (rmse_per_month_dates, rmse_per_month), rmse_so_far

def error_evolution_summary(err_evol):
    (_, rmse_per_month), rmse_so_far = err_evol
    per_month = err_range([f(rmse_per_month) for f in [np.min, np.max]], 'per_month')
    so_far = err_range([f(rmse_so_far, key=lambda i: i[1])[1] for f in [min, max]], 'so_far')
    return rmse_so_far[-1][1], per_month, so_far
In [69]:
martian_years = [
    year_data.index[0]
    for year_data in [pow_train1, pow_train2, pow_train3, pow_test]
] + [pow_test.index[-1]]
martian_years
Out[69]:
In [86]:
def show_error_evolution(errors, dataset=''):
    lgd_args = dict(ncol=2, loc='best', frameon=True, fancybox=True)

    def show_martian_years():
        xl = plt.xlim()
        for my in martian_years:
            plt.axvline(my, ls=':', color='grey')
        plt.xlim(xl)

    def find_extreme(func, data, cutoff=0.05):
        return func(data[int(cutoff * len(data)):], key=lambda i: i[1])[1]

    plt.figure(figsize=(11, 5))
    for err_evol, label in errors:
        dates, err = err_evol[0]
        if dates is not None:
            plt.plot(dates, err, label=label)
        else:
            plt.plot(err, label=label)
    plt.legend(**lgd_args)
    plt.xlabel('Date'); plt.ylabel('RMSE')
    t = ((dataset + ' e') if dataset != '' else 'E') + 'rror over a 30-day sliding window'
    plt.title(t)
    show_martian_years()
    fig1 = plt.gcf()

    plt.figure(figsize=(11, 5))
    for err_evol, label in errors:
        plt.plot(*zip(*err_evol[1]), label=label)
    plt.legend(**lgd_args)
    plt.xlabel('Date'); plt.ylabel('RMSE')
    t = ((dataset + ' e') if dataset != '' else 'E') + 'rror in all predictions up to the current date'
    plt.title(t)
    show_martian_years()
    m = 0.95 * min(find_extreme(min, esf) for ((_, esf), _) in errors)
    M = 1.05 * max(find_extreme(max, esf) for ((_, esf), _) in errors)
    plt.ylim(m, M)
    fig2 = plt.gcf()
    return fig1, fig2
In [64]:
def initialize_rng(seed=None, seeder_rng=None, *args, **kwargs):
    """
    Initializes a separate numpy random number generator object. Its state
    is initialized using `seed`, if provided, or a newly generated seed if not.
    Should a random number generator function be provided (`seeder_rng`), it
    will be used to generate the seed. Otherwise, `numpy.random.randint` will be
    used as the seeder. Valid functions to provide in `seeder_rng` include
    `random.randrange` and `numpy.random.randint`, or equivalent functions from
    independent generators, given by `random.Random` or `np.random.RandomState`.
    """
    if seed is None:
        random_integer = np.random.randint if seeder_rng is None else seeder_rng
        # np.random.randint converts its arguments internally to C signed longs
        # (32 bits, ranging in [-2**31, 2**31-1])
        lim = 2**31
        seed = random_integer(-lim, lim - 1)
        # RandomState requires a seed in [0, 2**32-1]
        seed += lim
    rng = np.random.RandomState(seed)
    return rng, seed
In [71]:
def median_seed(model_class, model_args=None, train_args=None, nr_runs=11):
    """
    Find a random number generator seed that leads a model to
    achieve its estimated median performance.
    """
    # uses globals: X, y, Xt, yt
    model_args = {} if model_args is None else model_args
    train_args = {} if train_args is None else train_args
    rng, _ = initialize_rng(seed=1)
    rngs, seeds = zip(*[
        initialize_rng(seeder_rng=rng.randint)
        for i in range(nr_runs)
    ])
    errs = []
    for r in rngs:
        m = model_class(random_state=r, **model_args)
        m.fit(X, y, **train_args)
        yt_p = m.predict(Xt)
        errs.append(RMSE(y_true=yt, y_pred=yt_p))
    return sorted(zip(errs, seeds))[int(nr_runs / 2)]
Measure the performance obtained by Random Forests, for comparison in the plots shown later on.
The parameter settings below come from Alex's rf_baseline.py. For a discussion of their tuning, see this thread in the competition's forum.
For additional information, see: RandomForestRegressor,
ExtraTreesRegressor.
In [91]:
model_args = dict(n_estimators=100, n_jobs=-1, min_samples_leaf=5)
In [ ]:
# get a seed that results in a model of median performance; picked: 4290846341
#%time median_seed(RandomForestRegressor, model_args, nr_runs=11)[1]
In [93]:
m = RandomForestRegressor(random_state=np.random.RandomState(seed=4290846341), **model_args)
%time m.fit(X, y)
%time y_p = m.predict(X)
%time yt_p = m.predict(Xt)
In [94]:
rf_err_evol = error_evolution(y, y_p, X_date)
rf_err_evol_t = error_evolution(yt, yt_p, Xt_date)
# summary of the test set error
error_evolution_summary(rf_err_evol_t)
Out[94]:
In [ ]:
# get a seed that results in a model of median performance; picked: 630311759
#%time median_seed(ExtraTreesRegressor, model_args, nr_runs=11)[1]
In [96]:
m = ExtraTreesRegressor(random_state=np.random.RandomState(seed=630311759), **model_args)
%time m.fit(X, y)
%time y_p = m.predict(X)
%time yt_p = m.predict(Xt)
In [97]:
et_err_evol = error_evolution(y, y_p, X_date)
et_err_evol_t = error_evolution(yt, yt_p, Xt_date)
# summary of the test set error
error_evolution_summary(et_err_evol_t)
Out[97]:
Error over the train set
In [98]:
show_error_evolution([
(rf_err_evol, 'RandomForestRegressor'),
(et_err_evol, 'ExtraTreesRegressor')
], 'Train set');
Error over the test set
In [99]:
show_error_evolution([
(rf_err_evol_t, 'RandomForestRegressor'),
(et_err_evol_t, 'ExtraTreesRegressor')
], 'Test set');
In [94]:
def esn_eval_setup(model_args, train_args, nr_runs=50, desc=None):
    """
    Evaluate the performance of ESN models, trained with
    the given parameters, across several runs.
    """
    # uses globals: X, y, Xt, yt
    rng, seed = initialize_rng(seed=42)
    res = []
    for run in trange(nr_runs, leave=False, file=sys.stdout, desc=desc):
        m = ESN(random_state=rng, **model_args)
        m.fit(scale(X), y, **train_args)
        if m.output_feedback:
            yt_p = m.predict(scale(Xt), y_initial=train_args.get('y_initial'))
        else:
            yt_p = m.predict(scale(Xt))
        e = RMSE(y_true=yt, y_pred=yt_p)
        res.append(e)
    return res
In [87]:
def esn_eval_param(param_name, param_values, model_args, train_args, nr_runs=50):
    "Evaluate alternatives for an ESN parameter."
    model_args = model_args.copy()
    train_args = train_args.copy()
    args = (model_args if param_name in model_args else train_args)
    res = {}
    for arg in param_values:
        args[param_name] = arg
        res[arg] = esn_eval_setup(model_args, train_args, nr_runs=nr_runs, desc=str(arg))
    return res
Below, multiple parameter sweeps are performed, each intended to tune one parameter at a time. The parameter values are not, however, changed across sub-sections, as they had already been tuned beforehand.
The statistics below illustrate the ESN's sensitivity to different parameter settings. Results are aggregated over 50 independent runs with each setup.
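The sweep results below are inspected by printing mean/median dicts in separate cells. As a purely hypothetical convenience (not part of the original analysis), the dictionaries returned by esn_eval_param could also be condensed into one sorted table:

def sweep_summary(res):
    """Summarize a sweep dict {param_value: [rmse_per_run, ...]} as a table."""
    rows = {v: {'mean': np.nanmean(r), 'median': np.nanmedian(r), 'std': np.nanstd(r)}
            for (v, r) in res.items()}
    return pd.DataFrame(rows).T.sort_values('median')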
In [293]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [294]:
nn_res = esn_eval_param('nr_neurons', [100, 200, 300, 400, 500, 600],
model_args, train_args)
In [295]:
{a:np.mean(r) for (a,r) in nn_res.items()}
Out[295]:
In [296]:
{a:np.median(r) for (a,r) in nn_res.items()}
Out[296]:
In [297]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [298]:
pc_res = esn_eval_param('prob_connect', [0.1, 0.25, 0.5, 0.75, 1.0],
model_args, train_args)
In [299]:
{a:np.mean(r) for (a,r) in pc_res.items()}
Out[299]:
In [300]:
{a:np.median(r) for (a,r) in pc_res.items()}
Out[300]:
spectral_radius"The spectral radius of the reservoir weight matrix codetermines (i) the effective time constant of the echo state network (larger spectral radius implies slower decay of impulse response) and (ii) the amount of nonlinear interaction of input components through time (larger spectral radius implies longer-range interactions)."
"[...] it is empirically observed that the echo state property (ESP) is granted for any input if this spectral radius is smaller than unity. This has led in the literature to a far-spread but erroneous identification of the ESP with a spectral radius below 1. Specifically, the larger the input amplitude, the further above unity the spectral radius may be while still obtaining the ESP."
In [304]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [305]:
sr_res = esn_eval_param('spectral_radius', [0.7, 0.8, 0.9, 0.95, 0.99, 2, 3],
model_args, train_args)
In [306]:
{a:np.mean(r) for (a,r) in sr_res.items()}
Out[306]:
In [307]:
{a:np.median(r) for (a,r) in sr_res.items()}
Out[307]:
alpha
Regularization term used in the Tikhonov regularization / Ridge regression of the readout's weights.
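For reference, this is the textbook closed-form ridge solution that alpha enters into, sketched over a matrix R of collected reservoir states (one row per time step); the actual computation in echo_state_networks.py may differ:

def fit_readout(R, Y, alpha):
    # solve (R'R + alpha*I) W_out = R'Y -- Tikhonov-regularized least squares
    A = R.T.dot(R) + alpha * np.eye(R.shape[1])
    W_out = np.linalg.solve(A, R.T.dot(Y))
    return W_out  # readout predictions are then R.dot(W_out)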
In [88]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [89]:
ap_res = esn_eval_param('alpha', [1e-9, 1, 10, 25, 50, 60, 70, 80],
model_args, train_args)
In [90]:
{a:np.mean(r) for (a,r) in ap_res.items()}
Out[90]:
In [91]:
{a:np.median(r) for (a,r) in ap_res.items()}
Out[91]:
In [95]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [96]:
of_res = esn_eval_param('output_feedback', [True, False],
model_args, train_args)
In [97]:
{a:np.mean(r) for (a,r) in of_res.items()}
Out[97]:
In [98]:
{a:np.median(r) for (a,r) in of_res.items()}
Out[98]:
In [99]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [100]:
yn_res = esn_eval_param('y_noise', [0, 0.005, 0.01, 0.05, 0.1, 0.15, 0.25],
model_args, train_args)
In [101]:
{a:np.mean(r) for (a,r) in yn_res.items()}
Out[101]:
In [102]:
{a:np.median(r) for (a,r) in yn_res.items()}
Out[102]:
In [103]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [104]:
yi_res = {}
for arg in ['zeros', 'y_mean']:
    # y_initial=None presumably starts from a zero output vector (hence the 'zeros' label)
    train_args['y_initial'] = y_mean if arg == 'y_mean' else None
    yi_res[arg] = esn_eval_setup(model_args, train_args, nr_runs=50, desc=str(arg))
In [105]:
{a:np.mean(r) for (a,r) in yi_res.items()}
Out[105]:
In [106]:
{a:np.median(r) for (a,r) in yi_res.items()}
Out[106]:
In [240]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [241]:
af_res = esn_eval_param('activation', ESN._activation_func.keys(),
model_args, train_args)
In [244]:
sorted({a:np.nanmean(r) for (a,r) in af_res.items()}.items(), key=lambda i:i[1])
Out[244]:
In [245]:
sorted({a:np.nanmedian(r) for (a,r) in af_res.items()}.items(), key=lambda i:i[1])
Out[245]:
In [248]:
# activation functions that lead to NaNs in some runs
[a for (a,r) in af_res.items() if np.any(np.isnan(r))]
Out[248]:
The hyperbolic tangent ('hyp.tan'), the default in ESNs, actually performs quite badly here.
leaking_rate
The updating of reservoir neurons ends with the step $r_t = (1 - \gamma)\, r_{t-1} + \gamma\, \tilde{r}_t$, where $r_t$ is the reservoir's state at time $t$, $\tilde{r}_t$ is the newly computed activation, and $\gamma \in (0,1]$ is the leaking rate.
For $\gamma = 1$ the neuron's update completely forgets its previous state, while for $\gamma$ near 0 the latest activations barely have any impact in defining the neuron's new state.
The leaking rate "can be regarded as the speed of the reservoir update dynamics discretized in time". Alternatively, "it can be seen as a simple digital low-pass filter, also known as exponential smoothing, applied to every node." -- A practical guide to applying echo state networks (Sec. 3.2.6)
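As a sketch of where the leaking rate enters a reservoir update (the weight names W_in, W, W_fb and the feedback term are assumptions; the actual ESN class may organize this differently):

def reservoir_step(r_prev, u, y_prev, W_in, W, W_fb, gamma, f=np.tanh):
    # candidate activation from input, recurrent, and output-feedback terms
    r_tilde = f(W_in.dot(u) + W.dot(r_prev) + W_fb.dot(y_prev))
    # leaky integration: gamma=1 forgets the previous state entirely
    return (1 - gamma) * r_prev + gamma * r_tilde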
In [496]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(y_initial=y_mean)
In [503]:
lr_res = esn_eval_param('leaking_rate', [0.1, 0.25, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 1.0],
model_args, train_args)
In [509]:
#{a:np.mean(r) for (a,r) in lr_res.items()}
#{a:np.median(r) for (a,r) in lr_res.items()}
In [507]:
sorted({a:np.mean(r) for (a,r) in lr_res.items()}.items(), key=lambda i:i[1])
Out[507]:
In [508]:
sorted({a:np.median(r) for (a,r) in lr_res.items()}.items(), key=lambda i:i[1])
Out[508]:
Picking leaking_rate=0.75
My final (and best) submission to the competition used quite different parameter settings. Those values had previously been tuned with different features, and were then slightly adjusted based on the results of a limited parameter sweep. Their performance is evaluated here, for comparison against the more extensively tuned settings used above, which achieve a median RMSE of 0.10240993273406276.
In [107]:
model_args = dict(nr_neurons=400, prob_connect=0.75, spectral_radius=0.95, activation='sigmoid', leaking_rate=1,
output_feedback=True, y_noise=0.1, alpha=30)
train_args = dict(y_initial=y_mean)
In [108]:
res = esn_eval_setup(model_args, train_args, nr_runs=50)
In [109]:
np.mean(res)
Out[109]:
In [110]:
np.median(res)
Out[110]:
Inspecting the behaviour of (non-ensembled) Echo State Networks on the problem.
In [102]:
model_args = dict(nr_neurons=500, prob_connect=0.25, spectral_radius=0.99, activation='sigmoid', leaking_rate=0.75,
y_noise=0.005, alpha=60)
train_args = dict(y_initial=None)
Training a single ESN, with no output_feedback
In [ ]:
# get a seed that results in a model of median performance; picked: 878115723
#%time median_seed(ESN, model_args, train_args, nr_runs=25)[1]
In [108]:
#rng, seed = initialize_rng(); print('Seed:', seed)
rng, seed = initialize_rng(seed=878115723)
m = ESN(random_state=rng, output_feedback=False, **model_args)
%time m.fit(scale(X), y, **train_args)
%time y_p = m.predict(scale(X))
%time yt_p = m.predict(scale(Xt))
RMSE(y_true=yt, y_pred=yt_p)
Out[108]:
In [109]:
esnnf_err_evol = error_evolution(y, y_p, X_date)
esnnf_err_evol_t = error_evolution(yt, yt_p, Xt_date)
# summary of the test set error
error_evolution_summary(esnnf_err_evol_t)
Out[109]:
Training a single ESN, with output_feedback
In [110]:
model_args['output_feedback'] = True
In [ ]:
# get a seed that results in a model of median performance; picked: 2942995346
#%time median_seed(ESN, model_args, train_args, nr_runs=25)[1]
In [114]:
#rng, seed = initialize_rng(); print('Seed:', seed)
rng, seed = initialize_rng(seed=2942995346)
m = ESN(random_state=rng, **model_args)
%time m.fit(scale(X), y, **train_args)
%time y_p = m.predict(scale(X))
%time yt_p = m.predict(scale(Xt))
RMSE(y_true=yt, y_pred=yt_p)
Out[114]:
In [115]:
esn_err_evol = error_evolution(y, y_p, X_date)
esn_err_evol_t = error_evolution(yt, yt_p, Xt_date)
# summary of the test set error
error_evolution_summary(esn_err_evol_t)
Out[115]:
In [116]:
# number of dimensions per layer (input, reservoir, readout) of the model
print(m)
Error over the train set
In [117]:
show_error_evolution([
(rf_err_evol, 'RandomForestRegressor'),
(et_err_evol, 'ExtraTreesRegressor'),
(esnnf_err_evol, 'ESN (no y feedback)'),
(esn_err_evol, 'ESN')
], 'Train set');
Error over the test set
In [118]:
show_error_evolution([
(rf_err_evol_t, 'RandomForestRegressor'),
(et_err_evol_t, 'ExtraTreesRegressor'),
(esnnf_err_evol_t, 'ESN (no y feedback)'),
(esn_err_evol_t, 'ESN')
], 'Test set');
Take these results with a grain of salt, as the ESNs are heavily tuned, whereas the Random Forests aren't. Still, the plots show some interesting behaviours: the model that struggles the most on the train set (the ESN with feedback) is the one that ultimately wins on the test set.
Inspecting the behaviour of ensembled Echo State Networks on the problem.
In total, 51 ESNs are trained. At each step, each of the 51 produces its own prediction. Those predictions are aggregated with a median, and the result is fed back to all ESNs at the next step through the output-to-reservoir recurrent connections (if output_feedback=True and feedback_ensemble_y=True), thus helping to stabilize the reservoirs' dynamics.
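A minimal sketch of that feedback loop (the per-model step method is hypothetical shorthand for advancing one ESN by a single time step; ESN_ensemble.predict handles the real bookkeeping):

def ensemble_predict_with_feedback(models, X_scaled, y_initial):
    y_prev, out = y_initial, []
    for u in X_scaled:
        # every model predicts from the same, shared previous output
        preds = [m.step(u, y_prev) for m in models]  # hypothetical step()
        # the median prediction is what all models are fed at the next step
        y_prev = np.median(preds, axis=0)
        out.append(y_prev)
    return np.array(out)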
In [119]:
#y_init = y_mean
y_init = np.zeros(y.shape[1])
In [120]:
rng, seed = initialize_rng(seed=42)
print('Seed:', seed)
model_args = dict(aggregate='median', nr_neurons=500, prob_connect=0.25, spectral_radius=0.99,
activation='sigmoid', leaking_rate=0.75, output_feedback=True, y_noise=0.005, alpha=60)
train_args = dict(nr_models=51, y_initial=y_init)
#train_args['weighted'] = True
m = ESN_ensemble(**model_args)
%time m.fit(scale(X), y, **train_args)
In [130]:
# number of dimensions per layer (input, reservoir, readout) in each of the ensemble's models
print(m.M[0])
Evaluate error on the test set
In [121]:
# prediction with no ensembled feedback
# (may fail because of some models getting completely off track -- proper regularization (alpha) helps prevent that)
%time yt_p = m.predict(scale(Xt), y_initial=y_init)
RMSE(y_true=yt, y_pred=yt_p)
Out[121]:
In [122]:
# with ensembled feedback: test error when the ensemble aggregates with the mean
m.aggregate = np.mean
%time yt_p = m.predict(scale(Xt), y_initial=y_init, feedback_ensemble_y=True)
RMSE(y_true=yt, y_pred=yt_p)
Out[122]:
In [123]:
# with ensembled feedback: test error when the ensemble aggregates with the median
m.aggregate = np.median
%time yt_p = m.predict(scale(Xt), y_initial=y_init, feedback_ensemble_y=True)
RMSE(y_true=yt, y_pred=yt_p)
Out[123]:
Advance ESNs through the train set. Then, while preserving the final reservoirs' states (r), and supplying the true y(t-1), predict evolution on the test set. The test set starts right where the train set leaves off (1h apart), so the reservoirs' states can be carried over.
In [124]:
%time y_p = m.predict(scale(X), y_initial=y_init, feedback_ensemble_y=True)
print('Train set error:', RMSE(y_true=y, y_pred=y_p))
%time yt_p = m.predict(scale(Xt), reset_r=False, y_initial=y[-1], feedback_ensemble_y=True)
RMSE(y_true=yt, y_pred=yt_p)
Out[124]:
Collect data for the "error over time" plots.
In [125]:
esn_ensemb_err_evol = error_evolution(y, y_p, X_date)
esn_ensemb_err_evol_t = error_evolution(yt, yt_p, Xt_date)
# summary of the test set error
error_evolution_summary(esn_ensemb_err_evol_t)
Out[125]:
How does the ensemble's aggregate error change as more and more models are trained and added to the ensemble?
To reduce computation time in this analysis, given that feedback_ensemble_y=True, each model is made to predict the next y state while being fed the ground truth in y_prev (i.e., assuming the ensemble had made a perfect prediction at the previous time step). That is why these error values are lower than those seen above.
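For illustration, a sketch of the computation behind such a curve, assuming the per-model predictions collected under that teacher-forcing scheme were stacked into an array of shape (nr_models, T, nr_outputs):

def error_vs_ensemble_size(preds, y_true):
    # RMSE of the median over the first k models, for k = 1..nr_models
    return [RMSE(y_true, np.median(preds[:k], axis=0))
            for k in range(1, len(preds) + 1)]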
In [126]:
# error change in the train set
%time e = m.error(scale(X), y, y_initial=y_init, feedback_ensemble_y=True)
m.show_error(e)
plt.savefig('plots/error_over_nrmodels__trainset.png');
In [127]:
# error change in the test set
%time et = m.error(scale(Xt), yt, y_initial=y_init, feedback_ensemble_y=True)
m.show_error(et)
plt.savefig('plots/error_over_nrmodels__testset.png');
Error over the train set
In [128]:
fig1, fig2 = show_error_evolution([
(rf_err_evol, 'RandomForestRegressor'),
(et_err_evol, 'ExtraTreesRegressor'),
(esn_err_evol, 'Echo State Network (single model)'), # using y feedback
(esn_ensemb_err_evol, 'Echo State Network (ensemble)'),
], 'Train set')
fig1.savefig('plots/error_over_time__trainset__per_month.png')
fig2.savefig('plots/error_over_time__trainset__so_far.png');
Error over the test set
In [129]:
fig1, fig2 = show_error_evolution([
(rf_err_evol_t, 'RandomForestRegressor'),
(et_err_evol_t, 'ExtraTreesRegressor'),
(esn_err_evol_t, 'Echo State Network (single model)'), # using y feedback
(esn_ensemb_err_evol_t, 'Echo State Network (ensemble)'),
], 'Test set')
fig1.savefig('plots/error_over_time__testset__per_month.png')
fig2.savefig('plots/error_over_time__testset__so_far.png');
Define train & test sets (respectively: full training data, and data to be predicted)
In [72]:
_X, _y = X_train.as_matrix(), Y_train.as_matrix()
_Xt, _yt = X_test.as_matrix(), Y_test.as_matrix()
Determine the mean y in the full train set
In [73]:
y_mean_tr = _y.mean(axis=0)
The code below regenerates my final (and best) submission to the competition (Leaderboard RMSE: 0.089078627464354; Final results RMSE: 0.088395630359812905).
The parameter settings below are those I was using at the time. See above for the performance comparison against the parameter settings tuned after the end of the competition.
Train ensemble
In [74]:
rng, seed = initialize_rng(seed=20120617)
model_args = dict(aggregate='median', nr_neurons=400, prob_connect=0.75, spectral_radius=0.95,
activation='sigmoid', leaking_rate=1.0, output_feedback=True, y_noise=0.1, alpha=30)
train_args = dict(nr_models=61, y_initial=y_mean_tr)
m = ESN_ensemble(random_state=rng, **model_args)
%time m.fit(scale(_X), _y, **train_args)
In [75]:
# number of dimensions per layer (input, reservoir, readout) in each of the ensemble's models
print(m.M[0])
Advance ESNs through the train set. Then, while preserving the final reservoirs' states (r), and supplying the true y(t-1), predict evolution on the test set. The test set starts right where the train set leaves off (1h apart), so the reservoirs' states can be carried over.
In [76]:
%time y_p = m.predict(scale(_X), y_initial=y_mean_tr, feedback_ensemble_y=True)
print('Train set error:', RMSE(y_true=_y, y_pred=y_p))
%time yt_p = m.predict(scale(_Xt), reset_r=False, y_initial=_y[-1], feedback_ensemble_y=True)
In [77]:
# collect data for the "error over time" plots
esn_ensemb_trerr_evol = error_evolution(_y, y_p, dates=X_train.index)
error_evolution_summary(esn_ensemb_trerr_evol)
Out[77]:
Error over the train set
In [88]:
fig1, fig2 = show_error_evolution([
(esn_ensemb_trerr_evol, 'ESN ensemble'),
], 'Train set')
fig1.savefig('plots/error_over_time__fulltrainset__per_month.png')
fig2.savefig('plots/error_over_time__fulltrainset__so_far.png');
How does the ensemble's aggregate error change as more and more models are trained and added to the ensemble?
To reduce computation time in this analysis, given that feedback_ensemble_y=True, each model is made to predict the next y state while being fed the ground truth in y_prev (i.e., assuming the ensemble had made a perfect prediction at the previous time step). That is why these error values are lower than those seen above.
In [79]:
# error change in the train set
%time e = m.error(scale(_X), _y, y_initial=y_mean_tr, feedback_ensemble_y=True)
m.show_error(e)
plt.savefig('plots/error_over_nrmodels__fulltrainset.png');
In [80]:
PREDICTION = yt_p
In [81]:
# Converting the prediction matrix to a dataframe
Y_test_hat = pd.DataFrame(PREDICTION, index=X_test.index, columns=power_cols)
In [82]:
# We need to convert the parsed datetime back to utc timestamp
#Y_test_hat['ut_ms'] = (Y_test_hat.index.astype(np.int64) * 1e-6).astype(int)
Y_test_hat['ut_ms'] = list(map(int, Y_test_hat.index.astype(np.int64) * 1e-6))
In [83]:
# Writing the submission file as csv
Y_test_hat[['ut_ms'] + power_cols].to_csv('lfs_submission_5b__rebuilt.csv', index=False)
In [84]:
#Y_test_hat
In [85]:
Y_test_hat.describe()
Out[85]: