Tutorial on RVM Regression with Derivatives

This tutorial builds on the previous tutorial on linear regression with Relevance Vector Machines (RVMs) using linear and localized kernels. This time we also include derivative observations and see whether we still land in the ballpark. And heeeere we go!
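
The idea in a nutshell: a linear model y(x) = Phi(x) w also fixes its derivative, y'(x) = Phi'(x) w, with the same weights w, so value and derivative observations can simply be stacked into one regression problem. Here is a minimal plain-numpy sketch of that stacking (ordinary least squares on a polynomial basis, not yet using the RVM helpers from linear_model):

import numpy as np

xs = np.linspace(-5, 5, 50)
Phi  = np.vander(xs, 6, increasing=True)                     # value basis [1, x, ..., x^5]
dPhi = np.hstack((np.zeros((len(xs), 1)),
                  np.vander(xs, 5, increasing=True) * np.arange(1, 6)))  # derivative basis [0, 1, 2x, ..., 5x^4]
A = np.vstack((Phi, dPhi))                 # stacked design matrix: values on top, derivatives below
b = np.hstack((np.cos(xs), -np.sin(xs)))   # stacked targets
w, *_ = np.linalg.lstsq(A, b, rcond=None)  # shared weights fit both blocks at once
print(np.round(w, 3))                      # odd-order coefficients come out (numerically) zero, as cos(x) is even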


In [1]:
%matplotlib notebook
from linear_model import RelevanceVectorMachine, distribution_wrapper, GaussianFeatures, \
    FourierFeatures, repeated_regression, plot_summary
from sklearn import preprocessing
import numpy as np
from scipy import stats, misc
import matplotlib
import matplotlib.pylab as plt

matplotlib.rc('text', usetex=True)
matplotlib.rcParams['text.latex.preamble'] = r"\usepackage{amsmath}"

from sklearn import utils, preprocessing
import sklearn

First things first, let's set up the dataset to regress.


In [2]:
x = np.linspace(-5,5,500)
x_pred = np.linspace(1.5*x[0],1.5*x[-1],x.shape[0])
epsilon = stats.norm(loc=0,scale=0.01)
noise = epsilon.rvs(size=x.shape[0])
# t_0 = np.exp(-x**2) + noise
# t_1 = -2*x*np.exp(-x**2) + noise
t_0 = np.cos(x) + noise
t_1 = -np.sin(x) + noise

t = np.hstack((t_0,t_1))

N = len(t_0)
N_pred = len(x_pred)

fig = plt.figure(figsize=(5,5))
plt.plot(x,t_0,'ro',markerfacecolor="None",label="data")
plt.plot(x,t_1,'bo',markerfacecolor="None",label="data (1st derivative)")
plt.xlabel("input")
plt.ylabel("output")
plt.legend(loc=0)
plt.show()


1. Single Regression

1.1 Linear Kernel

Neat, now let's test whether we can regress this data using a polynomial feature space.


In [3]:
# choosing the feature space
k = 5
trafo = preprocessing.PolynomialFeatures(k)

X_0 = trafo.fit_transform(x.reshape((-1,1)))
# the derivative of x**j is j*x**(j-1), so the derivative design matrix is the
# column-wise derivative of the value design matrix (with a zero column for the bias term)
trafo_der = preprocessing.PolynomialFeatures(k-1)
X_1 = trafo_der.fit_transform(x.reshape((-1,1))) * np.arange(1,k+1)
X_1 = np.hstack((np.zeros((X_1.shape[0],1)),X_1))
X = np.vstack((X_0,X_1))
t = np.hstack((t_0,t_1))
print(X.shape,t.shape)

M = X.shape[1]

# initializing hyperparameters
init_beta = 1./ np.var(t) # (that's the default start)
init_alphas = np.ones(X.shape[1])
init_alphas[1:] = np.inf

# setting up the model regression class
model = RelevanceVectorMachine(n_iter=250,verbose=False,compute_score=True,init_beta=init_beta,
                               init_alphas=init_alphas)
# regress
model.fit(X,t)

# predict
X_pred_0 = trafo.fit_transform(x_pred.reshape((-1,1)))
X_pred_1 = trafo_der.fit_transform(x_pred.reshape((-1,1))) * np.arange(1,k+1)
X_pred_1 = np.hstack((np.zeros((X_pred_1.shape[0],1)),X_pred_1))
X_pred = np.vstack((X_pred_0,X_pred_1))

y, yerr = model.predict(X_pred,return_std=True)

y_0, y_1 = y[:N_pred], y[N_pred:]
yerr_0, yerr_1 = yerr[:N_pred], yerr[N_pred:]


fig = plt.figure()
ax = fig.add_subplot(121)
ax.plot(x,t_0,'ro',label="data",markerfacecolor="None")
ax.plot(x,t_1,'bo',label="data (1st derivative)",markerfacecolor="None")
ax.fill_between(x_pred,y_0-2*yerr_0,y_0+2*yerr_0,alpha=.5,label="95\%")
ax.fill_between(x_pred,y_1-2*yerr_1,y_1+2*yerr_1,alpha=.5,label="95\% (1st derivative)")
ax.plot(x_pred,y_0,'-',label="estimate")
ax.plot(x_pred,y_1,'-',label="estimate (1st derivative)")
plt.legend(loc=0)
ax.set_xlabel("input")
ax.set_ylabel("output")

ax1 = fig.add_subplot(122)
ax1.plot(model.mse_,'-')
ax1.set_xlabel("iteration")
ax1.set_ylabel("MSE")
plt.tight_layout()
plt.show()


(1000, 6) (1000,)

1.2 Localized Kernel

Indeed that seemed to work. But what about a nonlinear feature space? Below we use a Fourier basis, with a localized Gaussian basis kept as a commented-out alternative. Will it still be able to fit the data and its derivative?
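
As a side note, the k_der keyword of these helper transforms presumably selects between the basis functions (k_der=0) and their first derivatives (k_der=1); this is an assumption about the helper classes, which are not documented here. For a Fourier basis the column-wise derivative is straightforward, roughly along these lines:

# hypothetical illustration, not the actual FourierFeatures implementation
w0 = 2*np.pi/(x[-1]-x[0])                                              # base frequency on this interval
Phi_val = np.column_stack([np.sin(j*w0*x) for j in range(1,11)])       # k_der=0 analogue
Phi_der = np.column_stack([j*w0*np.cos(j*w0*x) for j in range(1,11)])  # k_der=1 analogue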


In [13]:
# choosing the feature space
# trafo_0 = GaussianFeatures(k=30, mu0=-3, dmu=.2, k_der=0)
# trafo_1 = GaussianFeatures(k=30, mu0=-3, dmu=.2, k_der=1)
trafo_0 = FourierFeatures(k=10, k_der=0)
trafo_1 = FourierFeatures(k=10, k_der=1)
X_0 = trafo_0.fit_transform(x.reshape((-1,1)))
X_1 = trafo_1.fit_transform(x.reshape((-1,1)))
X = np.vstack((X_0,X_1))

# initializing hyperparameters
init_beta = 1./ np.var(t) # (that's the default start)
init_alphas = np.ones(X.shape[1])
init_alphas[1:] = np.inf

# setting up the model regression class
model = RelevanceVectorMachine(n_iter=250,verbose=False,compute_score=True,init_beta=init_beta,
                               init_alphas=init_alphas,do_logbook=True,update_pct=1)

# regress
model.fit(X,t)

# predict
X_pred_0 = trafo_0.fit_transform(x_pred.reshape((-1,1)))
X_pred_1 = trafo_1.fit_transform(x_pred.reshape((-1,1)))
X_pred = np.vstack((X_pred_0,X_pred_1))

y, yerr = model.predict(X_pred, return_std=True)

y_0, y_1 = y[:N_pred], y[N_pred:]
yerr_0, yerr_1 = yerr[:N_pred], yerr[N_pred:]

fig = plt.figure()
ax = fig.add_subplot(121)
ax.plot(x,t_0,'ro',label="data",markerfacecolor="None")
ax.plot(x,t_1,'bo',label="data (1st derivative)",markerfacecolor="None")
ax.fill_between(x_pred,y_0-2*yerr_0,y_0+2*yerr_0,alpha=.5,label="95\%")
ax.fill_between(x_pred,y_1-2*yerr_1,y_1+2*yerr_1,alpha=.5,label="95\% (1st derivative)")
ax.plot(x_pred,y_0,'-',label="estimate")
ax.plot(x_pred,y_1,'-',label="estimate (1st derivative)")
plt.legend(loc=0)
ax.set_xlabel("input")
ax.set_ylabel("output")

ax1 = fig.add_subplot(122)
ax1.plot(model.mse_,'-')
ax1.set_xlabel("iteration")
ax1.set_ylabel("MSE")
plt.tight_layout()
plt.show()


2. Repeated Regression

2.1 Localized Kernel


In [7]:
# choosing the feature space
trafo_0 = GaussianFeatures(k=30, mu0=-3, dmu=.2, k_der=0)
trafo_1 = GaussianFeatures(k=30, mu0=-3, dmu=.2, k_der=1)
base_trafo_0 = trafo_0.fit_transform
base_trafo_1 = trafo_1.fit_transform

# initializing hyperparameters using callable distributions giving new hyperparameters
# with every call (useful for repeated regression)
init_beta = distribution_wrapper(stats.halfnorm(scale=1),size=1,single=True)
init_alphas = distribution_wrapper(stats.halfnorm(scale=1),single=False)

model_type = RelevanceVectorMachine
model_kwargs = dict(n_iter=250,verbose=False,compute_score=True,init_beta=init_beta,
                       init_alphas=init_alphas,fit_intercept=False, update_pct=.9)

Nruns = 100
runtimes, coefs, models = repeated_regression(x, base_trafo_0, model_type, t=t,
                                              model_kwargs=model_kwargs, Nruns=Nruns,
                                              return_coefs=True, return_models=True,
                                              base_trafo_1=base_trafo_1)

X_0 = base_trafo_0(x.reshape((-1,1)))
X_1 = base_trafo_1(x.reshape((-1,1)))

plot_summary(models,noise,x,t,X_0,coefs,base_trafo_0,X_1=X_1)


Adding the derivatives to the regression seems to make it harder to find the global optimum: the confidence intervals of the predicted functions no longer overlap with the data everywhere, the weights are considerably more uncertain, and the estimated noise level deviates from the true one by about an order of magnitude. Hence there is either a bug or a more fundamental issue. Interestingly enough, the regression improves when the number of basis functions that are allowed to be updated per iteration is limited.
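
To probe that last observation, one could rerun the repeated regression with a much smaller update_pct, assuming (as in the cells above) that this keyword caps the fraction of basis functions re-estimated per iteration. A sketch:

# same setup as above, only with far fewer basis-function updates per iteration
model_kwargs_limited = dict(model_kwargs, update_pct=.1)
runtimes_l, coefs_l, models_l = repeated_regression(x, base_trafo_0, model_type, t=t,
                                                    model_kwargs=model_kwargs_limited, Nruns=Nruns,
                                                    return_coefs=True, return_models=True,
                                                    base_trafo_1=base_trafo_1)
plot_summary(models_l, noise, x, t, X_0, coefs_l, base_trafo_0, X_1=X_1)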