Exact GP Regression with Multiple GPUs


In [1]:
import math
import torch
import gpytorch
from matplotlib import pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Training data: 100 regularly spaced points in [0, 1]
train_x = torch.linspace(0, 1, 100)
# True function is sin(2*pi*x) with Gaussian noise
train_y = torch.sin(train_x * (2 * math.pi)) + torch.randn(train_x.size()) * 0.2

train_x = train_x.cuda()
train_y = train_y.cuda()
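
The model below distributes kernel computation across every CUDA device that PyTorch can see. A quick sanity check (not part of the original notebook) confirms how many devices that is before training starts:

In [ ]:
# Sanity-check sketch: count the CUDA devices MultiDeviceKernel will spread work across
n_devices = torch.cuda.device_count()
print('Planning to run on %d GPUs.' % n_devices)
assert n_devices >= 1, 'This notebook requires at least one CUDA device'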

In [3]:
# We will use the simplest form of GP model, exact inference;
# the kernel is wrapped in MultiDeviceKernel to spread computation across GPUs
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        base_covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        
        self.covar_module = gpytorch.kernels.MultiDeviceKernel(
            base_covar_module, device_ids=range(torch.cuda.device_count()),
            output_device=torch.device('cuda')  # Default device
        )
    
    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

likelihood = gpytorch.likelihoods.GaussianLikelihood().to("cuda:0")
model = ExactGPModel(train_x, train_y, likelihood).to("cuda:0")
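
MultiDeviceKernel wraps the base ScaleKernel much like torch.nn.DataParallel wraps a module, so the wrapped kernel is reachable through its .module attribute; that is why the training loop below reads the lengthscale via model.covar_module.module.base_kernel. A minimal sketch (added here for illustration) of inspecting the wrapped kernel and its initial hyperparameters:

In [ ]:
# Sketch: the base kernel sits under .module once wrapped by MultiDeviceKernel
print(model.covar_module.module)
print('initial lengthscale: %.3f' % model.covar_module.module.base_kernel.lengthscale.item())
print('initial noise: %.3f' % likelihood.noise.item())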

In [4]:
# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the Adam optimizer
optimizer = torch.optim.Adam([
    {'params': model.parameters()},  # Includes GaussianLikelihood parameters
], lr=0.1)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

with gpytorch.settings.max_preconditioner_size(5):
    training_iter = 50
    for i in range(training_iter):
        # Zero gradients from previous iteration
        optimizer.zero_grad()
        # Output from model
        output = model(train_x)
        # Calc loss and backprop gradients
        loss = -mll(output, train_y)
        loss.backward()
        print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
            i + 1, training_iter, loss.item(),
            model.covar_module.module.base_kernel.lengthscale.item(),
            model.likelihood.noise.item()
        ))
        optimizer.step()


Iter 1/50 - Loss: 0.958   lengthscale: 0.693   noise: 0.693
Iter 2/50 - Loss: 0.927   lengthscale: 0.644   noise: 0.644
Iter 3/50 - Loss: 0.893   lengthscale: 0.598   noise: 0.598
Iter 4/50 - Loss: 0.855   lengthscale: 0.555   noise: 0.554
Iter 5/50 - Loss: 0.810   lengthscale: 0.514   noise: 0.513
Iter 6/50 - Loss: 0.760   lengthscale: 0.475   noise: 0.474
Iter 7/50 - Loss: 0.705   lengthscale: 0.439   noise: 0.437
Iter 8/50 - Loss: 0.648   lengthscale: 0.405   noise: 0.402
Iter 9/50 - Loss: 0.593   lengthscale: 0.372   noise: 0.369
Iter 10/50 - Loss: 0.544   lengthscale: 0.342   noise: 0.338
Iter 11/50 - Loss: 0.499   lengthscale: 0.316   noise: 0.310
Iter 12/50 - Loss: 0.457   lengthscale: 0.293   noise: 0.283
Iter 13/50 - Loss: 0.421   lengthscale: 0.273   noise: 0.259
Iter 14/50 - Loss: 0.384   lengthscale: 0.256   noise: 0.236
Iter 15/50 - Loss: 0.346   lengthscale: 0.242   noise: 0.215
Iter 16/50 - Loss: 0.310   lengthscale: 0.231   noise: 0.195
Iter 17/50 - Loss: 0.279   lengthscale: 0.223   noise: 0.177
Iter 18/50 - Loss: 0.245   lengthscale: 0.216   noise: 0.161
Iter 19/50 - Loss: 0.209   lengthscale: 0.211   noise: 0.146
Iter 20/50 - Loss: 0.181   lengthscale: 0.208   noise: 0.133
Iter 21/50 - Loss: 0.143   lengthscale: 0.206   noise: 0.120
Iter 22/50 - Loss: 0.105   lengthscale: 0.206   noise: 0.109
Iter 23/50 - Loss: 0.083   lengthscale: 0.207   noise: 0.099
Iter 24/50 - Loss: 0.053   lengthscale: 0.210   noise: 0.090
Iter 25/50 - Loss: 0.011   lengthscale: 0.213   noise: 0.082
Iter 26/50 - Loss: -0.013   lengthscale: 0.218   noise: 0.074
Iter 27/50 - Loss: -0.036   lengthscale: 0.223   noise: 0.068
Iter 28/50 - Loss: -0.058   lengthscale: 0.229   noise: 0.062
Iter 29/50 - Loss: -0.082   lengthscale: 0.236   noise: 0.056
Iter 30/50 - Loss: -0.099   lengthscale: 0.244   noise: 0.051
Iter 31/50 - Loss: -0.119   lengthscale: 0.252   noise: 0.047
Iter 32/50 - Loss: -0.125   lengthscale: 0.262   noise: 0.043
Iter 33/50 - Loss: -0.136   lengthscale: 0.272   noise: 0.040
Iter 34/50 - Loss: -0.143   lengthscale: 0.283   noise: 0.037
Iter 35/50 - Loss: -0.147   lengthscale: 0.292   noise: 0.035
Iter 36/50 - Loss: -0.148   lengthscale: 0.301   noise: 0.032
Iter 37/50 - Loss: -0.145   lengthscale: 0.308   noise: 0.030
Iter 38/50 - Loss: -0.143   lengthscale: 0.313   noise: 0.029
Iter 39/50 - Loss: -0.137   lengthscale: 0.314   noise: 0.028
Iter 40/50 - Loss: -0.133   lengthscale: 0.313   noise: 0.026
Iter 41/50 - Loss: -0.132   lengthscale: 0.312   noise: 0.026
Iter 42/50 - Loss: -0.129   lengthscale: 0.309   noise: 0.025
Iter 43/50 - Loss: -0.127   lengthscale: 0.306   noise: 0.025
Iter 44/50 - Loss: -0.125   lengthscale: 0.301   noise: 0.024
Iter 45/50 - Loss: -0.125   lengthscale: 0.296   noise: 0.024
Iter 46/50 - Loss: -0.129   lengthscale: 0.291   noise: 0.024
Iter 47/50 - Loss: -0.130   lengthscale: 0.287   noise: 0.025
Iter 48/50 - Loss: -0.132   lengthscale: 0.282   noise: 0.025
Iter 49/50 - Loss: -0.132   lengthscale: 0.278   noise: 0.025
Iter 50/50 - Loss: -0.130   lengthscale: 0.274   noise: 0.026
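
For a toy problem of 100 points the full kernel easily fits in the memory of a single GPU. On larger datasets, GPyTorch's multi-GPU tutorials additionally partition the kernel into chunks that are evaluated one checkpointed piece at a time. The sketch below shows roughly how the training step would be wrapped; checkpoint_kernel is a beta feature whose availability depends on your GPyTorch version, and the split size of 10000 is an arbitrary assumption:

In [ ]:
# Sketch only (version-dependent beta feature): bound per-GPU memory by
# partitioning the kernel into chunks of `checkpoint_size` rows (0 disables partitioning)
checkpoint_size = 10000
with gpytorch.beta_features.checkpoint_kernel(checkpoint_size), \
        gpytorch.settings.max_preconditioner_size(5):
    output = model(train_x)
    loss = -mll(output, train_y)
    loss.backward()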

In [5]:
# Get into evaluation (predictive posterior) mode
model.eval()
likelihood.eval()

# Test points are regularly spaced along [0,1]
# Make predictions by feeding model through likelihood
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    test_x = torch.linspace(0, 1, 51).cuda()
    observed_pred = likelihood(model(test_x))
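
The notebook only visualizes the fit, but since the true function sin(2*pi*x) is known, a quick numeric check is easy to add. A small sketch (my addition, not from the original notebook) computing the RMSE of the posterior mean against the noise-free function:

In [ ]:
# Sketch: RMSE of the posterior mean against the known noise-free function
with torch.no_grad():
    true_y = torch.sin(test_x * (2 * math.pi))
    rmse = torch.sqrt(torch.mean((observed_pred.mean - true_y) ** 2))
    print('Test RMSE: %.3f' % rmse.item())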

In [6]:
with torch.no_grad():
    # Initialize plot
    f, ax = plt.subplots(1, 1, figsize=(4, 3))

    # Get upper and lower confidence bounds
    lower, upper = observed_pred.confidence_region()
    # Plot training data as black stars
    ax.plot(train_x.cpu().numpy(), train_y.cpu().numpy(), 'k*')
    # Plot predictive means as blue line
    ax.plot(test_x.cpu().numpy(), observed_pred.mean.cpu().numpy(), 'b')
    # Shade between the lower and upper confidence bounds
    ax.fill_between(test_x.squeeze().cpu().numpy(), lower.cpu().numpy(), upper.cpu().numpy(), alpha=0.5)
    ax.set_ylim([-3, 3])
    ax.legend(['Observed Data', 'Mean', 'Confidence'])
    plt.savefig('res.pdf')