Stochastic approximation demo.

This notebook briefly demonstrates how stochastic approximation works in GPinv; the objective being approximated is sketched below.

Keisuke Fujii 7th Oct. 2016

We show

  • Simple regression, which can also be done with an ordinary Gaussian process.
  • Regression with a non-Gaussian likelihood.
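
For orientation, the stochastic variational model used below (StVGP) maximizes an evidence lower bound (ELBO) in which the expected log-likelihood is approximated with Monte Carlo samples of the latent function. The notation here is ours, a rough sketch rather than the exact expression used in GPinv:

\mathcal{L}(q) = \mathbb{E}_{q(f)}\left[\log p(Y \mid f)\right] - \mathrm{KL}\left[q(f)\,\|\,p(f)\right]
             \approx \frac{1}{S}\sum_{s=1}^{S} \log p\left(Y \mid f^{(s)}\right) - \mathrm{KL}\left[q(f)\,\|\,p(f)\right],
             \qquad f^{(s)} \sim q(f),

where S corresponds to the num_samples argument passed to StVGP, and the KL term can either be evaluated analytically or estimated with the same samples.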

Import several libraries, including GPinv.


In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import tensorflow as tf
# Import GPinv
import GPinv
# Import GPflow as comparison
import GPflow

# import graphical things
import seaborn as sns

sns.set_context('poster')
sns.set_style('ticks')
import matplotlib
fontprop = matplotlib.font_manager.FontProperties(fname="/usr/share/fonts/truetype/ttf-dejavu//")
matplotlib.rcParams['mathtext.fontset'] = 'custom'
matplotlib.rcParams['mathtext.rm'] = 'DejaVu Sans'
matplotlib.rcParams['mathtext.it'] = 'DejaVu Sans:italic'
matplotlib.rcParams['mathtext.bf'] = 'DejaVu Sans:bold'

current_palette = sns.color_palette()

Fix the random seed so the results are reproducible


In [2]:
rng = np.random.RandomState(0)

Simple regression

Synthetic data


In [3]:
X = np.linspace(0,6,40).reshape(-1,1)
Y = np.sin(X) + rng.randn(40,1)*0.3

GPflow.gpr.GPR


In [4]:
m_gpflow = GPflow.gpr.GPR(X, Y, GPflow.kernels.RBF(1))
rslt = m_gpflow.optimize()
rslt


Out[4]:
      fun: 19.49947035345194
 hess_inv: <3x3 LbfgsInvHessProduct with dtype=float64>
      jac: array([ -3.78487235e-06,   9.04234859e-07,  -2.35666935e-07])
  message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
     nfev: 16
      nit: 11
   status: 0
  success: True
        x: array([ 1.36444855, -0.21198308, -2.20949039])
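
The x array in this result is the optimizer's free (unconstrained) parameter vector. As a quick sanity check, the fitted hyperparameters can also be read from the model itself; this sketch assumes the GPflow 0.x Param interface with a .value attribute:

print(m_gpflow.kern.lengthscales.value)     # RBF lengthscale
print(m_gpflow.kern.variance.value)         # RBF signal variance
print(m_gpflow.likelihood.variance.value)   # Gaussian noise variance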

In [5]:
X_new = np.linspace(-0.1,6.1,40).reshape(-1,1)
f_mu, f_var = m_gpflow.predict_f(X_new)

In [6]:
plt.figure(figsize=(5,3))
plt.fill_between(X_new.flatten(), (f_mu+2*np.sqrt(f_var)).flatten(), (f_mu-2*np.sqrt(f_var)).flatten(), alpha=0.5)
plt.plot(X_new, f_mu)
plt.plot(X, Y, 'go', ms = 5)
plt.xlabel('x')
plt.ylabel('y')

sns.despine()
plt.savefig('figs/simple_regression.pdf')


GPinv.stvgp

In GPinv, we need a custom likelihood in which at least the conditional log-likelihood of the data (logp) is specified.

The likelihood class should inherit from GPinv.likelihoods.Likelihood.


In [7]:
class GaussianLikelihood(GPinv.likelihoods.Likelihood):
    def __init__(self):
        GPinv.likelihoods.Likelihood.__init__(self)
        # variance parameter is assigned as GPinv.param.Param
        self.variance = GPinv.param.Param(1, GPinv.transforms.positive)
    
    def logp(self, F, Y):
        return GPinv.densities.gaussian(Y, F, self.variance)

Model definition.


In [8]:
# StVGP with the stochastic (sampled) KL term
m_gpinv = GPinv.stvgp.StVGP(X, Y, GPinv.kernels.RBF(1, output_dim=1), likelihood=GaussianLikelihood(),
                            num_samples=3)  # Monte Carlo samples per iteration

In [9]:
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.05
learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           20, 0.9)
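
With the default staircase=False, tf.train.exponential_decay gives a learning rate of 0.05 * 0.9 ** (step / 20). As a rough check of how quickly the rate shrinks over the 1500 iterations used below (plain Python, not part of the graph):

for step in [0, 100, 500, 1500]:
    print(step, 0.05 * 0.9 ** (step / 20))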

In [10]:
trainer = tf.train.AdamOptimizer(learning_rate)

In [11]:
# This callback logs the ELBO and shows its progress during the iteration.
from IPython import display

logf = []
def logger(x):
    if (logger.i % 5) == 0:
        obj = -m_gpinv._objective(x)[0]
        logf.append(obj)
        # display
        if (logger.i % 100) ==0:
            plt.clf()
            plt.plot(logf, '-ko', markersize=3, linewidth=1)
            plt.ylabel('ELBO')
            plt.xlabel('iteration')
            display.display(plt.gcf())
            display.clear_output(wait=True)
    logger.i+=1
logger.i = 1

In [12]:
import time
# start time
start_time = time.time()
plt.figure(figsize=(6,3))

# optimization by tf.train
_= m_gpinv.optimize(method=trainer, callback=logger, maxiter=1500, global_step = global_step)

display.clear_output(wait=True)
print('Elapsed time is', time.time()-start_time, ' (s)')


Elapsed time is 7.68377685546875  (s)

In [13]:
logf2 = np.array(logf)

In [14]:
# The GPinv model also has a predict_f method.
f_mu2, f_var2 = m_gpinv.predict_f(X_new)

With Analytic KL approximation
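
For reference, when both the variational posterior q(f) = N(m, S) and the prior p(f) = N(0, K) are Gaussian over the N latent values, the KL term has a closed form and does not need to be sampled (our notation, not taken from the GPinv code):

\mathrm{KL}\left[\mathcal{N}(m, S)\,\|\,\mathcal{N}(0, K)\right]
  = \tfrac{1}{2}\left(\operatorname{tr}(K^{-1}S) + m^{\top}K^{-1}m - N + \log\det K - \log\det S\right)

Setting KL_analytic=True makes StVGP use this closed form, so only the expected log-likelihood is estimated by sampling.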


In [15]:
# With analytic KL
m_gpinv = GPinv.stvgp.StVGP(X, Y, GPinv.kernels.RBF(1, output_dim=1), likelihood=GaussianLikelihood(),
                            KL_analytic=True,
                            num_samples=3)

In [16]:
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.05
learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           20, 0.9)

In [17]:
trainer = tf.train.AdamOptimizer(learning_rate)

In [18]:
import time
# start time
logf = []
start_time = time.time()
plt.figure(figsize=(6,3))

# optimization by tf.train
_= m_gpinv.optimize(method=trainer, callback=logger, maxiter=1500, global_step = global_step)

display.clear_output(wait=True)
print('Elapsed time is', time.time()-start_time, ' (s)')


Elapsed time is 8.405430316925049  (s)

In [19]:
logf3 = np.array(logf)

In [20]:
# The GPinv model also has a predict_f method.
f_mu3, f_var3 = m_gpinv.predict_f(X_new)

Compare results


In [21]:
plt.figure(figsize=(6,4))

plt.plot(X_new, f_mu+2*np.sqrt(f_var), '--', color = 'k')
plt.plot(X_new, f_mu-2*np.sqrt(f_var), '--', color = 'k')
plt.plot(X_new, f_mu, '-k', label='GPR')

plt.plot(X_new, f_mu3+2*np.sqrt(f_var3), '--', color= current_palette[1])
plt.plot(X_new, f_mu3-2*np.sqrt(f_var3), '--', color= current_palette[1])
plt.plot(X_new, f_mu3, '-', label='StVGP analytic KL', color=current_palette[1])

plt.plot(X_new, f_mu2+2*np.sqrt(f_var2), '--', color= current_palette[0])
plt.plot(X_new, f_mu2-2*np.sqrt(f_var2), '--', color= current_palette[0])
plt.plot(X_new, f_mu2, '-', label='StVGP stochastic KL', color=current_palette[0])

plt.plot(X, Y, 'o', ms = 8, color=current_palette[2])

plt.xlabel('x')
plt.ylabel('y')
plt.ylim(-1.8,1.5)

plt.legend(loc='best', fontsize=14)
sns.despine()
plt.tight_layout()
plt.savefig('figs/simple_regression.pdf')



In [22]:
plt.figure(figsize=(6,4.5))
it = list(range(0,1500,5))
plt.plot([0,1500], [-rslt['fun'],-rslt['fun']], '--k')
plt.plot(it,logf3, label='analytic KL',   color=current_palette[1], lw=2)
plt.plot(it,logf2, label='stochastic KL', color=current_palette[0], lw=2)
plt.ylim(-60,0)
plt.xlabel('iteration number')
plt.ylabel('ELBO')
plt.legend(loc='upper right')
plt.tight_layout()

# this is an inset axes over the main axes
a = plt.axes([.6, .35, .3, .25])
plt.plot([1100,1500], [-rslt['fun'],-rslt['fun']], '--k')
plt.plot(it[220:300],logf3[220:300], color=current_palette[1], lw=2)
plt.plot(it[220:300],logf2[220:300], color=current_palette[0], lw=2)
plt.ylim(-23,-18)
plt.xticks([1100,1300,1500])
plt.yticks([-23,-20,-17])

sns.despine()
plt.savefig('figs/iteration_behavior.pdf')



In [23]:
# Fluctuation (standard deviation) of the ELBO trace over the final iterations
print(np.std(logf2[220:300]))  # stochastic KL
print(np.std(logf3[220:300]))  # analytic KL


0.374698543398
0.908186952405