08/05/15 Theano day!

Notes on the Theano day organized in the group by James, Zhenwen and Andreas.

Written by Alessandra Tosi, Fariba Yusefi and Javier Gonzalez


In [1]:
# Import Theano to start! 
import theano
import GPy
from theano import tensor as T
import numpy as np

1. Getting started


In [2]:
# Create a symbolic two-dimensional array (a Theano matrix)
X = T.matrix()

In [3]:
# Sum of the squared elements (T.sum works even when X is empty)
f = T.sum(T.square(X))
f


Out[3]:
Sum.0

In [4]:
# Compile a Theano function with input X and output f
my_func = theano.function([X], f)

In [5]:
# Generate some values for X and evaluate
X_values = np.random.randn(3,4)
my_func(X_values)


Out[5]:
array(12.963304040797144)

In [6]:
# Symbolic gradient of f with respect to X
g = theano.grad(f,X)

In [7]:
# Compile a function returning both the value and the derivatives
mu_new_func = theano.function([X], [f,g])

In [8]:
# Theano derivatives
mu_new_func(X_values)


Out[8]:
[array(12.963304040797144),
 array([[-2.38163752, -2.16098669, -0.87717575,  0.69632923],
        [ 2.54935613, -1.25319369,  0.35672432, -1.84071505],
        [ 0.31138535, -2.65571952,  1.61767688,  4.34798378]])]

In [9]:
# Exact derivatives: d/dX sum(X**2) = 2X
X_values*2


Out[9]:
array([[-2.38163752, -2.16098669, -0.87717575,  0.69632923],
       [ 2.54935613, -1.25319369,  0.35672432, -1.84071505],
       [ 0.31138535, -2.65571952,  1.61767688,  4.34798378]])
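
The two match. A one-line numeric confirmation (our addition, reusing the values above):

# Theano's gradient should equal the analytic derivative 2X
np.allclose(mu_new_func(X_values)[1], X_values*2)   # expected: True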

2. Linear regression
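
We fit y = Xw + noise, with Gaussian noise of standard deviation sigma, by minimizing the negative log likelihood

    -log p(y | X, w, sigma) = (n/2)*log(2*pi) + (n/2)*log(sigma**2) + ||y - Xw||**2 / (2*sigma**2)

which is exactly what the neg_log_lik expression below encodes.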


In [10]:
# Define the elements of the regression
w = T.dvector()
Xw = T.dot(X,w)
y = T.dvector()
error = T.sum(T.square(y-Xw))
sigma = T.dscalar()
neg_log_lik = 0.5*y.size*np.log(2*np.pi) + 0.5*y.size*T.log(sigma**2) + 0.5*error/sigma**2

In [11]:
# Create objective
my_func = theano.function([X, y, sigma, w], [neg_log_lik, theano.grad(neg_log_lik, w), theano.grad(neg_log_lik, sigma)])

In [12]:
# Test with data. Keep the numeric targets in y_values: assigning to y
# here would clobber the symbolic variable y defined above
w_true = np.random.randn(2)
X_values = np.random.randn(200,2)
y_values = np.dot(X_values, w_true) + np.random.randn(200)*0.01

In [13]:
# Evaluate the likelihood and the gradients with respect to w and sigma
my_func(X_values, y_values , 0.02, np.array([1,1]))


Out[13]:
[array(1740553.6503363969),
 array([ 1130920.6618946 ,   756385.18434244]),
 array(-174105226.72308418)]
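
With the value and both gradients in hand, they can be fed straight into an off-the-shelf optimizer. A minimal sketch (our addition, not part of the session; the theta packing convention is ours), assuming scipy is available:

from scipy.optimize import minimize

def objective(theta):
    # theta packs sigma first, then w
    sigma_val, w_val = theta[0], theta[1:]
    nll, grad_w, grad_sigma = my_func(X_values, y_values, sigma_val, w_val)
    return nll, np.hstack([grad_sigma, grad_w])

# bound sigma away from zero so log(sigma**2) stays finite
res = minimize(objective, x0=np.array([1., 0., 0.]), jac=True,
               method='L-BFGS-B',
               bounds=[(1e-6, None), (None, None), (None, None)])
res.x   # should be close to [0.01, w_true[0], w_true[1]]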

3. Team work (Alessandra, Fariba and Javier): implementing an RBF kernel in GPy using Theano


In [14]:
# elements of the kernel
X   = T.matrix()
Z   = T.matrix()
len_scale  = T.dscalar()
sigma2  = T.dscalar()

In [15]:
# Create the RBF kernel: broadcasting X to shape (n,1,d) against Z at
# (1,m,d) and summing over the last axis gives all pairwise scaled
# squared distances at once
r2 = ((X[:,None,:]/len_scale - Z[None,:,:]/len_scale)**2).sum(2)
rbf_kernel = sigma2*T.exp(-0.5*r2)

In [16]:
# Create the theano functions for r2 and the kernel
r2_eval   = theano.function([X,Z,len_scale],[r2])
kern_eval = theano.function([X,Z,len_scale,sigma2],[rbf_kernel])

In [17]:
# Generate some data to evaluate r2 and the kernel
X_val = np.random.randn(4,2)
Z_val = np.random.randn(6,2)
sigma2_val  = 1
len_scale_val = 2

In [18]:
# Distances evaluation! The argument order must match the one given to theano.function
r2_eval(X_val,Z_val,len_scale_val)


Out[18]:
[array([[ 0.11363081,  0.02207483,  0.26821606,  0.33116141,  0.28348765,
          1.18291004],
        [ 0.47115938,  0.14625441,  0.84486132,  0.57121556,  0.13850336,
          0.89131393],
        [ 0.21376622,  0.34201811,  0.04655479,  0.6857568 ,  0.98337181,
          2.25323581],
        [ 0.3233845 ,  0.7621669 ,  0.23656762,  0.48814614,  1.35426331,
          2.07027349]])]
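
As a sanity check (our addition), the same matrix can be obtained from scipy's pairwise distances on the scaled inputs:

from scipy.spatial.distance import cdist
# squared Euclidean distance between the lengthscale-scaled inputs
r2_scipy = cdist(X_val/len_scale_val, Z_val/len_scale_val, 'sqeuclidean')
np.allclose(r2_eval(X_val, Z_val, len_scale_val)[0], r2_scipy)   # expected: True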

In [19]:
# Kernel evaluation
kern_eval(X_val,Z_val,len_scale_val,sigma2_val)


Out[19]:
[array([[ 0.94476845,  0.98902328,  0.87449559,  0.84740147,  0.86784355,
          0.55352131],
        [ 0.7901127 ,  0.92948259,  0.6554517 ,  0.75155733,  0.93309181,
          0.64040341],
        [ 0.89863071,  0.84281394,  0.97699143,  0.70972451,  0.61159444,
          0.32412763],
        [ 0.85070297,  0.68312088,  0.88844387,  0.78343039,  0.50807223,
          0.35517781]])]

In [20]:
# We check that it matches the equivalent GPy kernel

In [21]:
kernelGPy = GPy.kern.RBF(2, variance = sigma2_val, lengthscale=len_scale_val)

In [22]:
kernelGPy.K(X_val,Z_val)


Out[22]:
array([[ 0.94476845,  0.98902328,  0.87449559,  0.84740147,  0.86784355,
         0.55352131],
       [ 0.7901127 ,  0.92948259,  0.6554517 ,  0.75155733,  0.93309181,
         0.64040341],
       [ 0.89863071,  0.84281394,  0.97699143,  0.70972451,  0.61159444,
         0.32412763],
       [ 0.85070297,  0.68312088,  0.88844387,  0.78343039,  0.50807223,
         0.35517781]])

So, it works!
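
For a stricter comparison than eyeballing the printed arrays, a quick numeric check (our addition):

np.allclose(kern_eval(X_val, Z_val, len_scale_val, sigma2_val)[0],
            kernelGPy.K(X_val, Z_val))   # expected: True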


In [24]:
# GPy hands kernels dL_dK, the gradient of the objective with respect to K.
# Differentiating the scalar h = sum(dL_dK * K) gives, by the chain rule,
# the gradients of the objective with respect to the kernel parameters
dL_dK = T.matrix()
h = T.sum(dL_dK * rbf_kernel)

In [25]:
# Define gradients with respect to sigma2
grads_wrt_sigma2 = theano.grad(h,sigma2)
grads_wrt_sigma2_eval = theano.function([X,Z,len_scale,sigma2,dL_dK],grads_wrt_sigma2)

In [26]:
# Define gradients with respect to the lengthscale
grads_wrt_lengthscale = theano.grad(h,len_scale)
grads_wrt_lengthscale_eval = theano.function([X,Z,len_scale,sigma2,dL_dK],grads_wrt_lengthscale)

In [28]:
# Define gradients with respect to X
grads_wrt_X= theano.grad(h,X)
grads_wrt_X_eval = theano.function([X,Z,len_scale,sigma2,dL_dK],grads_wrt_X)

In [29]:
# All gradients together
grads_all_eval = theano.function([X,Z,len_scale,sigma2,dL_dK], [grads_wrt_sigma2, grads_wrt_lengthscale, grads_wrt_X])
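
A quick finite-difference check on one of these gradients (our addition; the random dL_dK weighting is arbitrary):

dL_dK_val = np.random.randn(4, 6)
eps = 1e-6
analytic = grads_wrt_sigma2_eval(X_val, Z_val, len_scale_val, sigma2_val, dL_dK_val)
numeric = (np.sum(dL_dK_val * kern_eval(X_val, Z_val, len_scale_val, sigma2_val + eps)[0])
         - np.sum(dL_dK_val * kern_eval(X_val, Z_val, len_scale_val, sigma2_val - eps)[0])) / (2*eps)
np.allclose(analytic, numeric)   # expected: True up to finite-difference error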

In [30]:
# We test the result in a simple GPy model
import GPy
from matplotlib import pyplot as plt

In [31]:
# Some data
x = np.linspace(-np.pi, np.pi, 201)[:,None]
y = np.sin(x) + np.random.rand(201)[:,None]

In [33]:
# Fit and plot a standard GPy model as a baseline
%pylab inline
model = GPy.models.GPRegression(x,y)
model.optimize()
model.plot()


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f']
`%matplotlib` prevents importing * from pylab and numpy
Out[33]:
{'dataplot': [<matplotlib.lines.Line2D at 0x5d0c110>],
 'gpplot': [[<matplotlib.lines.Line2D at 0x5d0a090>],
  [<matplotlib.patches.Polygon at 0x5d0a290>],
  [<matplotlib.lines.Line2D at 0x5d0a7d0>],
  [<matplotlib.lines.Line2D at 0x5d0abd0>]]}

In [81]:
## A GPy kernel class built on Theano (that uses our grads_all_eval)

class TheanoKern(GPy.kern.Kern):
    def __init__(self, input_dim, variance=1., lengthscale=1.):
        GPy.kern.Kern.__init__(self, input_dim, active_dims=None, name='theanokern')
        self.var = GPy.core.Param('variance', variance)
        self.ls = GPy.core.Param('lengthscale', lengthscale)
        self.link_parameters(self.var, self.ls)
        # Add here a function which initializes all the theano symbolic functions
        
    def K(self, X, X2=None):
        if X2 is None:
            X2 = X
        return kern_eval(X, X2, self.ls[0], self.var[0])[0]

    def Kdiag(self, X):
        # wasteful (computes the full matrix), but fine for a demo;
        # for this RBF the diagonal is simply the variance
        return np.diag(self.K(X))

    def update_gradients_full(self, dL_dK, X, X2=None):
        if X2 is None:
            X2 = X
        # dX is computed too, but GPy asks for it separately via gradients_X
        dvar, dl, dX = grads_all_eval(X, X2, self.ls[0], self.var[0], dL_dK)
        self.var.gradient = dvar
        self.ls.gradient = dl

    def gradients_X(self, dL_dK, X, X2=None):
        if X2 is None:
            X2 = X
        return grads_wrt_X_eval(X, X2, self.ls[0], self.var[0], dL_dK)

    def update_gradients_diag(self, dL_dKdiag, X):
        pass   # not needed for this demo

In [70]:
# Create our theano kernel
k = TheanoKern(2)
k.K(X_val)


Out[70]:
array([[ 1.        ,  0.7246144 ,  0.63578887,  0.20724078],
       [ 0.7246144 ,  1.        ,  0.22850941,  0.04283463],
       [ 0.63578887,  0.22850941,  1.        ,  0.3758315 ],
       [ 0.20724078,  0.04283463,  0.3758315 ,  1.        ]])

In [83]:
# Build a regression model that uses our Theano kernel
m = GPy.models.GPRegression(x, y, kernel=TheanoKern(1))

In [84]:
# Check the gradients numerically
m.checkgrad(1)


                     Name                      |     Ratio     |  Difference   |  Analytical   |   Numerical   |   dF_ratio    
-------------------------------------------------------------------------------------------------------------------------------
 GP_regression.theanokern.variance[[0]]        |   1.000000    |   0.000000    |   1.896246    |   1.896246    |     2e-08     
 GP_regression.theanokern.lengthscale[[0]]     |   1.000000    |   0.000000    |   -5.847597   |   -5.847597   |     6e-08     
 GP_regression.Gaussian_noise.variance[[0]]    |   1.000000    |   0.000000    |   56.397305   |   56.397305   |     6e-07     
Out[84]:
True
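
As a last check (our addition), the model can be optimized and compared against GPy's built-in RBF kernel; the two should reach essentially the same fit:

m.optimize()
m_rbf = GPy.models.GPRegression(x, y, kernel=GPy.kern.RBF(1))
m_rbf.optimize()
m.log_likelihood(), m_rbf.log_likelihood()   # should agree closely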

Done! :)

