In [ ]:
# The old notebook became too confusing, so this is a fresh start.
# Checking singular values and singular vectors in NumPy via the complex-variable trick (CVT)
import numpy as np
# General Set-up
N = 2
A = 0.1 * np.random.randn(N, N) + np.diag(np.arange(1, N+1))
B = np.random.randn(N, N)
I = np.eye(N)
dA = np.random.randn(N, N)
dB = np.random.randn(N, N)
bC = np.random.randn(N, N)
eps = 1e-20
epsi = 1 / eps
Ae = A + 1j*eps*dA
Be = B + 1j*eps*dB # not needed here; only the single input A is perturbed
# SVD Set-up
u, s, vT = np.linalg.svd(A)
# Complex part
De, _ = np.linalg.eig(Ae.dot(Ae.T))  # perturbed eigenvalues of Ae Ae^T = squared singular values
Du, _, _ = np.linalg.svd(Ae)         # perturbed left singular vectors (already sorted by svd)
idx = De.argsort()[::-1]             # sort the eigenvalues descending to match the SVD ordering
De = De[idx]
D = np.real(De)                      # used below to compare s against sqrt(D)
# forward propagation
dS = np.diag(I * u.T.dot(dA).dot(vT.T))  # dS_i = u_i^T dA v_i
E = np.outer(np.ones(N), s**2) - np.outer(s**2, np.ones(N))  # E_ij = s_j^2 - s_i^2 (eigenvalue gaps of A A^T)
F = 1 / (E + np.eye(N)) - np.eye(N)                          # F_ij = 1/(s_j^2 - s_i^2), zero diagonal
dU = u.dot(F * (u.T.dot(dA).dot(vT.T).dot(np.diag(s)) + np.diag(s).dot(vT).dot(dA.T).dot(u)))
# backward gradients
bS = np.random.randn(N) # gradients wrt singular values
bA = u.dot(np.diag(bS)).dot(vT) # backpropagated gradient wrt the matrix A
print('singular value')
print('svd error: ', (np.linalg.norm(s-np.sqrt(D))))
# Forward Check based on complex matrices
print('CVT error (vals): ', (np.linalg.norm(2*s*dS - epsi*np.imag(De))))
print('CVT error (vecs): ', (np.linalg.norm(dU - epsi*np.imag(Du)))) # sensitive to the sign/phase convention of the complex SVD
# Backward Check, these are essentially two traces!!
print('adj error: ',np.sum(dA*bA)-np.sum(dS*bS))
# I should be able to use the same identity for my testing purposes!!!
# trace(bC.T * dC) = trace(bA.T * dA) + trace(bB.T * dB)
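In [ ]:
# Added sketch (not part of the original run): the trace identity above,
# illustrated on a plain matrix product C = A.dot(B), where both the forward
# perturbation and the adjoints are known in closed form. If the backward
# formulas are right, the two sides agree to machine precision.
import numpy as np
n = 4
A1, B1 = np.random.randn(n, n), np.random.randn(n, n)
dA1, dB1 = np.random.randn(n, n), np.random.randn(n, n)   # input perturbations
bC1 = np.random.randn(n, n)                               # upstream gradient wrt C
dC1 = dA1.dot(B1) + A1.dot(dB1)                           # forward-mode perturbation of C = A B
bA1, bB1 = bC1.dot(B1.T), A1.T.dot(bC1)                   # reverse-mode adjoints
lhs = np.sum(bC1 * dC1)                                   # trace(bC.T @ dC)
rhs = np.sum(bA1 * dA1) + np.sum(bB1 * dB1)               # trace(bA.T @ dA) + trace(bB.T @ dB)
print('trace identity error:', lhs - rhs)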
In [ ]:
import numpy as np
# Checking eigenvectors and eigenvalues in Numpy
np.random.seed(10)
# General Set-up
N = 5
A = 0.1 * np.random.randn(N, N) + np.diag(np.arange(1, N+1))
#A = A.dot(A.T)
B = np.random.randn(N, N)
I = np.eye(N)
dA = np.random.randn(N, N)
dB = np.random.randn(N, N)
bC = np.random.randn(N, N)
eps = 1e-20
epsi = 1 / eps
Ae = A + 1j*eps*dA
Be = B + 1j*eps*dB
# EIGEN Set-up
De, Ue = np.linalg.eig(Ae)
D = np.real(De)
U = np.real(Ue)
# rescale the perturbed eigenvectors so that the implied dU has zero diagonal in the eigenbasis (gauge fixing)
Ue = Ue.dot(np.diag(1 / np.diag(np.linalg.inv(U).dot(Ue))))
E = np.outer(np.ones(N), D) - np.outer(D, np.ones(N))  # E_ij = D_j - D_i
F = 1 / (E + np.eye(N)) - np.eye(N)                    # F_ij = 1/(D_j - D_i), zero diagonal
P = np.linalg.inv(U).dot(dA.dot(U))
dD = np.eye(N) * P                                     # eigenvalue perturbations (diagonal of P)
dU = U.dot(F*P)                                        # eigenvector perturbations
#dD, dU = forward(A, dA)
bD = np.diag(np.random.randn(N)) # random upstream gradient wrt the eigenvalues
bU = np.random.randn(N,N) # random upstream gradient wrt the eigenvectors
bD = bD + F * (U.T.dot(bU))
bA = np.linalg.inv(U.T).dot(bD.dot(U.T)) # adjoint: bA = U^{-T} (bD + F*(U^T bU)) U^T
print('eigenvalues and eigenvectors')
print('CVT error (vals): ', np.linalg.norm(np.diag(dD) - epsi*np.imag(De)))
print('CVT error (vecs): ', np.linalg.norm(dU - epsi*np.imag(Ue)))
print('adj error (backward): ', np.sum(dA*bA)-np.sum(dD*bD)-np.sum(dU*bU))
#print('CVT error (vecs): ', np.linalg.norm(np.abs(dU) - np.abs(epsi*np.imag(Ue))))
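In [ ]:
# Added aside (not in the original run): the complex-variable trick used above,
# reduced to one dimension. For an analytic f,
# f(x + i*eps*dx) = f(x) + i*eps*f'(x)*dx + O(eps^2),
# so imag(f(x + i*eps*dx))/eps recovers the directional derivative without
# subtractive cancellation, which is why eps can be as small as 1e-20.
import numpy as np
eps = 1e-20
x, dx = 0.7, 1.3
f = lambda z: np.exp(z) * np.sin(z)
analytic = np.exp(x) * (np.sin(x) + np.cos(x)) * dx   # f'(x) * dx
cvt = np.imag(f(x + 1j * eps * dx)) / eps             # complex-step estimate
print('complex-step error:', cvt - analytic)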
In [ ]:
dU
In [ ]:
# essentials for eig in PyTorch
import torch
N = 5
A = torch.randn(N,N).double()
L = A.mm(A.t())
I = torch.eye(N).double()
dA = torch.randn(N, N).double()
dL = dA.mm(A.t()) + A.mm(dA.t())            # directional derivative of L = A A^T
vals, vecs = torch.eig(L, eigenvectors=True)
vals = vals[:,0]                            # torch.eig returns (real, imag) columns; L is symmetric, keep the real part
E = vals.expand(N,N) - vals.expand(N,N).t() # E_ij = vals_j - vals_i
F = 1 / (E + I) - I                         # F_ij = 1/(vals_j - vals_i), zero diagonal
P = torch.inverse(vecs).mm(dL.mm(vecs))
dvals = I * P                               # eigenvalue perturbations (diagonal of P)
dvecs = vecs.mm(F*P)                        # eigenvector perturbations
# backward pass
bvals = torch.randn(N).diag().double()      # random upstream gradient wrt the eigenvalues (diagonal matrix)
bvecs = torch.randn(N,N).double()           # random upstream gradient wrt the eigenvectors
med = bvals + F * (vecs.t().mm(bvecs))
bL = torch.inverse(vecs.t()).mm(med.mm(vecs.t()))   # gradient wrt L
bA = bL.mm(A) + bL.t().mm(A)                        # chain rule through L = A A^T
# check: the adjoint trace identity, should be ~0
torch.sum(bA * dA) - torch.sum(dvals * bvals) - torch.sum(dvecs * bvecs)
In [5]:
# essentials for svd in PyTorch
import torch
M = 6
N = 5
A = torch.randn(M,N).double()
dA = torch.randn(M, N).double()
dL = dA.mm(A.t()) + A.mm(dA.t())            # directional derivative of L = A A^T
vecs, vals, v = torch.svd(A)
vals = vals**2                              # eigenvalues of L are the squared singular values
s = vals.size(0)
I = torch.eye(s).double()
E = vals.expand(s,s) - vals.expand(s,s).t() # E_ij = vals_j - vals_i
F = 1 / (E + I) - I                         # F_ij = 1/(vals_j - vals_i), zero diagonal
P = vecs.t().mm(dL.mm(vecs))                # vecs are orthonormal, so the transpose replaces the inverse
dvals = I * P
dvecs = vecs.mm(F*P)
# backward pass
bvals = torch.randn(s).diag().double()      # random upstream gradient wrt the eigenvalues (diagonal matrix)
bvecs = torch.randn(vecs.size()).double()   # random upstream gradient wrt the eigenvectors
med = bvals + F * (vecs.t().mm(bvecs))
bL = vecs.mm(med.mm(vecs.t()))              # gradient wrt L
bA = bL.mm(A) + bL.t().mm(A)                # chain rule through L = A A^T
# check: the adjoint trace identity, should be ~0
torch.sum(bA * dA) - torch.sum(dvals * bvals) - torch.sum(dvecs * bvecs)
Out[5]:
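In [ ]:
# Added numerical cross-check (not in the original run): compare the analytic
# dvals above with a central finite difference of the squared singular values.
# Assumes the singular values of A stay distinct and keep their ordering for
# the small step h below.
h = 1e-6
_, vals_p, _ = torch.svd(A + h * dA)
_, vals_m, _ = torch.svd(A - h * dA)
dvals_fd = (vals_p**2 - vals_m**2) / (2 * h)
print((dvals_fd - dvals.diag()).abs().max())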
In [6]:
from torch.autograd import Variable
from dpp_nets.my_torch.linalg import custom_decomp
A = Variable(A, requires_grad=True)
vals, vecs = custom_decomp()(A)
torch.autograd.backward([vals, vecs],[bvals.diag(), bvecs])
torch.sum(A.grad.data == bA) # counts entries where autograd reproduces the hand-rolled bA exactly
Out[6]:
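In [ ]:
# Added variant of the check above (not in the original run): exact elementwise
# equality can be brittle if custom_decomp orders its operations differently,
# so a tolerance-style comparison is the safer assertion.
print((A.grad.data - bA).abs().max())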
In [ ]:
# This is an important comparison of my three gradient methods
# DON'T DELETE!!
# 1) Numpy function (Low-rank DPP-Factorization)
# 2) Allinone = Numpy gradient in Pytorch
# 3) Back-propagating through singular value decomposition
import numpy as np
from dpp_nets.dpp.score_dpp import score_dpp
import torch
from torch.autograd import Variable
# 1) NUMPY GRADIENT (THIS IS CONFIRMED TO BE THE SAME AS LOW-RANK PAPER)
embd = np.random.randn(10,10)
subset = np.array([1,1,1,0,0,1,0,1,0,0], dtype=float)
grad_numpy = score_dpp(embd, subset)
print(grad_numpy)
# 2) IN PYTORCH
embd = torch.DoubleTensor(embd)
vecs, vals, _ = torch.svd(embd)
vals.pow_(2)
subset = torch.DoubleTensor(subset)
def backward(kernel, subset):
    dtype = kernel.type()
    n, kernel_dim = kernel.size()
    subset_sum = subset.long().sum()
    grad_kernel = torch.zeros(kernel.size()).type(dtype)
    if subset_sum:
        # auxiliary: P selects the rows indexed by the subset
        P = torch.eye(n).masked_select(subset.expand(n,n).t().byte()).view(subset_sum, -1).type(dtype)
        subembd = P.mm(kernel)
        submatrix = subembd.mm(subembd.t())
        submatinv = torch.inverse(submatrix)
        subgrad = 2 * submatinv.mm(subembd)
        subgrad = P.t().mm(subgrad)
        grad_kernel.add_(subgrad)
    # Gradient from the whole L matrix
    K = kernel.t().mm(kernel) # not L!
    I_k = torch.eye(kernel_dim).type(dtype)
    I = torch.eye(n).type(dtype)
    inv = torch.inverse(I_k + K)
    B = I - kernel.mm(inv).mm(kernel.t())
    grad_from_full = 2 * B.mm(kernel)
    grad_kernel.sub_(grad_from_full)
    return grad_kernel
grad_pytorch = backward(embd, subset)
print(grad_pytorch)
grad_numpy - grad_pytorch.numpy()
# 3) GRADIENT WITH SVD BACKPROPAGATION
def backward_1(vals, vecs, subset):
    # Set-up
    dtype = vals.type()
    n = vecs.size(0)
    n_vals = vals.size(0)
    subset_sum = subset.long().sum()
    grad_vals = 1 / vals
    grad_vecs = torch.zeros(n, n_vals).type(dtype)
    if subset_sum:
        # auxiliary: P selects the rows indexed by the subset
        matrix = vecs.mm(vals.diag()).mm(vecs.t())
        P = torch.eye(n).masked_select(subset.expand(n,n).t().byte()).view(subset_sum, -1).type(dtype)
        submatrix = P.mm(matrix).mm(P.t())
        subinv = torch.inverse(submatrix)
        Pvecs = P.mm(vecs)
        grad_vals += Pvecs.t().mm(subinv).mm(Pvecs).diag()
        grad_vecs += P.t().mm(subinv).mm(Pvecs).mm(vals.diag())
    return grad_vals, grad_vecs
def backward_2(grad_vals, grad_vecs, mat, vals, vecs):
    # unpack
    N = grad_vals.size(0)
    # auxiliary
    I = grad_vecs.new(N,N).copy_(torch.eye(N))
    F = vals.expand(N,N) - vals.expand(N,N).t()
    F.add_(I).reciprocal_().sub_(I)
    # gradient
    grad_mat = grad_vals.diag() + F * (vecs.t().mm(grad_vecs)) # intermediate variable
    grad_mat = vecs.mm(grad_mat.mm(vecs.t())) # this is the gradient wrt L
    grad_mat = grad_mat.mm(mat) + grad_mat.t().mm(mat) # this is the grad wrt A
    return grad_mat
grad_vals, grad_vecs = backward_1(vals, vecs, subset)
backward_2(grad_vals, grad_vecs, embd, vals, vecs)
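In [ ]:
# Added follow-up (not in the original run): put the three results side by side.
# Methods 1 and 2 are compared above; this also prints the deviation of the
# SVD-backprop route (method 3), assuming all three target the same DPP score.
grad_svd = backward_2(grad_vals, grad_vecs, embd, vals, vecs)
print('method 1 vs 2:', np.abs(grad_numpy - grad_pytorch.numpy()).max())
print('method 2 vs 3:', (grad_pytorch - grad_svd).abs().max())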
In [ ]:
from dpp_nets.my_torch.DPP import AllInOne
import torch
from torch.autograd import Variable
A = Variable(torch.randn(10,10).double(), requires_grad=True)
transform = AllInOne()
subset = transform(A)
subset.reinforce(1) # REINFORCE with a dummy reward of 1
subset.sum().backward()
print(A.grad.data)
print(backward(A.data, subset.data)) # yes it works correctly
In [1]:
## Marginal Trainer
# Implement Gradient for Inverse
# in order to check if my gradient is working correctly
import torch
from torch.autograd import Variable, Function
from dpp_nets.my_torch.linalg import custom_decomp
class var_inverse(Function):

    def forward(self, A):
        C = torch.inverse(A)
        self.save_for_backward(C)
        return C

    def backward(self, grad_C):
        C, = self.saved_tensors
        grad_A = -C.t().mm(grad_C).mm(C.t())  # adjoint of the inverse: -A^{-T} grad_C A^{-T}
        return grad_A
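In [ ]:
# Added sanity check for var_inverse (not in the original run): the same trace
# identity used earlier, trace(bC.T * dC) = trace(bA.T * dA), with the
# forward-mode perturbation dC = -A^{-1} dA A^{-1} written out directly.
n = 5
A0 = torch.randn(n, n).double()
dA0 = torch.randn(n, n).double()
bC0 = torch.randn(n, n).double()
Av = Variable(A0, requires_grad=True)
C = var_inverse()(Av)
C.backward(bC0)
Ainv = torch.inverse(A0)
dC0 = -Ainv.mm(dA0).mm(Ainv)                     # analytic forward-mode perturbation
print(torch.sum(bC0 * dC0) - torch.sum(Av.grad.data * dA0))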
In [76]:
# Set-up
# backpropagation through singular values works!
M = 4
N = 6
data = torch.randn(M, N).double()
identity = Variable(torch.eye(M).double())
# short-cut (using SVD)
Vs = Variable(data, requires_grad=True)
vals, vecs = custom_decomp()(Vs)
Ltemp = vecs.mm(vals.diag()).mm(vecs.t())
K = identity - vecs.mm((1 / (vals + 1)).diag()).mm(vecs.t()) # K = I - (L + I)^{-1}
print("K is :", K)
#K.diag().backward(torch.arange(0, M).double())
#print("Grad is: ", Vs.grad.data)
# conventional (long-way)
Vl = Variable(data, requires_grad=True)
L = Vl.mm(Vl.t())
K = identity - var_inverse()((L + identity))
print("K is: ", K)
K.diag().backward(torch.arange(0, M).double())  # populate Vl.grad so it can be compared below
#print("Grad is: ", Vl.grad.data)
In [47]:
# even better short-cut (when only interested in marginals)
Vss = Variable(data, requires_grad=True)
vals, vecs = custom_decomp()(Vss)
K = vecs.mm((1 / (vals + 1)).diag()).mm(vecs.t()) # this is (L + I)^{-1}, not really K
marginals = (1 - K.diag()).diag()
print("Marginals are: ", marginals)
marginals.backward(torch.arange(0, M).diag().double())
print("Grad is:", Vss.grad.data)
# Deviation between the short-cut and the conventional gradient
(Vss.grad.data - Vl.grad.data).abs().sum() / (M * N) # mean absolute deviation per entry
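In [ ]:
# Added finite-difference check (not in the original run): perturb one entry of
# the data and compare the change in sum_i i * marginal_i with the matching
# entry of Vss.grad. Relies on custom_decomp exactly as used above and on the
# arange(0, M) weighting from the backward call.
h = 1e-6
w = torch.arange(0, M).double()
def weighted_marginals(mat):
    vals_, vecs_ = custom_decomp()(Variable(mat))
    Kinv = vecs_.mm((1 / (vals_ + 1)).diag()).mm(vecs_.t())   # (L + I)^{-1}
    return torch.sum(w * (1 - Kinv.diag().data))
pert = data.clone()
pert[0, 0] += h
plus = weighted_marginals(pert)
pert[0, 0] -= 2 * h
minus = weighted_marginals(pert)
print((plus - minus) / (2 * h) - Vss.grad.data[0, 0])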
In [7]:
from torch.autograd import Variable, Function
from dpp_nets.my_torch.linalg import custom_decomp
class new_custom_decomp(Function):
    """
    This is a "strange" but useful decomposition.
    It takes in a matrix A of size m x n.
    It returns the eigenvectors (vecs) and
    eigenvalues (vals) of the matrix
    L = A.mm(A.t()), but without actually
    computing L.
    Any square or rectangular shape is allowed
    for A. :-)
    """

    def __init__(self, some=False):
        self.some = some

    def forward(self, mat):
        vecs, vals, _ = torch.svd(mat, self.some)
        vals.pow_(2)  # eigenvalues of L are the squared singular values of A
        self.save_for_backward(mat, vals, vecs)
        return vals, vecs
    def backward(self, grad_vals, grad_vecs):
        # unpack
        mat, vals, vecs = self.saved_tensors
        N = vals.size(0)
        # auxiliary (note: assumes vecs has as many columns as there are vals,
        # i.e. some=True or a square input)
        I = grad_vecs.new(N,N).copy_(torch.eye(N))
        F = vals.expand(N,N) - vals.expand(N,N).t()
        F.add_(I).reciprocal_().sub_(I)
        # gradient
        grad_mat = grad_vals.diag() + F * (vecs.t().mm(grad_vecs)) # intermediate variable
        grad_mat = vecs.mm(grad_mat.mm(vecs.t())) # this is the gradient wrt L
        grad_mat = grad_mat.mm(mat) + grad_mat.t().mm(mat) # this is the grad wrt A
        return grad_mat
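In [ ]:
# Added consistency check (not in the original run): for a square input the new
# Function should reproduce the gradient of the custom_decomp imported earlier,
# given identical upstream gradients. The square case sidesteps the
# some=True/False shape question, and it assumes custom_decomp is also built on
# torch.svd (as the exact-match test in In [6] suggests).
n = 5
A0 = torch.randn(n, n).double()
gvals = torch.randn(n).double()
gvecs = torch.randn(n, n).double()
A1 = Variable(A0.clone(), requires_grad=True)
v1, U1 = new_custom_decomp()(A1)
torch.autograd.backward([v1, U1], [gvals, gvecs])
A2 = Variable(A0.clone(), requires_grad=True)
v2, U2 = custom_decomp()(A2)
torch.autograd.backward([v2, U2], [gvals, gvecs])
print((A1.grad.data - A2.grad.data).abs().max())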
In [17]:
M = 6
N = 4
A = torch.randn(M, N)
L = A.mm(A.t())
vecs, vals, _ = torch.svd(A, some=False)
print(vecs.size(), vals.size())
vals = vals**2
# torch.svd returns only min(M, N) singular values; pad with zeros to match the full M x M eigenbasis
vals = torch.cat([vals, torch.zeros(M - N)])
In [18]:
print(L)
print(vecs.mm(vals.diag()).mm(vecs.t()))  # should reproduce L
In [ ]: