where $\|g_i\|_2 \leq G$
In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
USE_COLAB = False
if not USE_COLAB:
    plt.rc("text", usetex=True)
if USE_COLAB:
    !pip install git+https://github.com/amkatrutsa/liboptpy
import liboptpy.unconstr_solvers as methods
import liboptpy.step_size as ss
m, n = 500, 100
A = np.random.randn(m, n)
x_true = np.random.randn(n)
b = A.dot(x_true)
In [2]:
f = lambda x: np.linalg.norm(A.dot(x) - b, 1)
subgrad = lambda x: A.T.dot(np.sign(A.dot(x) - b))
alpha = 1e-3
s = 1e-1
sg_methods = {
    "SM 1/k": methods.fo.SubgradientMethod(f, subgrad, ss.InvIterStepSize()),
    "SM fixed={}".format(alpha): methods.fo.SubgradientMethod(f, subgrad, ss.ConstantStepSize(alpha)),
    "SM scaled fix, s={}".format(s): methods.fo.SubgradientMethod(f, subgrad, ss.ScaledConstantStepSize(s)),
}
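The solvers above run the subgradient iteration $x_{k+1} = x_k - \alpha_k g_k$, $g_k \in \partial f(x_k)$, with different step-size rules. A minimal standalone sketch of this iteration with the $1/k$ rule (illustrative only, not liboptpy's implementation):
In [ ]:
def subgradient_descent(f, subgrad, x0, max_iter):
    # Minimal sketch of the subgradient method with alpha_k = 1/k
    # (illustration only, not liboptpy's implementation).
    x = x0.copy()
    history = [f(x)]
    for k in range(1, max_iter + 1):
        x = x - subgrad(x) / k
        history.append(f(x))
    return x, history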
In [3]:
x0 = np.random.randn(n)
max_iter = 500
In [4]:
for name in sg_methods:
    _ = sg_methods[name].solve(x0=x0, max_iter=max_iter)
In [5]:
plt.figure(figsize=(10, 8))
for name in sg_methods:
    plt.semilogy([f(x) for x in sg_methods[name].get_convergence()], label=name)
plt.legend(fontsize=20)
plt.xlabel(r"Number of iterations, $k$", fontsize=26)
plt.ylabel(r"Objective, $f(x_k)$", fontsize=26)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.grid(True)
Consider the following ODE
$$ \frac{dx}{dt} = -f'(x(t)) $$ and after discretization
$$ \frac{x_{k+1} - x_k}{\alpha} = -f'(x_k), $$ where $\alpha = t_{k+1} - t_k$ is the grid step and $x_k \equiv x(t_k)$.
From this it follows that
$$ x_{k+1} = x_k - \alpha f'(x_k), $$ which is exactly the gradient descent method.
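A minimal toy sketch (illustrative values only) showing that this forward-Euler scheme is literally gradient descent on a simple quadratic:
In [ ]:
# Forward-Euler discretization of dx/dt = -f'(x) for the toy objective f(x) = 0.5 * x^T Q x.
Q = np.diag([1., 10.])
grad_toy = lambda x: Q.dot(x)
alpha_toy = 0.05                              # grid step = gradient descent step size
x_k = np.array([1., 1.])
for _ in range(100):
    x_k = x_k - alpha_toy * grad_toy(x_k)     # x_{k+1} = x_k - alpha * f'(x_k)
print(0.5 * x_k.dot(Q.dot(x_k)))              # objective after 100 steps, close to 0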
Alternatively, after discretization in a backward (implicit) manner we get
$$ \frac{x_{k+1} - x_k}{\alpha} = -f'({\color{red}{x_{k+1}}}). $$ After some re-arrangements,
\begin{align*} & \left(\frac{1}{2\alpha} \|u - x_k\|_2^2 + f(u) \right)'(x_{k+1}) = 0 \\ & x_{k+1} = \arg\min_u \left(f(u) + \frac{1}{2\alpha} \|u - x_k\|_2^2\right) = prox_{\alpha f}(x_k) \end{align*}
Example: this gives a method to solve a positive semi-definite linear system given a Cholesky factorization of $A + \epsilon I$ for some $\epsilon$. Consider
$$ f(x) = \frac{1}{2}x^{\top}Ax - b^{\top}x \to \min_x, $$ where $A \in \mathbb{S}^n_+$. Then
\begin{align*} prox_{\alpha f} (x_k) & = \arg\min_u \left(\frac{1}{2}u^{\top}Au - b^{\top}u + \frac{1}{2\alpha} \|u - x_k\|_2^2\right) \\ & = \left(A + (1/ \alpha) I\right)^{-1}\left(b + (1 / \alpha)x_k\right) \\ & = x_k + (A + \epsilon I)^{-1}(b - Ax_k), \qquad \epsilon = 1/\alpha. \end{align*}
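A quick numerical sanity check of the two closed-form expressions above, on a small random positive definite $A$ (illustrative data only):
In [ ]:
# Check that (A + (1/alpha) I)^{-1}(b + (1/alpha) x) equals x + (A + eps I)^{-1}(b - A x) with eps = 1/alpha.
n_q = 5
C = np.random.randn(n_q, n_q)
A_q = C.dot(C.T) + np.eye(n_q)        # symmetric positive definite
b_q = np.random.randn(n_q)
x_q = np.random.randn(n_q)
eps_q = 1. / 0.7                      # eps = 1 / alpha for alpha = 0.7
prox_1 = np.linalg.solve(A_q + eps_q * np.eye(n_q), b_q + eps_q * x_q)
prox_2 = x_q + np.linalg.solve(A_q + eps_q * np.eye(n_q), b_q - A_q.dot(x_q))
print(np.linalg.norm(prox_1 - prox_2))    # should be numerically zero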
The proximal map is not a contraction in general, but there is a theory of firm non-expansiveness of the proximal operator.
This property can be extended to the subdifferential.
Assume $f$ is twice differentiable and strongly convex ($f''(x) \succ 0$).
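Under this assumption the proximal map does contract distances; a small numerical illustration for a strongly convex quadratic (a sketch with illustrative data):
In [ ]:
# For f(u) = 0.5 u^T Q u - c^T u with Q positive definite, prox_{alpha f} is the affine map
# u -> (Q + I/alpha)^{-1}(c + u/alpha), which shrinks distances at least by 1/(1 + alpha * lambda_min(Q)).
Q_sc = np.random.randn(5, 5)
Q_sc = Q_sc.dot(Q_sc.T) + np.eye(5)           # strongly convex quadratic
c_sc = np.random.randn(5)
alpha_sc = 0.5
prox_quad = lambda u: np.linalg.solve(Q_sc + np.eye(5) / alpha_sc, c_sc + u / alpha_sc)
u1, u2 = np.random.randn(5), np.random.randn(5)
print(np.linalg.norm(prox_quad(u1) - prox_quad(u2)) / np.linalg.norm(u1 - u2))
print(1. / (1 + alpha_sc * np.linalg.eigvalsh(Q_sc).min()))   # worst-case contraction factor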
Consider a convex function $f$ of the form
$$ f(x) = h(x) + g(x), $$ where $h(x)$ is convex and differentiable, and $g(x)$ may be a convex extended-value function, i.e. $g: \mathbb{R}^n \to \mathbb{R} \cup \{+\infty\}$.
Then one step of the proximal gradient method is
$$ x_{k+1} = prox_{\alpha_k g} (x_k - \alpha_k h'(x_k)). $$ From the first-order optimality condition it follows that
\begin{align*} & 0 \in \alpha h'(x^*) + \alpha\partial g(x^*)\\ & 0 \in \alpha h'(x^*) + x^* - x^* + \alpha\partial g(x^*)\\ & (I - \alpha h')(x^*) \in (I + \alpha \partial g)(x^*)\\ & x^* = (I + \alpha \partial g)^{-1}(I - \alpha h')(x^*)\\ & x^* = prox_{\alpha g}(x^* - \alpha h'(x^*)) \end{align*}
In [6]:
def prox_alg(f, f_grad, g_prox, x0, num_iter, beta=0.5, fix_lam=None, accel=False):
    # Proximal gradient method (ISTA) with optional acceleration (FISTA).
    # If fix_lam is None, the step size is chosen by backtracking with factor beta.
    conv = [x0]
    x = x0.copy()
    if accel:
        t_prev = 1
        t_next = (1 + np.sqrt(1 + 4 * t_prev**2)) / 2.
    if fix_lam is None:
        lam = 1.
    for i in range(num_iter):
        if accel and i > 0:
            # Momentum (extrapolation) step
            x = x + (t_prev - 1) / t_next * (x - conv[-2])
            t_prev = t_next
            t_next = (1 + np.sqrt(1 + 4 * t_prev**2)) / 2.
        if fix_lam is None:
            # Backtracking line search on the smooth part f
            while True:
                z = g_prox(x - lam * f_grad(x), lam)
                if f(z) <= f(x) + f_grad(x).dot(z - x) + np.linalg.norm(z - x)**2 / (2 * lam):
                    break
                else:
                    lam = lam * beta
        else:
            z = g_prox(x - fix_lam * f_grad(x), fix_lam)
        x = z.copy()
        conv.append(x)
    return x, conv
In [7]:
# Note: A, y and gamma used below are (re)defined in the next cell; the lambdas are
# evaluated lazily, so this works in a notebook.
f = lambda x: np.linalg.norm(A.dot(x) - y)**2 / 2
f_grad = lambda x: A.T.dot(A.dot(x) - y)
f_subgrad = lambda x: A.T.dot(A.dot(x) - y) + gamma * np.sign(x)
def g_prox(h, lam):
    # Soft thresholding: prox of lam * gamma * ||.||_1
    return np.sign(h) * np.maximum(np.abs(h) - lam * gamma, 0)
In [8]:
import sklearn.preprocessing as skprep
m = 500
n = 2500
A = np.random.rand(m, n)
A = skprep.normalize(A, norm="l2", axis=0)
L = np.linalg.eigvalsh(A.T.dot(A)).max()    # Lipschitz constant of the gradient of the smooth part
x_true = np.random.randn(n)
x_true[np.random.rand(n) < 0.96] = 0
print("Number of nonzeros in x_true = {}".format(np.sum(x_true != 0)))
v = 9e-2 * np.random.randn(m)
y = A.dot(x_true) + v
gamma_max = np.linalg.norm(A.T.dot(y), ord=np.inf)
gamma = 0.1 * gamma_max
print("Gamma = {}".format(gamma))
print("S2N ratio = {}".format(np.linalg.norm(A.dot(x_true))**2 / np.linalg.norm(v)**2))
f_star = f(x_true) + gamma * np.linalg.norm(x_true, 1)   # objective at the ground truth (reference value, not necessarily the optimum)
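A quick check (with illustrative values) that the soft-thresholding formula in g_prox from the previous cell agrees with a brute-force solution of the scalar prox subproblem $\min_u \left(\gamma |u| + \frac{(u - h)^2}{2\lambda}\right)$:
In [ ]:
h_check, lam_check = 1.3, 0.5
grid = np.linspace(-3, 3, 100001)
u_brute = grid[np.argmin(gamma * np.abs(grid) + (grid - h_check)**2 / (2 * lam_check))]
print(u_brute, g_prox(np.array([h_check]), lam_check)[0])   # the two values should match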
In [9]:
num_iter = 500
# x0 = np.zeros(n)
x0 = np.random.rand(n)
In [10]:
x, conv = prox_alg(f, f_grad, g_prox, x0, num_iter, fix_lam=None)
print("Number of nonzeros in x* (ISTA) = {}".format(np.sum(x != 0)))
print("f* (ISTA) = {}".format(f(x) + gamma * np.linalg.norm(x, 1)))
x_acc, conv_acc = prox_alg(f, f_grad, g_prox, x0, num_iter, fix_lam=None, accel=True)
print("Number of nonzeros in x* (FISTA) = {}".format(np.sum(x_acc != 0)))
print("f* (FISTA) = {}".format(f(x_acc) + gamma * np.linalg.norm(x_acc, 1)))
In [11]:
subgrad_m = methods.fo.SubgradientMethod(f, f_subgrad, ss.ConstantStepSize(1e-3))
x_subgrad = subgrad_m.solve(x0=x0, max_iter=num_iter)
print("f* (subgradient method) = {}".format(f(x_subgrad) + gamma * np.linalg.norm(x_subgrad, 1)))
In [12]:
plt.figure(figsize=(10, 8))
fontsize=25
plt.plot([f(x) + gamma * np.linalg.norm(x, 1) for x in conv], label="ISTA")
plt.plot([f(x) + gamma * np.linalg.norm(x, 1) for x in conv_acc], label="FISTA")
plt.plot([f(x) + gamma * np.linalg.norm(x, 1) for x in subgrad_m.get_convergence()], label="Subgradient method")
plt.legend(fontsize=fontsize)
plt.xlabel(r"Number of iterations, $k$", fontsize=fontsize)
plt.ylabel(r"Lasso objective, $h(x_k) + g(x_k)$", fontsize=fontsize)
plt.xticks(fontsize=fontsize)
_ = plt.yticks(fontsize=fontsize)
plt.yscale("log")
A good presentation about robust PCA is available here.
In [13]:
import copy
def prox_alg_mat(f, f_grad, g_prox, X0, num_iter, beta=0.5, fix_lam=None, accel=False):
    # Proximal gradient method for a list of matrix blocks X = [X_1, ..., X_p]:
    # f is the smooth part, f_grad is a list of blockwise gradients, g_prox is a list
    # of blockwise prox operators. Supports backtracking (fix_lam=None) and acceleration.
    conv = [X0]
    X = copy.copy(X0)
    if accel:
        t_prev = 1
        t_next = (1 + np.sqrt(1 + 4 * t_prev**2)) / 2.
    if fix_lam is None:
        lam = 1.
    Z = [[] for _ in X0]
    for i in range(num_iter):
        if accel and i > 0:
            # Momentum (extrapolation) step, applied blockwise; build a new list so the
            # previous iterate stored in conv is not modified in place.
            X = [X[j] + (t_prev - 1) / t_next * (X[j] - conv[-2][j]) for j in range(len(X))]
            t_prev = t_next
            t_next = (1 + np.sqrt(1 + 4 * t_prev**2)) / 2.
        if fix_lam is None:
            # Backtracking line search on the smooth part f
            current_f = f(X)
            current_grad = [grad(X) for grad in f_grad]
            while True:
                for j, g in enumerate(g_prox):
                    Z[j] = g(X[j] - lam * current_grad[j], lam)
                lin_term = 0
                quad_term = 0
                for j in range(len(Z)):
                    lin_term += np.trace(current_grad[j].T.dot(Z[j] - X[j]))
                    quad_term += np.linalg.norm(Z[j] - X[j])**2
                if f(Z) <= current_f + lin_term + quad_term / (2 * lam):
                    break
                else:
                    lam = lam * beta
        else:
            for j, g in enumerate(g_prox):
                Z[j] = g(X[j] - fix_lam * f_grad[j](X), fix_lam)
        X = copy.copy(Z)
        conv.append(X)
    return X, conv
In [18]:
m = 100
n = 100
rank = 4
L = np.random.randn(m, rank).dot(np.random.randn(rank, n))
S = 20 * np.random.rand(m * n) - 10
S[np.random.rand(m*n) < 0.9] = 0
print("Number of nonzeros in S = {}".format(np.sum(S != 0)))
S = S.reshape(m, n)
V = 0.05 * np.random.randn(m, n)
A = L + S + V
max_gamma_3 = np.linalg.norm(A, ord="nuc")
gamma_3 = 0.01 * max_gamma_3
print("Gamma_3 = {}".format(gamma_3))
max_gamma_2 = np.max(np.abs(A))
gamma_2 = 0.15 * max_gamma_2
print("Gamma_2 = {}".format(gamma_2))
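The robust PCA problem solved below, both by CVXPy and by the proximal gradient method, decomposes $A$ into a low-rank term plus a sparse term plus noise by minimizing
$$ \frac{1}{2}\|A - (L + S)\|_F^2 + \gamma_2 \|S\|_1 + \gamma_3 \|L\|_* \to \min_{L, S}, $$
where $\|S\|_1$ is the entrywise $\ell_1$ norm and $\|L\|_*$ is the nuclear norm.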
In [26]:
import cvxpy as cvx
L_c = cvx.Variable((m, n))
S_c = cvx.Variable((m, n))
problem = cvx.Problem(cvx.Minimize(0.5 * cvx.norm(A - (L_c + S_c), "fro")**2 +
                                   gamma_2 * cvx.sum(cvx.abs(S_c)) +       # entrywise l1 norm of S
                                   gamma_3 * cvx.norm(L_c, "nuc")))
problem.solve(solver="SCS", verbose=True, eps=1e-5, max_iters=3000, use_indirect=True)
L_cvx = np.array(L_c.value)
S_cvx = np.array(S_c.value)
print("CVXPy optimal value = {}".format(problem.value))
print("Rank of lowrank term = {}".format(np.linalg.matrix_rank(L_cvx)))
In [22]:
def f(X):
    # Smooth part: 0.5 * ||A - (X[0] + X[1])||_F^2, where X = [S, L]
    Y = np.zeros_like(X[0])
    for block in X:
        Y = Y + block
    return np.linalg.norm(A - Y, ord="fro")**2 * 0.5
def grad_f_S(X):
    # Gradient of the smooth part with respect to the sparse term S = X[0]
    return X[0] + X[1] - A
def grad_f_L(X):
    # Gradient of the smooth part with respect to the low-rank term L = X[1]
    return X[0] + X[1] - A
f_grad = [grad_f_S, grad_f_L]
def entry_l1_prox(X, lam):
    # Entrywise soft thresholding: prox of lam * gamma_2 * ||.||_1
    return np.sign(X) * np.maximum(np.abs(X) - gamma_2 * lam, 0)
def nuclear_norm_prox(X, lam):
    # Singular value thresholding: prox of lam * gamma_3 * ||.||_*
    U, sigma, Vt = np.linalg.svd(X, full_matrices=False)
    return U.dot(np.diag(np.maximum(sigma - lam * gamma_3, 0))).dot(Vt)
g_prox = [entry_l1_prox, nuclear_norm_prox]
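A sanity check (with illustrative data) that both prox operators solve $prox_{\lambda g}(H) = \arg\min_U \left(g(U) + \frac{1}{2\lambda}\|U - H\|_F^2\right)$, comparing against CVXPy on a small random matrix:
In [ ]:
H = np.random.randn(10, 10)
lam_check = 0.3
U_c = cvx.Variable((10, 10))
quad = cvx.sum_squares(U_c - H) / (2 * lam_check)
cvx.Problem(cvx.Minimize(gamma_2 * cvx.sum(cvx.abs(U_c)) + quad)).solve()     # entrywise l1 prox
print(np.linalg.norm(np.array(U_c.value) - entry_l1_prox(H, lam_check)))
cvx.Problem(cvx.Minimize(gamma_3 * cvx.norm(U_c, "nuc") + quad)).solve()      # nuclear norm prox
print(np.linalg.norm(np.array(U_c.value) - nuclear_norm_prox(H, lam_check)))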
In [23]:
# X0 = [np.random.randn(m, n), np.random.randn(m, n)]
X0 = [np.zeros((m, n)), np.zeros((m, n))]
num_iter = 600
In [24]:
step = 5e-3
X_pm, conv = prox_alg_mat(f, f_grad, g_prox, X0, num_iter, beta=0.5, fix_lam=step, accel=False)
print("f* (PGM) = {}".format(f(X_pm) + gamma_2 * np.sum(np.abs(X_pm[0])) + gamma_3 * np.linalg.norm(X_pm[1], "nuc")))
print("Rank of the low-rank term = {}".format(np.linalg.matrix_rank(X_pm[1])))
print("Number of nnz in the sparse term = {}".format(np.sum(X_pm[0] != 0)))
X_pm_acc, conv_acc = prox_alg_mat(f, f_grad, g_prox, X0, num_iter, beta=0.5, fix_lam=step, accel=True)
print("f* (FPGM) = {}".format(f(X_pm_acc) + gamma_2 * np.sum(np.abs(X_pm_acc[0])) +
                              gamma_3 * np.linalg.norm(X_pm_acc[1], "nuc")))
print("Rank of the low-rank term = {}".format(np.linalg.matrix_rank(X_pm_acc[1])))
print("Number of nnz in the sparse term = {}".format(np.sum(X_pm_acc[0] != 0)))
In [25]:
plt.figure(figsize=(10, 8))
fontsize=25
plt.plot([f(X) + gamma_2 * np.sum(np.abs(X[0])) + gamma_3 * np.linalg.norm(X[1], "nuc") for X in conv], label="PGM")
plt.plot([f(X) + gamma_2 * np.sum(np.abs(X[0])) + gamma_3 * np.linalg.norm(X[1], "nuc") for X in conv_acc], label="FPGM")
plt.legend(fontsize=fontsize)
plt.xlabel(r"Number of iterations, $k$", fontsize=fontsize)
plt.ylabel(r"Objective", fontsize=fontsize)
plt.xticks(fontsize=fontsize)
_ = plt.yticks(fontsize=fontsize)
plt.yscale("log")