$ \begin{array}{cccccc}
\textrm{Problem}&\textrm{pure Newton}&\textrm{newtmin1}&\textrm{newtmin2}&\textrm{newtmin3}&\textrm{BFGS}\\
1&\surd &&&&\\
2&\times &\times&\times&\times&\surd\\
3&\surd &&&&\\
4&\surd &&&&\\
5&\surd &&&&\\
6&\surd &&&&\\
7&\times &\times&\times&\times&\surd\\
8&\surd &&&&\\
9&\surd &&&&\\
10&\times &\times&\surd&&\\
11&\times &\times&\times&\surd&\\
12&\times &\times&\surd&&\\
13&\surd &&&&\\
14&\times &\times&\times&\times&\times\\
15&\times &\times&\times&\times&\surd\\
16&\surd &&&&\\
17&\surd &&&&\\
18&\times &\times&\times&\times&\times\\
\end{array} $
The table above summarizes the outcome of each method on the 18 Toms566 test problems. A run counts as a success ($\surd$) if the relative gradient norm is small, i.e., $\frac{\|g(x^*)\|}{\|g(x_0)\|}\leqslant 10^{-5}$; a blank entry means the method was not tried because a method to its left had already succeeded. This is a fairly demanding criterion: the "failed" cases for newtmin2, newtmin3, and BFGS still end with a relatively small gradient norm. The technical details of the exercise follow.
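For reference, here is a minimal sketch (not a cell from the original notebook, written in the same old-Julia style) of a harness that reproduces one column of the table, assuming the `Toms566` `Problem` interface and the `newtmin` solver defined below:

# Run one solver over all 18 problems and print the relative gradient
# norm together with the success verdict used in the table.
for i = 1:18
    p = Problem(i)
    obj = x -> (p.obj(x), p.grd(x), p.hes(x))
    (x, its) = newtmin(obj, p.x0; maxIts=100, optTol=1e-6)
    r = norm(p.grd(x))/norm(p.grd(p.x0))
    println("Problem $i: ", r, r <= 1e-5 ? "  (success)" : "  (fail)")
end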
In [4]:
using Toms566
using PyPlot
In [5]:
Pkg.test("Toms566")
In [6]:
function newtmin( obj, x0; maxIts=100, optTol=1e-6)
    # "Pure" Newton method.
    # Minimize a function f using Newton's method.
    #   obj: a function that evaluates the objective value,
    #        gradient, and Hessian at a point x, i.e.,
    #        (f, g, H) = obj(x)
    #   x0: starting point.
    #   maxIts (optional): maximum number of iterations.
    #   optTol (optional): optimality tolerance based on
    #        ||grad(x)|| <= optTol*||grad(x0)||
    x = x0
    status = 0
    its = 0
    (f0, g0, H0) = obj(x0)
    (f, g, H) = obj(x)
    while status != 1
        x = x - H\g                       # Newton step: solve H*d = g instead of forming inv(H)
        (f, g, H) = obj(x)
        its = its + 1
        if norm(g) <= optTol*norm(g0)     # converged
            status = 1
        end
        if its >= maxIts                  # iteration cap
            status = 1
        end
    end
    return (x, its)
end
Out[6]:
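Each iteration takes the pure Newton step $x_{k+1} = x_k - H(x_k)^{-1} g(x_k)$, with no line search or safeguard, so it can diverge when the Hessian is indefinite or the full step overshoots.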
In [4]:
v = randn(10,1)      # random target vector for the quadratic sanity check below
A = randn(10,10)     # scratch: exercise inv() and eye() before the experiments
inv(A)
eye(10)
Out[4]:
In [5]:
function obj(x)
    f = 1/2*norm(x-v)^2   # convex quadratic test objective
    g = x - v             # its gradient
    H = eye(10)           # its Hessian (the identity)
    return (f,g,H)
end
Out[5]:
In [6]:
x0=zeros(10,1)
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(x-v)
Out[6]:
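As expected for the strictly convex quadratic $f(x)=\frac{1}{2}\|x-v\|^2$ (gradient $x-v$, Hessian $I$), a single Newton step from any starting point lands exactly on the minimizer $v$, so `norm(x-v)` is zero up to rounding and `it` is 1.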
In [7]:
p = Problem(1) # Helical Valley
Out[7]:
In [8]:
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
Out[8]:
In [9]:
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[9]:
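The same pattern repeats below for Problems 2 through 18: each cell redefines `obj` for the current problem, runs the solver from the supplied starting point `p.x0`, and prints the relative gradient norm $\|g(x)\|/\|g(x_0)\|$ that feeds the summary table.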
In [10]:
p = Problem(2)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[10]:
In [11]:
p = Problem(3)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[11]:
In [12]:
p = Problem(4)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[12]:
In [13]:
p = Problem(5)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[13]:
In [14]:
p = Problem(6)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[14]:
In [15]:
p = Problem(7)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[15]:
In [16]:
p = Problem(8)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[16]:
In [17]:
p = Problem(9)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[17]:
In [18]:
p = Problem(10)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[18]:
In [19]:
p = Problem(11)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[19]:
In [20]:
p = Problem(12)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[20]:
In [21]:
p = Problem(13)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[21]:
In [22]:
p = Problem(14)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[22]:
In [23]:
p = Problem(15)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[23]:
In [24]:
p = Problem(16)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[24]:
In [25]:
p = Problem(17)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[25]:
In [ ]:
p = Problem(18)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
In [7]:
function newtmin1( obj, x0; maxIts=100, optTol=1e-6)
    # Newton method with Armijo backtracking.
    # (Same interface and arguments as newtmin above.)
    mu = 1e-4                             # Armijo sufficient-decrease parameter
    x = x0
    status = 0
    its = 0
    (f0, g0, H0) = obj(x0)
    (f, g, H) = obj(x)
    while status != 1
        d = H\g                           # Newton direction (the step is -alpha*d)
        alpha = 1
        xnew = x - alpha*d
        (fnew, gnew, Hnew) = obj(xnew)
        sts = -fnew + f - alpha*mu*g'*d   # Armijo: f(x-alpha*d) <= f(x) - alpha*mu*g'd
        while sts[1] < 0
            alpha = alpha/2               # halve the step until sufficient decrease holds
            xnew = x - alpha*d
            (fnew, gnew, Hnew) = obj(xnew)
            sts = -fnew + f - alpha*mu*g'*d
        end
        x = x - alpha*d
        (f, g, H) = obj(x)
        its = its + 1
        if norm(g) <= optTol*norm(g0)
            status = 1
        end
        if its >= maxIts
            status = 1
        end
    end
    return (x, its)
end
Out[7]:
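The inner loop implements Armijo backtracking: with Newton direction $d = H^{-1}g$ and $\mu = 10^{-4}$, the step length $\alpha$ starts at 1 and is halved until the sufficient-decrease condition $f(x-\alpha d) \leqslant f(x) - \alpha\mu\, g^{T}d$ holds, which guards against the overshooting that breaks the pure Newton method.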
In [2]:
p = Problem(2)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin1( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
In [3]:
p = Problem(7)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin1( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
In [14]:
p = Problem(10)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin1( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[14]:
In [15]:
p = Problem(11)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin1( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[15]:
In [16]:
p = Problem(12)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin1( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[16]:
In [19]:
p = Problem(14)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin1( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[19]:
In [20]:
p = Problem(15)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin1( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[20]:
In [21]:
p = Problem(18)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin1( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[21]:
In [8]:
function newtmin2( obj, x0; maxIts=100, optTol=1e-6)
    # Newton method with Armijo backtracking and a modified Hessian, treatment 1:
    # floor the Hessian's eigenvalues at epsilon so the modified Hessian bH
    # is positive definite.
    # (Same interface and arguments as newtmin above.)
    mu = 1e-4
    epsilon = 0.01
    x = x0
    status = 0
    its = 0
    (f0, g0, H0) = obj(x0)
    (f, g, H) = obj(x)
    (V, S) = eig(H)                      # V: eigenvalues, S: eigenvectors
    bar = Diagonal(ones(size(H,1))*epsilon)
    bH = S*max(bar, Diagonal(V))*S'      # clamp the eigenvalues from below at epsilon
    while status != 1
        d = bH\g                         # modified Newton direction (the step is -alpha*d)
        alpha = 1
        xnew = x - alpha*d
        (fnew, gnew, Hnew) = obj(xnew)
        sts = -fnew + f - alpha*mu*g'*d  # Armijo sufficient decrease
        while sts[1] < 0
            alpha = alpha/2
            xnew = x - alpha*d
            (fnew, gnew, Hnew) = obj(xnew)
            sts = -fnew + f - alpha*mu*g'*d
        end
        x = x - alpha*d
        (f, g, H) = obj(x)
        (V, S) = eig(H)
        bar = Diagonal(ones(size(H,1))*epsilon)
        bH = S*max(bar, Diagonal(V))*S'
        its = its + 1
        if norm(g) <= optTol*norm(g0)
            status = 1
        end
        if its >= maxIts
            status = 1
        end
    end
    return (x, its)
end
Out[8]:
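The Hessian modification works on the spectral decomposition $H = S\Lambda S^{T}$: each eigenvalue $\lambda_i$ is floored at $\epsilon = 0.01$, so $\bar{H} = S\max(\epsilon I, \Lambda)S^{T}$ is positive definite and its inverse yields a descent direction. A minimal standalone sketch of the same idea (the helper name `modify_hessian` is hypothetical, not part of the notebook):

function modify_hessian(H, epsilon)
    (V, S) = eig(H)                        # V: eigenvalues, S: eigenvectors
    lam = [max(v, epsilon) for v in V]     # treatment 1: floor each eigenvalue
    return S*Diagonal(lam)*S'              # reassemble a positive definite matrix
end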
In [72]:
p = Problem(2)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin2( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[72]:
In [73]:
p = Problem(7)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin2( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[73]:
In [74]:
p = Problem(10)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin2( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[74]:
In [77]:
p = Problem(11)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin2( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[77]:
In [80]:
p = Problem(12)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin2( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[80]:
In [83]:
p = Problem(14)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin2( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[83]:
In [86]:
p = Problem(15)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin2( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[86]:
In [89]:
p = Problem(18)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin2( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[89]:
In [9]:
function newtmin3( obj, x0; maxIts=100, optTol=1e-6)
    # Newton method with Armijo backtracking and a modified Hessian, treatment 2:
    # replace each eigenvalue by max(|lambda_i|, epsilon), flipping negative
    # curvature to positive instead of flooring it.
    # (Same interface and arguments as newtmin above.)
    mu = 1e-4
    epsilon = 0.01
    x = x0
    status = 0
    its = 0
    (f0, g0, H0) = obj(x0)
    (f, g, H) = obj(x)
    (V, S) = eig(H)                        # V: eigenvalues, S: eigenvectors
    bar = Diagonal(ones(size(H,1))*epsilon)
    bH = S*max(bar, abs(Diagonal(V)))*S'   # positive definite modification
    while status != 1
        d = bH\g                           # modified Newton direction (the step is -alpha*d)
        alpha = 1
        xnew = x - alpha*d
        (fnew, gnew, Hnew) = obj(xnew)
        sts = -fnew + f - alpha*mu*g'*d    # Armijo sufficient decrease
        while sts[1] < 0
            alpha = alpha/2
            xnew = x - alpha*d
            (fnew, gnew, Hnew) = obj(xnew)
            sts = -fnew + f - alpha*mu*g'*d
        end
        x = x - alpha*d
        (f, g, H) = obj(x)
        (V, S) = eig(H)
        bar = Diagonal(ones(size(H,1))*epsilon)
        bH = S*max(bar, abs(Diagonal(V)))*S'
        its = its + 1
        if norm(g) <= optTol*norm(g0)
            status = 1
        end
        if its >= maxIts
            status = 1
        end
    end
    return (x, its)
end
Out[9]:
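Treatment 2 changes only the eigenvalue map: each $\lambda_i$ is replaced by $\max(|\lambda_i|, \epsilon)$, i.e., $\bar{H} = S\max(\epsilon I, |\Lambda|)S^{T}$, so directions of strong negative curvature keep their magnitude but are flipped to positive rather than flattened to $\epsilon$.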
In [104]:
p = Problem(2)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin3( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[104]:
In [96]:
p = Problem(7)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin3( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[96]:
In [97]:
p = Problem(10)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin3( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[97]:
In [98]:
p = Problem(11)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin3( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[98]:
In [99]:
p = Problem(14)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin3( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[99]:
In [100]:
p = Problem(15)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin3( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[100]:
In [101]:
p = Problem(18)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=newtmin3( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[101]:
In [41]:
function mybfgs( obj, x0; maxIts=100, optTol=1e-6)
    # BFGS method with Armijo backtracking; the initial inverse-Hessian
    # approximation is built from the eigenvalue-modified Hessian (treatment 2).
    # (Same interface and arguments as newtmin above.)
    mu = 1e-4                              # Armijo parameter
    eta = 0.5                              # backtracking factor
    epsilon = 0.01
    x = x0
    status = 0
    its = 0
    (f0, g0, H0) = obj(x0)
    (f, g, H) = obj(x)
    (V, S) = eig(H)                        # V: eigenvalues, S: eigenvectors
    bar = Diagonal(ones(size(H,1))*epsilon)
    bH = S*max(bar, abs(Diagonal(V)))*S'   # positive definite modification of H
    Hk = inv(bH)                           # initial INVERSE-Hessian approximation
    while status != 1
        mp = -Hk*g                         # quasi-Newton search direction
        #////////////////////////////////////////////////////
        # Armijo backtracking line search
        alpha = 1
        xnew = x + alpha*mp
        (fnew, gnew, Hnew) = obj(xnew)
        sts = -fnew + f + alpha*mu*g'*mp   # Armijo sufficient decrease
        while sts[1] < 0
            alpha = eta*alpha
            xnew = x + alpha*mp
            (fnew, gnew, Hnew) = obj(xnew)
            sts = -fnew + f + alpha*mu*g'*mp
        end
        #////////////////////////////////////////////////////
        sk = alpha*mp
        yk = gnew - g
        tt1 = (yk'*sk)[1]                  # curvature yk'sk
        if tt1 > 0                         # update only if the curvature condition holds
            rho = 1/tt1
            Hk = (eye(size(Hk,1)) - rho*sk*yk')*Hk*(eye(size(Hk,1)) - rho*yk*sk') + rho*sk*sk'
        end
        x = x + alpha*mp
        (f, g, H) = obj(x)
        its = its + 1
        if norm(g) <= optTol*norm(g0)
            status = 1
        end
        if its >= maxIts
            status = 1
        end
    end
    return (x, its)
end
Out[41]:
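The matrix recursion in `mybfgs` is the standard BFGS update of the inverse-Hessian approximation $H_k \approx \nabla^2 f(x_k)^{-1}$: with $s_k = x_{k+1} - x_k$, $y_k = g_{k+1} - g_k$, and $\rho_k = 1/(y_k^{T}s_k)$, $H_{k+1} = (I - \rho_k s_k y_k^{T})H_k(I - \rho_k y_k s_k^{T}) + \rho_k s_k s_k^{T}$. The update preserves positive definiteness only when the curvature condition $y_k^{T}s_k > 0$ holds, hence the guard before applying it.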
In [43]:
p = Problem(18)
function obj(x)
f = p.obj(x) # objective value at x
g = p.grd(x) # gradient at x
H = p.hes(x) # Hessian at x
return (f,g,H)
end
x0=p.x0
(x,it)=mybfgs( obj, x0; maxIts=100, optTol=1e-6)
norm(p.grd(x))/norm(p.grd(x0))
Out[43]: