# iLQG algorithm with inverted Pendulum

## Initialize variables

``````

In [1]:

import numpy as np
from drl.ilqg import Pendulum

``````
``````

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-aabd84276b3c> in <module>()
1 import numpy as np
2 from drl.ilqg import Pendulum
----> 3 from drl.env import TwoLinkArm

ImportError: No module named 'drl.env'

``````
``````

In [26]:

env = Pendulum()

render_env = True

T = 100
dt = 0.05
state_dim = 2
action_dim = 1

epsilon = 0.00000001
lamb = 1.0
lamb_factor = 5
lamb_max = 1000
eps_converge = 0.00001

``````

## Helper functions

Define functions for calculating temporal difference derivative and losses.

### Only valid for pendulum environment!

``````

In [3]:

def dynamics(x, u):
th, thdot = x

g = 10.
m = 1.
l = 1.
dt = 0.05
max_torque = 8.
max_speed = 2.

u = np.clip(u, -max_torque, max_torque)[0]

newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
newth = th + newthdot * dt
newthdot = np.clip(newthdot, -max_speed, max_speed)

return np.array([newth, newthdot])

``````
``````

In [4]:

def dynamics_derivatives(t, epsilon):
x, u = t

dfdx = []
for i in range(len(x)):
x_plus = np.copy(x)
x_plus[i] += epsilon
x_min = np.copy(x)
x_min[i] -= epsilon
dfdx.append((dynamics(x_plus, u) - dynamics(x_min, u)) / (2. * epsilon))

dfdu = []
for i in range(len(u)):
u_plus = np.copy(u)
u_plus[i] += epsilon
u_min = np.copy(u)
u_min[i] -= epsilon
dfdu.append((dynamics(x, u_plus) - dynamics(x, u_min)) / (2. * epsilon))

return np.stack(dfdx, axis=1), np.stack(dfdu, axis=1)

``````
``````

In [17]:

def calc_cost(t):
x, u = t
l = 0.01 * u**2
lx = np.zeros([2, 1])
lu = 0.02 * u
lxx = np.zeros([2, 2])
luu = 0.02
lux = np.zeros(2)
return l, lx, lu, lxx, luu, lux

def calc_final_cost(t):
x, u = t
l = (1 - np.cos(x[0]))**2 + 0.1 * x[1]**2 + 0.01 * u**2
lx = np.array([2.*(1. - np.cos(x[0]))*np.sin(x[0]), 0.2*x[1]]).reshape([2, 1])
lu = 0.02 * u
lxx = np.array([4.*np.sin(x[0]/2.)**2 * (2*np.cos(x[0]) + 1), 0., 0., 0.2]).reshape([2, 2])
luu = 0.02
lux = 0.
return l, lx, lu, lxx, luu, lux

``````

## Forward pass

Execute control sequence pi. Calculate dynamics derivatives and loss (derivatives for every step). Return derivatives and total loss.

``````

In [6]:

def forward_pass(pi):
trajectory = []
derivatives = []
cost = 0.

x = env.reset()

# Execute all steps except for final step
for i in range(T-1):
if render_env:
env.render()

u = pi[i]

# Calculate dyanmics derivatives and loss (derivatives)
seq = (x, u)
dfdx, dfdu = dynamics_derivatives(seq, epsilon)
l, lx, lu, lxx, luu, lux = calc_cost(seq)

cost += l

# Store values
derivatives.append((np.eye(state_dim) + dfdx*dt, dfdu*dt, lx*dt, lu*dt, lxx*dt, luu*dt, lux*dt))
trajectory.append(x)

# Execute action
x = env.step(u)

if render_env:
env.render(close=True)

seq = (x, np.zeros(1))

dfdx, dfdu = dynamics_derivatives(seq, epsilon)
l, lx, lu, lxx, luu, lux = calc_final_cost(seq)

cost += l

derivatives.append((np.eye(state_dim) + dfdx*dt, dfdu*dt, lx*dt, lu*dt, lxx*dt, luu*dt, lux*dt))
trajectory.append(x)

return trajectory, derivatives, cost

``````

Test forward pass with random control sequence for T steps, print first 10 samples of trajectory to check if values are correct.

``````

In [7]:

pi = np.random.randn(T, 1)
trajectory, derivatives, cost = forward_pass(pi)

print 'Trajectory: ', trajectory[:5]
print 'Derivatives: ', derivatives[:2]
print 'Cost: ', cost

``````
``````

Trajectory:  [array([ 3.14159265,  0.        ]), array([ 3.13823905, -0.06707199]), array([ 3.13281519, -0.10847737]), array([ 3.12411269, -0.17404993]), array([ 3.11192589, -0.2437359 ])]
Derivatives:  [(array([[ 1.048125,  0.0025  ],
[-0.0375  ,  1.05    ]]), array([[ 0.000375],
[ 0.0075  ]]), array([[  2.44929360e-17],
[  0.00000000e+00]]), array([-0.00044715]), array([[-0.2 ,  0.  ],
[ 0.  ,  0.01]]), 0.001, 0.0), (array([[ 1.04812501,  0.0025    ],
[-0.03749979,  1.05      ]]), array([[ 0.000375],
[ 0.0075  ]]), array([[ 0.00067072],
[-0.00067072]]), array([-0.0002928]), array([[-0.19999719,  0.        ],
[ 0.        ,  0.01      ]]), 0.001, 0.0)]
Cost:  [ 399.084288]

``````

## Backward pass

For final timestep \$T\$

\$V(x,T) = l_f(x_T)\$

Now iterate backwards from timestep \$t = T\$ to \$t = 0\$. Calculate \$Q\$-values using:

\$Q_x = l_x + f_x^TV'_x\\ Q_u = l_u + f_u^TV'_x\\ Q_{xx} = l_{xx} + f_x^TV'_{xx}f_x\\ Q_{uu} = l_{uu} + f_u^TV'_{xx}f_u\\ Q_{ux} = l_{ux} + f_u^TV'_{xx}f_x\$

From the \$Q\$-values we can derive the \$V\$-values and control modifications \$K\$, \$k\$

\$K = -Q_{uu}^{-1}Q_{ux}\\ k = -Q_{uu}^{-1}Q_{u}\$

\$V_x = Q_x - K^TQ_{uu}k\\ V_{xx} = Q_{xx} - K^TQ_{uu}K\$

Derivative sequence is \$(f_x, f_u, l_x, l_u, l_{xx}, l_{uu}, l_{ux})\$

``````

In [8]:

def backward_pass(derivatives, lamb):
K = []
k = []

Vx = derivatives[T-1][2]
Vxx = derivatives[T-1][4]

for i in reversed(range(T-1)):
der = derivatives[i]
Qx = der[2] + np.dot(der[0].T, Vx)
Qu = der[3] + np.dot(der[1].T, Vx)
Qxx = der[4] + np.dot(der[0].T, np.dot(Vxx, der[0]))
Quu = der[5] + np.dot(der[1].T, np.dot(Vxx, der[1]))
Qux = der[6] + np.dot(der[1].T, np.dot(Vxx ,der[0]))

U, S, V = np.linalg.svd(Quu)
S[S < 0] = 0.0
S += lamb
Quu_inv = np.dot(U, np.dot(np.diag(1.0/S), V.T))

Ks = -Quu_inv * Qux
ks = -Quu_inv * Qu

Vx = Qx - np.dot(Ks.T, np.dot(Quu, ks))
Vxx = Qxx - np.dot(Ks.T, np.dot(Quu, Ks))

K.append(Ks)
k.append(ks)

return K[::-1], k[::-1]

``````

Test backward pass with previous results from forward pass

``````

In [9]:

K, k = backward_pass(derivatives, lamb)

print 'K: ', K[:3]
print 'k: ', k[:3]

``````
``````

K:  [array([[-42.51047564,  -3.24080289]]), array([[-39.02954012,  -2.77522002]]), array([[-35.79318228,  -2.36361661]])]
k:  [array([[-0.0857754]]), array([[-0.0817944]]), array([[-0.07824677]])]

``````

## Create new control signals

``````

In [10]:

def update_control_signal(pi, trajectory, K, k):
pi_new = np.zeros(pi.shape)
x = trajectory[0][0].copy()
for i in range(T - 1):
u = pi[i] + k[i] + np.dot(K[i], trajectory[i])
pi_new[i] = u[0]
return pi_new

``````
``````

In [11]:

print K[T-5]
print trajectory[T-5]
print np.dot(K[T-5], trajectory[T-5])

``````
``````

[[ 0.00042261 -0.00036279]]
[ 3.32383154 -0.61496768]
[ 0.00162779]

``````
``````

In [12]:

pi_new = update_control_signal(pi, trajectory, K, k)

%matplotlib inline
import matplotlib.pyplot as plt

t = np.linspace(0, T, T)
plt.plot(t, pi)
plt.plot(t, pi_new)
plt.title('Control signals')
plt.legend(['pi_old', 'pi_new'])

``````
``````

Out[12]:

<matplotlib.legend.Legend at 0x7f6efa0b3650>

``````

## Full loop

``````

In [29]:

new_simulation = True
render_env = True

pi = np.random.randn(T, 1)

iter = 0
while(True):
if new_simulation:
trajectory, derivatives, cost = forward_pass(pi)
new_simulation = False

K, k = backward_pass(derivatives, lamb)

pi_new = update_control_signal(pi, trajectory, K, k)

new_trajectory, new_derivatives, new_cost = forward_pass(pi_new)

print 'cost ', iter, ': ', ((cost - new_cost)/cost)
iter += 1

if new_cost < cost:
print 'new cost < cost'
lamb /= lamb_factor

if (np.abs(cost - new_cost)/cost) < eps_converge:
print 'Converged!'
break

pi = pi_new
trajectory = new_trajectory
cost = new_cost

new_simulation = True
else:
print 'new_cost > cost'
lamb *= lamb_factor

if lamb > lamb_max:
print 'lamb > lamb_max. Dit not converge!'
break

``````
``````

cost  0 :  [ 0.04540252]
new cost < cost
cost  1 :  [-1.68088317]
new_cost > cost
cost  2 :  [ 0.0176433]
new cost < cost
cost  3 :  [-0.32104884]
new_cost > cost
cost  4 :  [-0.04419413]
new_cost > cost
cost  5 :  [-0.00715395]
new_cost > cost
cost  6 :  [-0.00117138]
new_cost > cost
cost  7 :  [-0.0002323]
new_cost > cost
lamb > lamb_max. Dit not converge!

``````