## IMPORTANT:

initial trajectory is important for final solution, with good initial trajectory it finds solution really easy. This suggests to use the old trajectory initial input to iLQG algorithm

``````

In [3]:

import numpy as np
from drl.ilqg import ilqg, LearnedDynamics

env = TwoLinkArm(g=0., wp=10., wv=1., wu=0.001)

N = 5 # number of future steps for iLQG
Nf = 2 # number of time-steps ahead and after current time-step for fitting linear model
num_episodes = 25
max_steps = 75

full_state = True

model = LearnedDynamics(max_steps, num_episodes, env.state_dim, env.action_dim, Nf)

``````
``````

In [4]:

x = env.reset(full_state=full_state)
x0 = env.q
goal = env.goal

# Initialize random control sequence
u = np.random.randn(max_steps, env.action_dim)

# Simulate system once
reward = 0.
for i_step in range(max_steps):
env.render()

x_new, r, t, _ = env.step(u[i_step,:], full_state=full_state)

x = x_new
reward += r
print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f' % (0, i_step+1, reward, reward/i_step))

# Only use first N control inputs for iLQG estimator
u = u[:N,:]

for i_episode in range(1, num_episodes):
# Fit models
model.fit()

x = env.reset(x0, goal, full_state=full_state)
terminal = False
i_step = 0
reward = 0.

for i_step in range(max_steps):
env.render()

model.set_cur_step(i_step)

_, u, L, Vx, Vxx, cost = ilqg(model.dynamics_func, env.cost_func, x, u, {})

# Take step
x_new, r, t, _ = env.step(u[0, :], full_state=full_state)

model.add(i_episode, i_step, x, u[0, :], x_new)

u = np.concatenate((u[1:,:], np.random.randn(1, env.action_dim)))

x = x_new
reward += r
i_step += 1

if t:
break

print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f' % (i_episode, i_step, reward, reward/i_step))

``````
``````

Iter 0, Steps 75, Reward: -1436.63, Average reward: -19.41
Iter 1, Steps 75, Reward: -998.66, Average reward: -13.32
Iter 2, Steps 75, Reward: -823.61, Average reward: -10.98
Iter 3, Steps 75, Reward: -694.03, Average reward: -9.25
Iter 4, Steps 75, Reward: -721.71, Average reward: -9.62
Iter 5, Steps 75, Reward: -702.38, Average reward: -9.37
Iter 6, Steps 75, Reward: -714.97, Average reward: -9.53
Iter 7, Steps 75, Reward: -750.33, Average reward: -10.00
Iter 8, Steps 75, Reward: -703.63, Average reward: -9.38
Iter 9, Steps 75, Reward: -702.08, Average reward: -9.36
Iter 10, Steps 75, Reward: -681.44, Average reward: -9.09
Iter 11, Steps 75, Reward: -689.65, Average reward: -9.20
Iter 12, Steps 75, Reward: -690.33, Average reward: -9.20
Iter 13, Steps 75, Reward: -691.93, Average reward: -9.23
Iter 14, Steps 75, Reward: -681.74, Average reward: -9.09
Iter 15, Steps 75, Reward: -689.35, Average reward: -9.19
Iter 16, Steps 75, Reward: -696.37, Average reward: -9.28
Iter 17, Steps 75, Reward: -698.86, Average reward: -9.32
Iter 18, Steps 75, Reward: -690.55, Average reward: -9.21
Iter 19, Steps 75, Reward: -693.54, Average reward: -9.25
Iter 20, Steps 75, Reward: -691.59, Average reward: -9.22
Iter 21, Steps 75, Reward: -692.29, Average reward: -9.23
Iter 22, Steps 75, Reward: -690.22, Average reward: -9.20
Iter 23, Steps 75, Reward: -690.10, Average reward: -9.20
Iter 24, Steps 75, Reward: -689.38, Average reward: -9.19

``````
``````

In [5]:

env.render(close=True)

``````