In [1]:
import numpy as np
from scipy.linalg import *
from drl.env.arm import TwoLinkArm
import time
%matplotlib notebook
In [2]:
epsilon = 1e-5
Ts = 10
env = TwoLinkArm(g=0.)
Q = np.eye(env.state_dim)*100.
R = np.eye(env.action_dim)*1.
In [3]:
def calc_derivatives(x, u):
A = np.zeros((env.state_dim, env.state_dim))
for i in range(env.state_dim):
x_tmp = x.copy()
x_tmp[i] += epsilon
_, f_1 = env.dynamics_func(x_tmp, u)
x_tmp = x.copy()
x_tmp[i] -= epsilon
_, f_2 = env.dynamics_func(x_tmp, u)
fxdx = (f_1 - f_2) / (2*epsilon)
A[:, i] = fxdx
B = np.zeros((env.state_dim, env.action_dim))
for i in range(env.action_dim):
u_tmp = u.copy()
u_tmp[i] += epsilon
_, f_1 = env.dynamics_func(x, u_tmp)
u_tmp = u.copy()
u_tmp[i] -= epsilon
_, f_2 = env.dynamics_func(x, u_tmp)
fxdu = (f_1 - f_2) / (2*epsilon)
B[:, i] = fxdu
return A, B
In [4]:
def run_experiment():
x = env.reset()
env.render()
u = [0.]*env.action_dim
V = 0.
for _ in range(int(Ts/env.dt)):
# Calculate optimal feedback gain K
error = x - env.get_goal()
A, B = calc_derivatives(error, u)
P = solve_continuous_are(A, B, Q, R)
K = np.dot(np.linalg.pinv(R), np.dot(B.T, P))
u = -np.dot(K, error)
x, _, _, _ = env.step(u)
env.render()
V += np.dot(error.T, np.dot(P, error))
print('Episode %s - Total Cost: %s' % (str(i), V))
return env.goal - x
In [5]:
for i in range(5):
error = run_experiment()
In [ ]:
env.render(close=True)
In [ ]:
num_targets = 5
def run_experiment2():
x = env.reset()
env.render()
u = [0.]*env.action_dim
V_tot = 0.
for i in range(num_targets):
env.set_goal()
goal_reached = False
while not goal_reached:
error = x - env.get_goal()
A, B = calc_derivatives(error, u)
P = solve_continuous_are(A, B, Q, R)
K = np.dot(np.linalg.pinv(R), np.dot(B.T, P))
u = -np.dot(K, error)
x, _, _, _ = env.step(u)
env.render()
V = np.dot(error.T, np.dot(P, error))
V_tot += V
if V < 1e-4:
goal_reached = True
print('Target %s - Total Cost: %s' % (str(i), V_tot))
In [ ]:
run_experiment2()
In [ ]:
env.render(close=True)