Unit testing


In [1]:
from importlib import reload

In [15]:
import matplotlib.pyplot as plt

gathering_game class


In [117]:
import gathering

In [118]:
reload(gathering)


Out[118]:
<module 'gathering' from '/home/roby/ssd/modules/gathering.py'>

In [119]:
from gathering import gathering_game

In [132]:
# test the gathering_game class: constructor and init functions
game_pars={}
game_pars['gamma']=.99
game_pars['N_apples']=2
game_pars['N_tagged']=5
# fixed geometry and observation parameters; should not be changed
game_pars['W'] = 33 # Width, always odd
game_pars['H'] = 11 # Height, always odd
game_pars['size_obs_ahead'] = 15 # number of sites the players can see in front of them
game_pars['size_obs_side'] = 10 # number of sites the players can see on their side

test_game = gathering_game(game_pars)
print('pars',test_game.pars)
print(test_game.dir)
print(test_game.s.shape)
test_game.show_screen(show=True)


pars {'W': 33, 'size_obs_side': 10, 'N_tagged': 5, 'H': 11, 'size_obs_ahead': 15, 'gamma': 0.99, 'N_apples': 2}
[0 0]
(3, 33, 11)
Direction 0: right
Direction 1: right
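
As a quick sanity check of the constructor (plain asserts, hypothetical and not part of the class), the state tensor should be RGB over the full board and both players should start facing direction 0, i.e. right:

In [ ]:
# Hypothetical sanity checks against the values printed above.
assert test_game.s.shape == (3, game_pars['W'], game_pars['H'])  # RGB x W x H
assert list(test_game.dir) == [0, 0]   # both players face direction 0 ("right")
assert test_game.pars['gamma'] == 0.99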

In [133]:
# s_t, a_{0,t}, a_{1,t}, s_{t+1}
#r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['rotate_right'])
#test_game.show_screen(show=True)
#
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['rotate_right'], test_game.actions_dict['rotate_right'])
test_game.show_screen(show=True)
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['step_forward'])
test_game.show_screen(show=True)
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['step_forward'])
test_game.show_screen(show=True)
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['step_forward'])
test_game.show_screen(show=True)
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['step_forward'])
test_game.show_screen(show=True)
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['step_forward'])
test_game.show_screen(show=True)


In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 rot,a1 rot
Direction 0: down
Direction 1: down
In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 rot,a1 mov
Direction 0: down
Direction 1: down
In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 rot,a1 mov
Direction 0: down
Direction 1: down
In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 rot,a1 mov
Direction 0: down
Direction 1: down
In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 rot,a1 mov
Direction 0: down
Direction 1: down
In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 rot,a1 mov
Direction 0: down
Direction 1: down
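
For reference, the action set driving these tests can be listed from actions_dict; its 8 entries are what C_out = 8 refers to in the dqn tests further down:

In [ ]:
# List the 8 actions by index (the names appearing throughout this notebook).
for name, idx in sorted(test_game.actions_dict.items(), key=lambda kv: kv[1]):
    print(idx, name)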

In [134]:
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['rotate_left'])
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['rotate_left'])
test_game.show_screen(show=True)


In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 rot,a1 rot
In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 rot,a1 rot
Direction 1: up
Direction 0: down

In [135]:
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['use_beam'], test_game.actions_dict['use_beam'])
test_game.show_screen(show=True)
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['use_beam'], test_game.actions_dict['use_beam'])
test_game.show_screen(show=True)


In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 beam,a1 beam
In use_beam: opponent hit!
In use_beam: opponent hit!
Direction 1: up
Direction 0: down
In update_status_apples: t_apples []
In update_tagged: t_tagged [-1, -1]
a0 beam,a1 beam
In use_beam: opponent hit!
In use_beam: opponent hit!
Direction 1: up
Direction 0: down

In [141]:
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['stand_still'])
test_game.show_screen(show=True)


In update_status_apples: t_apples []
In update_tagged: t_tagged [6, 6]
a0 rot,a1 rot
Direction 1: up
Direction 0: down
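
Two beam hits tagged both players: t_tagged switched from -1 (inactive) to 6 above, apparently an N_tagged + 1 = 6 step countdown, though the exact convention isn't shown in this transcript. One hedged way to check is to idle and watch update_tagged count the timers back down to -1:

In [ ]:
# Idle for a few steps; the t_tagged values in the log should decay back to -1.
for _ in range(7):
    test_game.transition_and_get_reward(test_game.actions_dict['stand_still'],
                                        test_game.actions_dict['stand_still'])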

In [73]:
test_game.reset()

In [101]:
# s_t, a_{0,t}, a_{1,t}, s_{t+1}
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['rotate_right'])
test_game.show_screen(show=True)
#
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['step_left'], test_game.actions_dict['use_beam'])
test_game.show_screen(show=True)


In update_status_apples: t_apples []
a0 rot,a1 rot
Direction 0: right
Direction 1: down
In update_status_apples: t_apples []
a0 mov,a1 beam
in get_new_pos, pl 0 cur_pos 12 7
cur_dir,a 0 1
new_x,new_y 12 12.0 8 8.0
in move_and_update_apples, pl 0 pos 12 7
s: 255
sh gr (5,) (5,)
pos_pl [12, 5] pos_opp [12, 8] gr_xs,gr_ys [12 12 12 12 12] [0 1 2 3 4]
Direction 0: right
Direction 1: down

In [102]:
test_game.show_screen(show=True)


Direction 0: right
Direction 1: down

In [103]:
# test of observation functions
# test of obs_0  
#r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['rotate_right'], test_game.actions_dict['rotate_left'])
#test_game.show_screen()
#print('Reward', r0,r1)
obs_0_s=test_game.obs_0()
to_show = obs_0_s.transpose((2,1,0))
print(to_show.shape)
plt.imshow(to_show,origin='lower')
plt.show()
# test of obs_1
obs_1_s=test_game.obs_1()
to_show = obs_1_s.transpose((2,1,0))
print(to_show.shape)
plt.imshow(to_show,origin='lower')
plt.show()


In obs_0, pos_0: 12 8
(21, 16, 3)
In obs_1, pos_1: 12 5
(21, 16, 3)
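
The observation shapes follow directly from the parameters: size_obs_ahead + 1 = 16 sites along the facing direction (apparently including the player's own cell) by 2*size_obs_side + 1 = 21 sites across, i.e. the (3, 16, 21) windows transposed above for plotting:

In [ ]:
# Observation window size, derived from the game parameters.
obs_H = game_pars['size_obs_ahead'] + 1     # 16 sites ahead, incl. own cell
obs_W = 2*game_pars['size_obs_side'] + 1    # 21 sites across
assert test_game.obs_0().shape == (3, obs_H, obs_W)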

In [8]:
test_game.reset()
test_game.show_screen()


Direction 0: right
Direction 1: right

In [20]:
for i in range(15):
    test_game.transition_and_get_reward(test_game.actions_dict['step_forward'], test_game.actions_dict['step_forward'])
test_game.show_screen()


In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 0 7
cur_dir,a 0 0
new_x,new_y 1 1.0 7 7.0
in get_new_pos, pl 1 cur_pos 0 5
cur_dir,a 0 0
new_x,new_y 1 1.0 5 5.0
in move_and_update_apples, pl 0 pos 0 7
s: 255
in move_and_update_apples, pl 1 pos 0 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 1 7
cur_dir,a 0 0
new_x,new_y 2 2.0 7 7.0
in get_new_pos, pl 1 cur_pos 1 5
cur_dir,a 0 0
new_x,new_y 2 2.0 5 5.0
in move_and_update_apples, pl 0 pos 1 7
s: 255
in move_and_update_apples, pl 1 pos 1 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 2 7
cur_dir,a 0 0
new_x,new_y 3 3.0 7 7.0
in get_new_pos, pl 1 cur_pos 2 5
cur_dir,a 0 0
new_x,new_y 3 3.0 5 5.0
in move_and_update_apples, pl 0 pos 2 7
s: 255
in move_and_update_apples, pl 1 pos 2 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 3 7
cur_dir,a 0 0
new_x,new_y 4 4.0 7 7.0
in get_new_pos, pl 1 cur_pos 3 5
cur_dir,a 0 0
new_x,new_y 4 4.0 5 5.0
in move_and_update_apples, pl 0 pos 3 7
s: 255
in move_and_update_apples, pl 1 pos 3 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 4 7
cur_dir,a 0 0
new_x,new_y 5 5.0 7 7.0
in get_new_pos, pl 1 cur_pos 4 5
cur_dir,a 0 0
new_x,new_y 5 5.0 5 5.0
in move_and_update_apples, pl 0 pos 4 7
s: 255
in move_and_update_apples, pl 1 pos 4 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 5 7
cur_dir,a 0 0
new_x,new_y 6 6.0 7 7.0
in get_new_pos, pl 1 cur_pos 5 5
cur_dir,a 0 0
new_x,new_y 6 6.0 5 5.0
in move_and_update_apples, pl 0 pos 5 7
s: 255
in move_and_update_apples, pl 1 pos 5 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 6 7
cur_dir,a 0 0
new_x,new_y 7 7.0 7 7.0
in get_new_pos, pl 1 cur_pos 6 5
cur_dir,a 0 0
new_x,new_y 7 7.0 5 5.0
in move_and_update_apples, pl 0 pos 6 7
s: 255
in move_and_update_apples, pl 1 pos 6 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 7 7
cur_dir,a 0 0
new_x,new_y 8 8.0 7 7.0
in get_new_pos, pl 1 cur_pos 7 5
cur_dir,a 0 0
new_x,new_y 8 8.0 5 5.0
in move_and_update_apples, pl 0 pos 7 7
s: 255
in move_and_update_apples, pl 1 pos 7 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 8 7
cur_dir,a 0 0
new_x,new_y 9 9.0 7 7.0
in get_new_pos, pl 1 cur_pos 8 5
cur_dir,a 0 0
new_x,new_y 9 9.0 5 5.0
in move_and_update_apples, pl 0 pos 8 7
s: 255
in move_and_update_apples, pl 1 pos 8 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 9 7
cur_dir,a 0 0
new_x,new_y 10 10.0 7 7.0
in get_new_pos, pl 1 cur_pos 9 5
cur_dir,a 0 0
new_x,new_y 10 10.0 5 5.0
in move_and_update_apples, pl 0 pos 9 7
s: 255
in move_and_update_apples, pl 1 pos 9 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 10 7
cur_dir,a 0 0
new_x,new_y 11 11.0 7 7.0
in get_new_pos, pl 1 cur_pos 10 5
cur_dir,a 0 0
new_x,new_y 11 11.0 5 5.0
in move_and_update_apples, pl 0 pos 10 7
s: 255
in move_and_update_apples, pl 1 pos 10 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 11 7
cur_dir,a 0 0
new_x,new_y 12 12.0 7 7.0
in get_new_pos, pl 1 cur_pos 11 5
cur_dir,a 0 0
new_x,new_y 12 12.0 5 5.0
in move_and_update_apples, pl 0 pos 11 7
s: 255
in move_and_update_apples, pl 1 pos 11 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 12 7
cur_dir,a 0 0
new_x,new_y 13 13.0 7 7.0
in get_new_pos, pl 1 cur_pos 12 5
cur_dir,a 0 0
new_x,new_y 13 13.0 5 5.0
in move_and_update_apples, pl 0 pos 12 7
s: 255
in move_and_update_apples, pl 1 pos 12 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 13 7
cur_dir,a 0 0
new_x,new_y 14 14.0 7 7.0
in get_new_pos, pl 1 cur_pos 13 5
cur_dir,a 0 0
new_x,new_y 14 14.0 5 5.0
in move_and_update_apples, pl 0 pos 13 7
s: 255
in move_and_update_apples, pl 1 pos 13 5
s: 255
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 14 7
cur_dir,a 0 0
new_x,new_y 15 15.0 7 7.0
in get_new_pos, pl 1 cur_pos 14 5
cur_dir,a 0 0
new_x,new_y 15 15.0 5 5.0
in move_and_update_apples, pl 0 pos 14 7
s: 255
in move_and_update_apples, pl 1 pos 14 5
s: 255
Direction 0: right
Direction 1: right

In [37]:
#r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['stand_still'], test_game.actions_dict['stand_still'])
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['step_forward'], test_game.actions_dict['step_forward'])
#r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['step_left'], test_game.actions_dict['step_right'])
test_game.show_screen()
print('Reward',r0,r1)


In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 20 8
cur_dir,a 0 0
new_x,new_y 21 21.0 8 8.0
in get_new_pos, pl 1 cur_pos 20 4
cur_dir,a 0 0
new_x,new_y 21 21.0 4 4.0
in move_and_update_apples, pl 0 pos 20 8
s: 255
in move_and_update_apples, pl 1 pos 20 4
s: 255
Direction 0: right
Direction 1: right
Reward 0 0

In [195]:
r0,r1=test_game.transition_and_get_reward(test_game.actions_dict['step_right'], test_game.actions_dict['step_right'])
test_game.show_screen()
print('Reward', r0,r1)


In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 15 7
cur_dir,a 0 3
new_x,new_y 15 15.0 6 6.0
in get_new_pos, pl 1 cur_pos 15 5
cur_dir,a 0 3
new_x,new_y 15 15.0 4 4.0
In move_and_update_apples, got an apple!
in move_and_update_apples, pl 0 pos 15 7
s: 255
in move_and_update_apples, pl 1 pos 15 5
s: 255
Direction 0: right
Direction 1: right
Reward 1 0
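
This pins down the reward convention: the player that walks onto an apple receives 1, the other 0. A throwaway assertion against the values just returned:

In [ ]:
assert (r0, r1) == (1, 0)  # player 0 collected the apple on this step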

In [10]:
# test the transition functions by performing random moves:
import time
import numpy as np  # needed by the action sampling below
def random_actions():
    # init
    game = gathering_game(game_pars)
    # play N random actions and show on screen
    N = 5
    for t in range(N):
        print('Time',game.global_time)
        a0,a1 = (8*np.random.random((2,))).astype(int)
        for k,v in game.actions_dict.items():
            if a0 == v:
                print('Action 0:',k)
            if a1 == v:
                print('Action 1:',k)
        game.transition_and_get_reward(a0, a1)
        game.show_screen()
        time.sleep(1)
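
The 8*np.random.random line above works, but np.random.randint states the intent more directly; an equivalent draw of two action indices:

In [ ]:
# Equivalent sampling of two uniform action indices in [0, 8).
a0, a1 = np.random.randint(8, size=2)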

In [11]:
random_actions()


Time 0
Action 1: use_beam
Action 0: step_backward
In update_status_apples: t_apples []
a0 mov,a1 beam
in get_new_pos, pl 0 cur_pos 0 7
cur_dir,a 0 2
new_x,new_y -1 -1.0 7 7.0
out of box
in move_and_update_apples, pl 0 pos 0 7
s: 255
sh gr (32,) (32,)
pos_pl [0, 5] pos_opp [0, 7] gr_xs,gr_ys [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
 26 27 28 29 30 31 32] [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
Direction 0: right
Direction 1: right
Time 1
Action 0: rotate_right
Action 1: rotate_left
In update_status_apples: t_apples []
a0 rot,a1 rot
Direction 0: down
Direction 1: up
Time 2
Action 0: step_right
Action 1: rotate_right
In update_status_apples: t_apples []
a0 mov,a1 rot
in get_new_pos, pl 0 cur_pos 0 7
cur_dir,a 3 3
new_x,new_y -1 -1.0 7 7.0
out of box
in move_and_update_apples, pl 0 pos 0 7
s: 255
Direction 1: right
Direction 0: down
Time 3
Action 0: step_backward
Action 1: rotate_left
In update_status_apples: t_apples []
a0 mov,a1 rot
in get_new_pos, pl 0 cur_pos 0 7
cur_dir,a 3 2
new_x,new_y 0 3.06161699787e-16 8 8.0
in move_and_update_apples, pl 0 pos 0 7
s: 255
Direction 0: down
Direction 1: up
Time 4
Action 0: step_left
Action 1: step_forward
In update_status_apples: t_apples []
a0,a1 mov
in get_new_pos, pl 0 cur_pos 0 8
cur_dir,a 3 1
new_x,new_y 1 1.0 8 8.0
in get_new_pos, pl 1 cur_pos 0 5
cur_dir,a 1 0
new_x,new_y 0 6.12323399574e-17 6 6.0
in move_and_update_apples, pl 0 pos 0 8
s: 255
in move_and_update_apples, pl 1 pos 0 5
s: 255
Direction 0: down
Direction 1: up

dqn class


In [4]:
reload(dqn_file)


Out[4]:
<module 'dqn_file' from '/home/roby/ssd/modules/dqn_file.py'>

In [14]:
import dqn_file
from dqn_file import dqn

In [15]:
# test of dqn: OK
import torch
import torch.autograd as autograd

C_in = 3 # for RGB
C_H = 32 # number of hidden units (or channels)
C_out = 8 # number of actions.
kernel_size = 5 
stride = 2
# width and height of observation region
obs_window_W = 21
obs_window_H = 16

model_test = dqn(C_in, C_H, C_out, kernel_size, stride, obs_window_H, obs_window_W)
for p in model_test.parameters():
    print(p.size())

# test with a random sample (use unsqueeze to add a batch dimension)
x_test = autograd.Variable(torch.randn(C_in, obs_window_H, obs_window_W).unsqueeze(0))
print('x',x_test.size(),type(x_test))
y_pred = model_test(x_test)
print(y_pred.data)
print(y_pred.data.max(1))
print(y_pred.data.max(1)[1])


torch.Size([32, 3, 5, 5])
torch.Size([32])
torch.Size([1728, 1728])
torch.Size([1728])
torch.Size([32, 32, 5, 5])
torch.Size([32])
torch.Size([8, 96])
torch.Size([8])
x torch.Size([1, 3, 16, 21]) <class 'torch.autograd.variable.Variable'>

 0.0000  0.0753  0.0824  0.0000  0.1008  0.0000  0.0000  0.0000
[torch.FloatTensor of size 1x8]

(
 0.1008
[torch.FloatTensor of size 1x1]
, 
 4
[torch.LongTensor of size 1x1]
)

 4
[torch.LongTensor of size 1x1]
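
The dqn module isn't reproduced in this transcript. From the printed parameter shapes one can reconstruct most of the architecture; the sketch below is an assumption, not the actual class: it reproduces the two conv layers and the Linear(96, 8) head implied by torch.Size([8, 96]), but omits the 1728 x 1728 linear layer visible in the parameter list, whose place in the forward pass can't be inferred here.

In [ ]:
import torch.nn as nn
import torch.nn.functional as F

class dqn_sketch(nn.Module):
    """Hypothetical stand-in for dqn, inferred from the parameter shapes above."""
    def __init__(self, C_in, C_H, C_out, kernel_size, stride, H, W):
        super(dqn_sketch, self).__init__()
        self.conv1 = nn.Conv2d(C_in, C_H, kernel_size, stride=stride)
        self.conv2 = nn.Conv2d(C_H, C_H, kernel_size, stride=stride)
        def conv_out(d):
            # spatial size after the two valid convolutions
            d = (d - kernel_size) // stride + 1
            return (d - kernel_size) // stride + 1
        # For H, W = 16, 21 this gives 1 x 3, i.e. 32*1*3 = 96 features,
        # matching the printed head weight torch.Size([8, 96]).
        self.head = nn.Linear(C_H * conv_out(H) * conv_out(W), C_out)
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        # the zeros in y_pred above hint at a final ReLU
        return F.relu(self.head(x.view(x.size(0), -1)))

Note that y_pred.data.max(1) returns a (values, indices) pair, so max(1)[1], here 4, is the index of the greedy action.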

q_learner class


In [6]:
import q_learner_file

In [125]:
reload(q_learner_file)


Out[125]:
<module 'q_learner_file' from '/home/roby/ssd/modules/q_learner_file.py'>

In [126]:
from q_learner_file import experience, replay_memory, q_learner

In [11]:
# test the experience namedtuple; all its members are torch tensors: OK
import torch
s = torch.randn(3,2,2).unsqueeze(0)
a = torch.Tensor([1])
sp = torch.randn(3,2,2).unsqueeze(0)
r = torch.Tensor([0])
test_exp = experience(s,a,r,sp)
test_exp.next_observation


Out[11]:
(0 ,0 ,.,.) = 
  1.2230  2.0554
 -1.9772 -0.9009

(0 ,1 ,.,.) = 
 -0.0195  0.5406
  0.0003  2.2373

(0 ,2 ,.,.) = 
 -0.0450 -1.2376
 -0.0837  0.1768
[torch.FloatTensor of size 1x3x2x2]
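
Neither experience nor replay_memory is shown in this transcript; given the field accesses here, the push/sample calls in the next cell, and the .memory list inspected at the end of the section, they are presumably along these lines (a sketch, not the actual module):

In [ ]:
import random
from collections import namedtuple

# Presumed definition; field names match the attributes used in this section.
experience = namedtuple('experience',
                        ('observation', 'action', 'reward', 'next_observation'))

class replay_memory_sketch(object):
    """Hypothetical stand-in for replay_memory: a fixed-capacity ring buffer."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0
    def push(self, s, a, r, sp):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = experience(s, a, r, sp)
        self.position = (self.position + 1) % self.capacity
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)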

In [16]:
# test of memory: OK
import torch
import torch.autograd as autograd
N=5
batch_size = 2
rpl_mem_test = replay_memory(N)
for i in range(N):
    s = torch.randn(3,obs_window_W,obs_window_H).unsqueeze(0)  # note: W/H order differs from the dqn test above; the flattened conv output (96) happens to match either way
    a = torch.floor(torch.rand(1)*8)
    sp = torch.randn(3,obs_window_W,obs_window_H).unsqueeze(0)
    r = torch.randn(1)
    rpl_mem_test.push(s,a,r,sp)  

# create batch as in optimize
sample_experience = rpl_mem_test.sample(batch_size)
minibatch = experience(*zip(*sample_experience))
next_obs_batch = autograd.Variable(torch.cat(minibatch.next_observation),
                                     volatile=True)
obs_batch = autograd.Variable(torch.cat(minibatch.observation))
action_batch = autograd.Variable(torch.cat(minibatch.action))
reward_batch = autograd.Variable(torch.cat(minibatch.reward))

# test that gradients stay disabled for max_Q_next_obs = max_a Q(next_obs, a), using the previous test model
max_Q_next_obs = model_test(next_obs_batch).max(1)[0]
print(max_Q_next_obs.data)
print(max_Q_next_obs.creator)
print(max_Q_next_obs.grad)
print(max_Q_next_obs.requires_grad)
print(max_Q_next_obs.volatile)
max_Q_next_obs.volatile = False
print('after volatile=False')
print(max_Q_next_obs.requires_grad)
print(max_Q_next_obs.volatile)


 0.1744
 0.1628
[torch.FloatTensor of size 2x1]

None
None
False
True
after volatile=False
False
False
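
The cell above builds the batch exactly as optimize presumably does; a hedged sketch of the rest of that step follows (Bellman target plus Huber loss; the loss and optimizer choices are assumptions, only the names printed in the logs are taken from the module):

In [ ]:
import torch.nn.functional as F

def optimize_sketch(model, optimizer, rpl_memory, batch_size, gamma):
    # Skip until the memory holds a full batch (cf. the "do nothing" log
    # in the perceive test below).
    if len(rpl_memory.memory) < batch_size:
        return
    minibatch = experience(*zip(*rpl_memory.sample(batch_size)))
    obs = autograd.Variable(torch.cat(minibatch.observation))
    actions = autograd.Variable(torch.cat(minibatch.action).long().view(-1, 1))
    rewards = autograd.Variable(torch.cat(minibatch.reward).view(-1, 1))
    next_obs = autograd.Variable(torch.cat(minibatch.next_observation),
                                 volatile=True)
    q_sa = model(obs).gather(1, actions)     # Q(s, a) for the taken actions
    max_q_next = model(next_obs).max(1)[0]   # max_a Q(s', a), no grad (volatile)
    max_q_next.volatile = False              # re-attach as a constant target
    target = rewards + gamma * max_q_next
    loss = F.smooth_l1_loss(q_sa, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()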

In [127]:
# q_learner parameters
qpars = {}
qpars['C_in'] = 3 # for RGB
qpars['C_H'] = 32 # number of hidden units (or channels)
qpars['C_out'] = 8 # number of actions.
qpars['kernel_size'] = 5 
qpars['stride'] = 2
qpars['obs_window_W'] = 21
qpars['obs_window_H'] = 16
qpars['capacity'] = 5
qpars['batch_size'] = 2
qpars['gamma'] = .99
qpars['eps_start'] = 0.9
qpars['eps_end'] = 0.05
qpars['decay_rate'] = 200

agent_test = q_learner(qpars)

In [128]:
# test of preprocess_obs and policy: OK
my_obs = agent_test.preprocess_obs(test_game.obs_0())
agent_test.eps_greedy(my_obs, 0.5)


pos_0 0 7
In pad_and_slice: obs_window_up, obs_window_down, obs_window_right, obs_window_left 11 0 16 0
In pad_and_slice: pad_up, pad_down, pad_right, pad_left 7 3 0 0
In pad_and_slice: slc_screen_x, slc_screen_y slice(0, 16, None) slice(0, 11, None)
In obs_0: ret.shape, right one (3, 16, 21) (3, 16, 21)
rand 0.972882396263042 eps 0.5
In eps_greedy
Out[128]:
 7
[torch.LongTensor of size 1x1]
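
eps_greedy itself isn't shown; the "rand ... eps ..." log line and the 1x1 LongTensor return value suggest the usual pattern, sketched here with assumed internals:

In [ ]:
import random

def eps_greedy_sketch(model, obs, eps, n_actions=8):
    # Exploit with probability 1 - eps, otherwise pick a uniform random action.
    if random.random() > eps:
        return model(autograd.Variable(obs, volatile=True)).data.max(1)[1]
    return torch.LongTensor([[random.randrange(n_actions)]])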

In [129]:
# test of perceive: OK
import random
s=test_game.obs_0()
a=torch.LongTensor([[random.randrange(agent_test.C_out)]])
r=1
sp=test_game.obs_1()
t=50
agent_test.perceive(s,a,r,sp,t)


pos_0 0 7
In pad_and_slice: obs_window_up, obs_window_down, obs_window_right, obs_window_left 11 0 16 0
In pad_and_slice: pad_up, pad_down, pad_right, pad_left 7 3 0 0
In pad_and_slice: slc_screen_x, slc_screen_y slice(0, 16, None) slice(0, 11, None)
In obs_0: ret.shape, right one (3, 16, 21) (3, 16, 21)
pos_1 0 5
In pad_and_slice: obs_window_up, obs_window_down, obs_window_right, obs_window_left 11 0 16 0
In pad_and_slice: pad_up, pad_down, pad_right, pad_left 5 5 0 0
In pad_and_slice: slc_screen_x, slc_screen_y slice(0, 16, None) slice(0, 11, None)
In obs_1: ret.shape, right one (3, 16, 21) (3, 16, 21)
rand 0.7801092642062027 eps 0.711980665611
In eps_greedy
In q_learner.optimize: do nothing, len(rpl_memory), bacth_size 1 2
Out[129]:
 7
[torch.LongTensor of size 1x1]
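
The annealed eps logged above (0.711980665611 at t = 50) matches exponential decay between eps_start and eps_end, so perceive presumably computes eps as eps_end + (eps_start - eps_end) * exp(-t / decay_rate):

In [ ]:
import math

# Reproduces the eps value logged by perceive at t = 50.
t = 50
eps = qpars['eps_end'] + (qpars['eps_start'] - qpars['eps_end'])*math.exp(-t/qpars['decay_rate'])
print(eps)   # ~0.711980665611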

In [123]:
len(agent_test.rpl_memory.memory)


Out[123]:
2