In [0]:
# !apt-get install python-opengl -y  >/dev/null
# !apt install xvfb -y >/dev/null


WARNING: apt does not have a stable CLI interface. Use with caution in scripts.


In [0]:
# !pip install pyvirtualdisplay >/dev/null
# !pip install pyglet >/dev/null

In [0]:
# from pyvirtualdisplay import Display
# display = Display(visible=0, size=(1400, 900))
# display.start()


Out[0]:
<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1005'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1005'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
!pip install gym >/dev/null

In [0]:
!pip install JSAnimation >/dev/null

In [0]:
%matplotlib inline
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display

def display_frames_as_gif(frames):
    """
    Render a sequence of RGB frames (H x W x 3 arrays) inline as a JS
    animation with playback controls.
    """
    height, width = frames[0].shape[0], frames[0].shape[1]
    # size the figure at 72 px/inch, rendered at dpi=144 for a sharper image
    plt.figure(figsize=(width / 72.0, height / 72.0), dpi = 144)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def update(frame_idx):
        # swap the image data in place; one call per animation step
        patch.set_data(frames[frame_idx])

    anim = animation.FuncAnimation(plt.gcf(), update, frames = len(frames), interval=50)
    display(display_animation(anim, default_mode='once'))

Step 2: Playing Pong


In [4]:
import gym
# Atari Pong: observations are 210x160x3 RGB frames, action space is Discrete(6)
# (see the Out[] cells below).
env = gym.make('Pong-v0')


/usr/local/lib/python3.6/dist-packages/gym/envs/registration.py:14: PkgResourcesDeprecationWarning: Parameters to load are deprecated.  Call .resolve and .require separately.
  result = entry_point.load(False)

In [5]:
env.action_space


Out[5]:
Discrete(6)

In [6]:
env.observation_space


Out[6]:
Box(210, 160, 3)

In [7]:
# Run a demo of the environment with a uniformly random policy.
observation = env.reset()
cumulated_reward = 0

frames = []
for t in range(1000):
    frames.append(env.render(mode = 'rgb_array'))
    # very naive agent: just samples a random action within the allowed action space
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward
    if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
else:
    # for/else: only runs when the loop exhausted all 1000 steps without `done`
    # (the original printed this unconditionally, even after a finished episode)
    print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))

env.close()


Episode finished without success, accumulated reward = -17.0

In [8]:
display_frames_as_gif(frames)




Once Loop Reflect

In [0]:
def sigmoid(x):
  """Logistic function: squashes any real input (scalar or array) into (0, 1)."""
  exp_neg = np.exp(-x)
  return 1.0 / (1.0 + exp_neg)

def prepro(I):
  """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
  I = I[35:195] # crop to the playing field
  I = I[::2,::2,0] # downsample by factor of 2, keep one color channel
  I = I.copy() # slices above are views: copy so we don't mutate the caller's frame
  I[I == 144] = 0 # erase background (background type 1)
  I[I == 109] = 0 # erase background (background type 2)
  I[I != 0] = 1 # everything else (paddles, ball) just set to 1
  # np.float was removed in NumPy 1.24 — use the builtin float (float64)
  return I.astype(float).ravel()

def policy_forward(x):
  """
  Forward pass of the two-layer policy network (reads the global `model`).
  Returns (probability of taking action 2, hidden-layer activations).
  """
  hidden = np.dot(model['W1'], x)
  np.maximum(hidden, 0, out=hidden) # ReLU, in place — same effect as hidden[hidden<0] = 0
  logit = np.dot(model['W2'], hidden)
  return sigmoid(logit), hidden

def model_step(model, observation, prev_x):
  """
  One greedy policy step: preprocess `observation`, feed the difference image
  through the policy network, and pick an action deterministically.
  Returns (action, cur_x) where cur_x is passed back in as next call's prev_x.
  """
  cur_x = prepro(observation)
  # the network input is the frame difference (motion); zero on the first call
  if prev_x is None:
    x = np.zeros(D)
  else:
    x = cur_x - prev_x

  aprob, _ = policy_forward(x)
  # greedy choice between the two paddle actions (2 and 3)
  action = 2 if aprob >= 0.5 else 3

  return action, cur_x

def play_game(env, model, max_steps = 1000):
  """
  Roll out one episode of `env` using the greedy policy in `model`
  (via model_step), then display the collected frames as a gif.

  max_steps: cap on the number of environment steps (default 1000,
  matching the original hard-coded limit).
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(max_steps):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
          print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
          break
  else:
      # for/else: only runs when the step budget ran out before the episode ended
      # (the original printed this unconditionally, even after a finished episode)
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  display_frames_as_gif(frames)

Step 3: Policy Gradient from Scratch


In [0]:
import numpy as np

# model initialization: a two-layer fully-connected policy network.
# W1 maps the 6400-pixel difference image to H hidden units; W2 maps the
# hidden layer to a single logit whose sigmoid is the probability of action 2.
H = 200 # number of hidden layer neurons
D = 80 * 80 # input dimensionality: 80x80 grid
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(H) / np.sqrt(H)

# Uncomment to resume from a previously pickled model instead of random init.
# import pickle
# model = pickle.load(open('model.pkl', 'rb'))

In [11]:
# random init model
play_game(env, model)


Episode finished without success, accumulated reward = -20.0


Once Loop Reflect

In [0]:
# hyperparameters
batch_size = 10 # every how many episodes to do a param update?
# learning_rate = 1e-4
learning_rate = 1e-3 # bumped up from the commented-out 1e-4 to learn faster

gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2

# per-parameter accumulators, keyed like `model` ('W1', 'W2')
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def discount_rewards(r, discount=None):
  """
  Take a 1D float array of per-step rewards and compute discounted returns.

  r: rewards, one entry per timestep.
  discount: discount factor in [0, 1]; defaults to the module-level `gamma`
    (added as an optional parameter — existing one-argument calls are unchanged).

  Pong specific: any nonzero reward marks the end of a game within the
  episode, so the running sum is reset at those boundaries.
  Returns a float32 array of the same length as r.
  """
  if discount is None:
    discount = gamma # fall back to the module-level hyperparameter
  discounted_r = np.zeros_like(r, dtype=np.float32)
  running_add = 0
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_backward(epx, eph, epdlogp):
  """
  Backward pass through the policy network for one whole episode.
  epx: stacked inputs, eph: stacked hidden states, epdlogp: stacked
  advantage-modulated logit gradients. Reads the global `model` for W2.
  Returns a dict of gradients keyed like `model` ('W1', 'W2').
  """
  grad_W2 = np.dot(eph.T, epdlogp).ravel()
  dhidden = np.outer(epdlogp, model['W2'])
  dhidden[eph <= 0] = 0 # ReLU gate: no gradient flows through inactive units
  grad_W1 = np.dot(dhidden.T, epx)
  return {'W1': grad_W1, 'W2': grad_W2}

def train_model(env, model, total_episodes = 100):
  """
  Train the policy network with vanilla policy gradients + RMSProp.

  env: gym Pong environment. model: dict with 'W1'/'W2' arrays, updated in
  place. Runs until `total_episodes` episodes complete and returns `hist`,
  a list of (episode_number, reward_sum, running_reward) tuples.

  Relies on module-level globals: prepro, policy_forward, policy_backward,
  discount_rewards, D, batch_size, learning_rate, decay_rate,
  grad_buffer, rmsprop_cache (the last two are mutated across calls).
  """
  hist = []
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[] # per-episode inputs, hidden states, logit grads, rewards
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:
    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      # NOTE(review): divides by std — assumes rewards within an episode are not
      # all identical (holds for Pong, where each episode scores +/-1 points)
      discounted_epr -= np.mean(discounted_epr)
      discounted_epr /= np.std(discounted_epr)

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping: exponential moving average of episode reward
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('resetting env. episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      if episode_number == total_episodes: return hist

  #   if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
  #     print (('ep %d: game finished, reward: %f' % (episode_number, reward)) + ('' if reward == -1 else ' !!!!!!!!'))

In [13]:
%time hist1 = train_model(env, model, total_episodes=500)


resetting env. episode 1.000000, reward total was -21.000000. running mean: -21.000000
resetting env. episode 2.000000, reward total was -20.000000. running mean: -20.990000
resetting env. episode 3.000000, reward total was -21.000000. running mean: -20.990100
resetting env. episode 4.000000, reward total was -19.000000. running mean: -20.970199
resetting env. episode 5.000000, reward total was -21.000000. running mean: -20.970497
resetting env. episode 6.000000, reward total was -21.000000. running mean: -20.970792
resetting env. episode 7.000000, reward total was -21.000000. running mean: -20.971084
resetting env. episode 8.000000, reward total was -21.000000. running mean: -20.971373
resetting env. episode 9.000000, reward total was -21.000000. running mean: -20.971660
resetting env. episode 10.000000, reward total was -20.000000. running mean: -20.961943
resetting env. episode 11.000000, reward total was -21.000000. running mean: -20.962324
resetting env. episode 12.000000, reward total was -20.000000. running mean: -20.952700
resetting env. episode 13.000000, reward total was -21.000000. running mean: -20.953173
resetting env. episode 14.000000, reward total was -21.000000. running mean: -20.953642
resetting env. episode 15.000000, reward total was -20.000000. running mean: -20.944105
resetting env. episode 16.000000, reward total was -21.000000. running mean: -20.944664
resetting env. episode 17.000000, reward total was -21.000000. running mean: -20.945217
resetting env. episode 18.000000, reward total was -21.000000. running mean: -20.945765
resetting env. episode 19.000000, reward total was -21.000000. running mean: -20.946308
resetting env. episode 20.000000, reward total was -21.000000. running mean: -20.946845
resetting env. episode 21.000000, reward total was -20.000000. running mean: -20.937376
resetting env. episode 22.000000, reward total was -21.000000. running mean: -20.938002
resetting env. episode 23.000000, reward total was -21.000000. running mean: -20.938622
resetting env. episode 24.000000, reward total was -19.000000. running mean: -20.919236
resetting env. episode 25.000000, reward total was -21.000000. running mean: -20.920044
resetting env. episode 26.000000, reward total was -21.000000. running mean: -20.920843
resetting env. episode 27.000000, reward total was -21.000000. running mean: -20.921635
resetting env. episode 28.000000, reward total was -20.000000. running mean: -20.912419
resetting env. episode 29.000000, reward total was -21.000000. running mean: -20.913294
resetting env. episode 30.000000, reward total was -21.000000. running mean: -20.914161
resetting env. episode 31.000000, reward total was -20.000000. running mean: -20.905020
resetting env. episode 32.000000, reward total was -20.000000. running mean: -20.895970
resetting env. episode 33.000000, reward total was -21.000000. running mean: -20.897010
resetting env. episode 34.000000, reward total was -21.000000. running mean: -20.898040
resetting env. episode 35.000000, reward total was -21.000000. running mean: -20.899059
resetting env. episode 36.000000, reward total was -21.000000. running mean: -20.900069
resetting env. episode 37.000000, reward total was -21.000000. running mean: -20.901068
resetting env. episode 38.000000, reward total was -20.000000. running mean: -20.892057
resetting env. episode 39.000000, reward total was -21.000000. running mean: -20.893137
resetting env. episode 40.000000, reward total was -21.000000. running mean: -20.894205
resetting env. episode 41.000000, reward total was -20.000000. running mean: -20.885263
resetting env. episode 42.000000, reward total was -20.000000. running mean: -20.876411
resetting env. episode 43.000000, reward total was -21.000000. running mean: -20.877647
resetting env. episode 44.000000, reward total was -21.000000. running mean: -20.878870
resetting env. episode 45.000000, reward total was -20.000000. running mean: -20.870082
resetting env. episode 46.000000, reward total was -21.000000. running mean: -20.871381
resetting env. episode 47.000000, reward total was -20.000000. running mean: -20.862667
resetting env. episode 48.000000, reward total was -21.000000. running mean: -20.864040
resetting env. episode 49.000000, reward total was -21.000000. running mean: -20.865400
resetting env. episode 50.000000, reward total was -21.000000. running mean: -20.866746
resetting env. episode 51.000000, reward total was -21.000000. running mean: -20.868078
resetting env. episode 52.000000, reward total was -21.000000. running mean: -20.869398
resetting env. episode 53.000000, reward total was -21.000000. running mean: -20.870704
resetting env. episode 54.000000, reward total was -21.000000. running mean: -20.871997
resetting env. episode 55.000000, reward total was -21.000000. running mean: -20.873277
resetting env. episode 56.000000, reward total was -21.000000. running mean: -20.874544
resetting env. episode 57.000000, reward total was -21.000000. running mean: -20.875798
resetting env. episode 58.000000, reward total was -19.000000. running mean: -20.857040
resetting env. episode 59.000000, reward total was -21.000000. running mean: -20.858470
resetting env. episode 60.000000, reward total was -21.000000. running mean: -20.859885
resetting env. episode 61.000000, reward total was -20.000000. running mean: -20.851286
resetting env. episode 62.000000, reward total was -20.000000. running mean: -20.842774
resetting env. episode 63.000000, reward total was -21.000000. running mean: -20.844346
resetting env. episode 64.000000, reward total was -21.000000. running mean: -20.845902
resetting env. episode 65.000000, reward total was -21.000000. running mean: -20.847443
resetting env. episode 66.000000, reward total was -21.000000. running mean: -20.848969
resetting env. episode 67.000000, reward total was -21.000000. running mean: -20.850479
resetting env. episode 68.000000, reward total was -21.000000. running mean: -20.851974
resetting env. episode 69.000000, reward total was -21.000000. running mean: -20.853455
resetting env. episode 70.000000, reward total was -21.000000. running mean: -20.854920
resetting env. episode 71.000000, reward total was -21.000000. running mean: -20.856371
resetting env. episode 72.000000, reward total was -21.000000. running mean: -20.857807
resetting env. episode 73.000000, reward total was -19.000000. running mean: -20.839229
resetting env. episode 74.000000, reward total was -20.000000. running mean: -20.830837
resetting env. episode 75.000000, reward total was -21.000000. running mean: -20.832529
resetting env. episode 76.000000, reward total was -17.000000. running mean: -20.794203
resetting env. episode 77.000000, reward total was -21.000000. running mean: -20.796261
resetting env. episode 78.000000, reward total was -21.000000. running mean: -20.798299
resetting env. episode 79.000000, reward total was -21.000000. running mean: -20.800316
resetting env. episode 80.000000, reward total was -21.000000. running mean: -20.802312
resetting env. episode 81.000000, reward total was -21.000000. running mean: -20.804289
resetting env. episode 82.000000, reward total was -21.000000. running mean: -20.806246
resetting env. episode 83.000000, reward total was -21.000000. running mean: -20.808184
resetting env. episode 84.000000, reward total was -20.000000. running mean: -20.800102
resetting env. episode 85.000000, reward total was -20.000000. running mean: -20.792101
resetting env. episode 86.000000, reward total was -21.000000. running mean: -20.794180
resetting env. episode 87.000000, reward total was -20.000000. running mean: -20.786238
resetting env. episode 88.000000, reward total was -21.000000. running mean: -20.788376
resetting env. episode 89.000000, reward total was -21.000000. running mean: -20.790492
resetting env. episode 90.000000, reward total was -21.000000. running mean: -20.792587
resetting env. episode 91.000000, reward total was -21.000000. running mean: -20.794661
resetting env. episode 92.000000, reward total was -20.000000. running mean: -20.786715
resetting env. episode 93.000000, reward total was -20.000000. running mean: -20.778848
resetting env. episode 94.000000, reward total was -20.000000. running mean: -20.771059
resetting env. episode 95.000000, reward total was -21.000000. running mean: -20.773349
resetting env. episode 96.000000, reward total was -21.000000. running mean: -20.775615
resetting env. episode 97.000000, reward total was -21.000000. running mean: -20.777859
resetting env. episode 98.000000, reward total was -19.000000. running mean: -20.760080
resetting env. episode 99.000000, reward total was -21.000000. running mean: -20.762479
resetting env. episode 100.000000, reward total was -21.000000. running mean: -20.764855
resetting env. episode 101.000000, reward total was -21.000000. running mean: -20.767206
resetting env. episode 102.000000, reward total was -21.000000. running mean: -20.769534
resetting env. episode 103.000000, reward total was -21.000000. running mean: -20.771839
resetting env. episode 104.000000, reward total was -21.000000. running mean: -20.774120
resetting env. episode 105.000000, reward total was -21.000000. running mean: -20.776379
resetting env. episode 106.000000, reward total was -19.000000. running mean: -20.758615
resetting env. episode 107.000000, reward total was -21.000000. running mean: -20.761029
resetting env. episode 108.000000, reward total was -19.000000. running mean: -20.743419
resetting env. episode 109.000000, reward total was -20.000000. running mean: -20.735985
resetting env. episode 110.000000, reward total was -21.000000. running mean: -20.738625
resetting env. episode 111.000000, reward total was -19.000000. running mean: -20.721239
resetting env. episode 112.000000, reward total was -21.000000. running mean: -20.724026
resetting env. episode 113.000000, reward total was -18.000000. running mean: -20.696786
resetting env. episode 114.000000, reward total was -21.000000. running mean: -20.699818
resetting env. episode 115.000000, reward total was -21.000000. running mean: -20.702820
resetting env. episode 116.000000, reward total was -21.000000. running mean: -20.705792
resetting env. episode 117.000000, reward total was -21.000000. running mean: -20.708734
resetting env. episode 118.000000, reward total was -20.000000. running mean: -20.701646
resetting env. episode 119.000000, reward total was -21.000000. running mean: -20.704630
resetting env. episode 120.000000, reward total was -21.000000. running mean: -20.707584
resetting env. episode 121.000000, reward total was -20.000000. running mean: -20.700508
resetting env. episode 122.000000, reward total was -21.000000. running mean: -20.703503
resetting env. episode 123.000000, reward total was -21.000000. running mean: -20.706468
resetting env. episode 124.000000, reward total was -21.000000. running mean: -20.709403
resetting env. episode 125.000000, reward total was -21.000000. running mean: -20.712309
resetting env. episode 126.000000, reward total was -20.000000. running mean: -20.705186
resetting env. episode 127.000000, reward total was -20.000000. running mean: -20.698134
resetting env. episode 128.000000, reward total was -21.000000. running mean: -20.701153
resetting env. episode 129.000000, reward total was -20.000000. running mean: -20.694141
resetting env. episode 130.000000, reward total was -21.000000. running mean: -20.697200
resetting env. episode 131.000000, reward total was -21.000000. running mean: -20.700228
resetting env. episode 132.000000, reward total was -20.000000. running mean: -20.693226
resetting env. episode 133.000000, reward total was -21.000000. running mean: -20.696293
resetting env. episode 134.000000, reward total was -21.000000. running mean: -20.699330
resetting env. episode 135.000000, reward total was -21.000000. running mean: -20.702337
resetting env. episode 136.000000, reward total was -19.000000. running mean: -20.685314
resetting env. episode 137.000000, reward total was -20.000000. running mean: -20.678461
resetting env. episode 138.000000, reward total was -20.000000. running mean: -20.671676
resetting env. episode 139.000000, reward total was -21.000000. running mean: -20.674959
resetting env. episode 140.000000, reward total was -21.000000. running mean: -20.678210
resetting env. episode 141.000000, reward total was -21.000000. running mean: -20.681428
resetting env. episode 142.000000, reward total was -20.000000. running mean: -20.674613
resetting env. episode 143.000000, reward total was -20.000000. running mean: -20.667867
resetting env. episode 144.000000, reward total was -21.000000. running mean: -20.671188
resetting env. episode 145.000000, reward total was -20.000000. running mean: -20.664477
resetting env. episode 146.000000, reward total was -21.000000. running mean: -20.667832
resetting env. episode 147.000000, reward total was -21.000000. running mean: -20.671153
resetting env. episode 148.000000, reward total was -21.000000. running mean: -20.674442
resetting env. episode 149.000000, reward total was -20.000000. running mean: -20.667698
resetting env. episode 150.000000, reward total was -20.000000. running mean: -20.661021
resetting env. episode 151.000000, reward total was -21.000000. running mean: -20.664410
resetting env. episode 152.000000, reward total was -20.000000. running mean: -20.657766
resetting env. episode 153.000000, reward total was -20.000000. running mean: -20.651189
resetting env. episode 154.000000, reward total was -21.000000. running mean: -20.654677
resetting env. episode 155.000000, reward total was -21.000000. running mean: -20.658130
resetting env. episode 156.000000, reward total was -21.000000. running mean: -20.661549
resetting env. episode 157.000000, reward total was -20.000000. running mean: -20.654933
resetting env. episode 158.000000, reward total was -19.000000. running mean: -20.638384
resetting env. episode 159.000000, reward total was -21.000000. running mean: -20.642000
resetting env. episode 160.000000, reward total was -20.000000. running mean: -20.635580
resetting env. episode 161.000000, reward total was -20.000000. running mean: -20.629224
resetting env. episode 162.000000, reward total was -21.000000. running mean: -20.632932
resetting env. episode 163.000000, reward total was -21.000000. running mean: -20.636603
resetting env. episode 164.000000, reward total was -20.000000. running mean: -20.630237
resetting env. episode 165.000000, reward total was -21.000000. running mean: -20.633934
resetting env. episode 166.000000, reward total was -21.000000. running mean: -20.637595
resetting env. episode 167.000000, reward total was -21.000000. running mean: -20.641219
resetting env. episode 168.000000, reward total was -20.000000. running mean: -20.634807
resetting env. episode 169.000000, reward total was -21.000000. running mean: -20.638459
resetting env. episode 170.000000, reward total was -21.000000. running mean: -20.642074
resetting env. episode 171.000000, reward total was -21.000000. running mean: -20.645653
resetting env. episode 172.000000, reward total was -21.000000. running mean: -20.649197
resetting env. episode 173.000000, reward total was -21.000000. running mean: -20.652705
resetting env. episode 174.000000, reward total was -21.000000. running mean: -20.656178
resetting env. episode 175.000000, reward total was -20.000000. running mean: -20.649616
resetting env. episode 176.000000, reward total was -20.000000. running mean: -20.643120
resetting env. episode 177.000000, reward total was -21.000000. running mean: -20.646689
resetting env. episode 178.000000, reward total was -21.000000. running mean: -20.650222
resetting env. episode 179.000000, reward total was -20.000000. running mean: -20.643720
resetting env. episode 180.000000, reward total was -20.000000. running mean: -20.637282
resetting env. episode 181.000000, reward total was -21.000000. running mean: -20.640910
resetting env. episode 182.000000, reward total was -20.000000. running mean: -20.634500
resetting env. episode 183.000000, reward total was -20.000000. running mean: -20.628155
resetting env. episode 184.000000, reward total was -21.000000. running mean: -20.631874
resetting env. episode 185.000000, reward total was -21.000000. running mean: -20.635555
resetting env. episode 186.000000, reward total was -21.000000. running mean: -20.639200
resetting env. episode 187.000000, reward total was -20.000000. running mean: -20.632808
resetting env. episode 188.000000, reward total was -20.000000. running mean: -20.626480
resetting env. episode 189.000000, reward total was -20.000000. running mean: -20.620215
resetting env. episode 190.000000, reward total was -20.000000. running mean: -20.614013
resetting env. episode 191.000000, reward total was -21.000000. running mean: -20.617872
resetting env. episode 192.000000, reward total was -21.000000. running mean: -20.621694
resetting env. episode 193.000000, reward total was -21.000000. running mean: -20.625477
resetting env. episode 194.000000, reward total was -21.000000. running mean: -20.629222
resetting env. episode 195.000000, reward total was -20.000000. running mean: -20.622930
resetting env. episode 196.000000, reward total was -21.000000. running mean: -20.626700
resetting env. episode 197.000000, reward total was -21.000000. running mean: -20.630433
resetting env. episode 198.000000, reward total was -20.000000. running mean: -20.624129
resetting env. episode 199.000000, reward total was -18.000000. running mean: -20.597888
resetting env. episode 200.000000, reward total was -20.000000. running mean: -20.591909
resetting env. episode 201.000000, reward total was -19.000000. running mean: -20.575990
resetting env. episode 202.000000, reward total was -20.000000. running mean: -20.570230
resetting env. episode 203.000000, reward total was -20.000000. running mean: -20.564528
resetting env. episode 204.000000, reward total was -21.000000. running mean: -20.568882
resetting env. episode 205.000000, reward total was -19.000000. running mean: -20.553194
resetting env. episode 206.000000, reward total was -21.000000. running mean: -20.557662
resetting env. episode 207.000000, reward total was -19.000000. running mean: -20.542085
resetting env. episode 208.000000, reward total was -20.000000. running mean: -20.536664
resetting env. episode 209.000000, reward total was -21.000000. running mean: -20.541298
resetting env. episode 210.000000, reward total was -21.000000. running mean: -20.545885
resetting env. episode 211.000000, reward total was -20.000000. running mean: -20.540426
resetting env. episode 212.000000, reward total was -20.000000. running mean: -20.535021
resetting env. episode 213.000000, reward total was -21.000000. running mean: -20.539671
resetting env. episode 214.000000, reward total was -20.000000. running mean: -20.534275
resetting env. episode 215.000000, reward total was -19.000000. running mean: -20.518932
resetting env. episode 216.000000, reward total was -20.000000. running mean: -20.513742
resetting env. episode 217.000000, reward total was -21.000000. running mean: -20.518605
resetting env. episode 218.000000, reward total was -20.000000. running mean: -20.513419
resetting env. episode 219.000000, reward total was -19.000000. running mean: -20.498285
resetting env. episode 220.000000, reward total was -21.000000. running mean: -20.503302
resetting env. episode 221.000000, reward total was -21.000000. running mean: -20.508269
resetting env. episode 222.000000, reward total was -20.000000. running mean: -20.503186
resetting env. episode 223.000000, reward total was -20.000000. running mean: -20.498154
resetting env. episode 224.000000, reward total was -19.000000. running mean: -20.483173
resetting env. episode 225.000000, reward total was -21.000000. running mean: -20.488341
resetting env. episode 226.000000, reward total was -21.000000. running mean: -20.493458
resetting env. episode 227.000000, reward total was -20.000000. running mean: -20.488523
resetting env. episode 228.000000, reward total was -20.000000. running mean: -20.483638
resetting env. episode 229.000000, reward total was -20.000000. running mean: -20.478802
resetting env. episode 230.000000, reward total was -20.000000. running mean: -20.474014
resetting env. episode 231.000000, reward total was -20.000000. running mean: -20.469273
resetting env. episode 232.000000, reward total was -20.000000. running mean: -20.464581
resetting env. episode 233.000000, reward total was -18.000000. running mean: -20.439935
resetting env. episode 234.000000, reward total was -21.000000. running mean: -20.445535
resetting env. episode 235.000000, reward total was -20.000000. running mean: -20.441080
resetting env. episode 236.000000, reward total was -20.000000. running mean: -20.436669
resetting env. episode 237.000000, reward total was -21.000000. running mean: -20.442303
resetting env. episode 238.000000, reward total was -21.000000. running mean: -20.447880
resetting env. episode 239.000000, reward total was -20.000000. running mean: -20.443401
resetting env. episode 240.000000, reward total was -19.000000. running mean: -20.428967
resetting env. episode 241.000000, reward total was -21.000000. running mean: -20.434677
resetting env. episode 242.000000, reward total was -21.000000. running mean: -20.440330
resetting env. episode 243.000000, reward total was -20.000000. running mean: -20.435927
resetting env. episode 244.000000, reward total was -18.000000. running mean: -20.411568
resetting env. episode 245.000000, reward total was -21.000000. running mean: -20.417452
resetting env. episode 246.000000, reward total was -20.000000. running mean: -20.413278
resetting env. episode 247.000000, reward total was -21.000000. running mean: -20.419145
resetting env. episode 248.000000, reward total was -21.000000. running mean: -20.424953
resetting env. episode 249.000000, reward total was -21.000000. running mean: -20.430704
resetting env. episode 250.000000, reward total was -21.000000. running mean: -20.436397
resetting env. episode 251.000000, reward total was -21.000000. running mean: -20.442033
resetting env. episode 252.000000, reward total was -20.000000. running mean: -20.437612
resetting env. episode 253.000000, reward total was -21.000000. running mean: -20.443236
resetting env. episode 254.000000, reward total was -21.000000. running mean: -20.448804
resetting env. episode 255.000000, reward total was -21.000000. running mean: -20.454316
resetting env. episode 256.000000, reward total was -19.000000. running mean: -20.439773
resetting env. episode 257.000000, reward total was -20.000000. running mean: -20.435375
resetting env. episode 258.000000, reward total was -20.000000. running mean: -20.431021
resetting env. episode 259.000000, reward total was -20.000000. running mean: -20.426711
resetting env. episode 260.000000, reward total was -19.000000. running mean: -20.412444
resetting env. episode 261.000000, reward total was -20.000000. running mean: -20.408320
resetting env. episode 262.000000, reward total was -21.000000. running mean: -20.414236
resetting env. episode 263.000000, reward total was -19.000000. running mean: -20.400094
resetting env. episode 264.000000, reward total was -21.000000. running mean: -20.406093
resetting env. episode 265.000000, reward total was -21.000000. running mean: -20.412032
resetting env. episode 266.000000, reward total was -20.000000. running mean: -20.407912
resetting env. episode 267.000000, reward total was -21.000000. running mean: -20.413833
resetting env. episode 268.000000, reward total was -20.000000. running mean: -20.409694
resetting env. episode 269.000000, reward total was -19.000000. running mean: -20.395597
resetting env. episode 270.000000, reward total was -20.000000. running mean: -20.391641
resetting env. episode 271.000000, reward total was -19.000000. running mean: -20.377725
resetting env. episode 272.000000, reward total was -20.000000. running mean: -20.373948
resetting env. episode 273.000000, reward total was -21.000000. running mean: -20.380208
resetting env. episode 274.000000, reward total was -21.000000. running mean: -20.386406
resetting env. episode 275.000000, reward total was -21.000000. running mean: -20.392542
resetting env. episode 276.000000, reward total was -18.000000. running mean: -20.368617
resetting env. episode 277.000000, reward total was -21.000000. running mean: -20.374931
resetting env. episode 278.000000, reward total was -19.000000. running mean: -20.361181
resetting env. episode 279.000000, reward total was -19.000000. running mean: -20.347569
resetting env. episode 280.000000, reward total was -19.000000. running mean: -20.334094
resetting env. episode 281.000000, reward total was -19.000000. running mean: -20.320753
resetting env. episode 282.000000, reward total was -18.000000. running mean: -20.297545
resetting env. episode 283.000000, reward total was -18.000000. running mean: -20.274570
resetting env. episode 284.000000, reward total was -21.000000. running mean: -20.281824
resetting env. episode 285.000000, reward total was -21.000000. running mean: -20.289006
resetting env. episode 286.000000, reward total was -20.000000. running mean: -20.286116
resetting env. episode 287.000000, reward total was -21.000000. running mean: -20.293255
resetting env. episode 288.000000, reward total was -21.000000. running mean: -20.300322
resetting env. episode 289.000000, reward total was -19.000000. running mean: -20.287319
resetting env. episode 290.000000, reward total was -21.000000. running mean: -20.294446
resetting env. episode 291.000000, reward total was -21.000000. running mean: -20.301501
resetting env. episode 292.000000, reward total was -17.000000. running mean: -20.268486
resetting env. episode 293.000000, reward total was -21.000000. running mean: -20.275801
resetting env. episode 294.000000, reward total was -20.000000. running mean: -20.273043
resetting env. episode 295.000000, reward total was -20.000000. running mean: -20.270313
resetting env. episode 296.000000, reward total was -21.000000. running mean: -20.277610
resetting env. episode 297.000000, reward total was -19.000000. running mean: -20.264834
resetting env. episode 298.000000, reward total was -18.000000. running mean: -20.242185
resetting env. episode 299.000000, reward total was -21.000000. running mean: -20.249764
resetting env. episode 300.000000, reward total was -20.000000. running mean: -20.247266
resetting env. episode 301.000000, reward total was -21.000000. running mean: -20.254793
resetting env. episode 302.000000, reward total was -20.000000. running mean: -20.252245
resetting env. episode 303.000000, reward total was -20.000000. running mean: -20.249723
resetting env. episode 304.000000, reward total was -19.000000. running mean: -20.237226
resetting env. episode 305.000000, reward total was -21.000000. running mean: -20.244853
resetting env. episode 306.000000, reward total was -21.000000. running mean: -20.252405
resetting env. episode 307.000000, reward total was -21.000000. running mean: -20.259881
resetting env. episode 308.000000, reward total was -21.000000. running mean: -20.267282
resetting env. episode 309.000000, reward total was -17.000000. running mean: -20.234609
resetting env. episode 310.000000, reward total was -20.000000. running mean: -20.232263
resetting env. episode 311.000000, reward total was -21.000000. running mean: -20.239940
resetting env. episode 312.000000, reward total was -21.000000. running mean: -20.247541
resetting env. episode 313.000000, reward total was -21.000000. running mean: -20.255066
resetting env. episode 314.000000, reward total was -19.000000. running mean: -20.242515
resetting env. episode 315.000000, reward total was -19.000000. running mean: -20.230090
resetting env. episode 316.000000, reward total was -19.000000. running mean: -20.217789
resetting env. episode 317.000000, reward total was -20.000000. running mean: -20.215611
resetting env. episode 318.000000, reward total was -21.000000. running mean: -20.223455
resetting env. episode 319.000000, reward total was -20.000000. running mean: -20.221220
resetting env. episode 320.000000, reward total was -19.000000. running mean: -20.209008
resetting env. episode 321.000000, reward total was -20.000000. running mean: -20.206918
resetting env. episode 322.000000, reward total was -21.000000. running mean: -20.214849
resetting env. episode 323.000000, reward total was -20.000000. running mean: -20.212700
resetting env. episode 324.000000, reward total was -21.000000. running mean: -20.220573
resetting env. episode 325.000000, reward total was -17.000000. running mean: -20.188368
resetting env. episode 326.000000, reward total was -20.000000. running mean: -20.186484
resetting env. episode 327.000000, reward total was -20.000000. running mean: -20.184619
resetting env. episode 328.000000, reward total was -20.000000. running mean: -20.182773
resetting env. episode 329.000000, reward total was -20.000000. running mean: -20.180945
resetting env. episode 330.000000, reward total was -21.000000. running mean: -20.189136
resetting env. episode 331.000000, reward total was -18.000000. running mean: -20.167244
resetting env. episode 332.000000, reward total was -21.000000. running mean: -20.175572
resetting env. episode 333.000000, reward total was -21.000000. running mean: -20.183816
resetting env. episode 334.000000, reward total was -19.000000. running mean: -20.171978
resetting env. episode 335.000000, reward total was -21.000000. running mean: -20.180258
resetting env. episode 336.000000, reward total was -20.000000. running mean: -20.178456
resetting env. episode 337.000000, reward total was -20.000000. running mean: -20.176671
resetting env. episode 338.000000, reward total was -21.000000. running mean: -20.184904
resetting env. episode 339.000000, reward total was -19.000000. running mean: -20.173055
resetting env. episode 340.000000, reward total was -20.000000. running mean: -20.171325
resetting env. episode 341.000000, reward total was -21.000000. running mean: -20.179612
resetting env. episode 342.000000, reward total was -19.000000. running mean: -20.167816
resetting env. episode 343.000000, reward total was -21.000000. running mean: -20.176137
resetting env. episode 344.000000, reward total was -17.000000. running mean: -20.144376
resetting env. episode 345.000000, reward total was -21.000000. running mean: -20.152932
resetting env. episode 346.000000, reward total was -21.000000. running mean: -20.161403
resetting env. episode 347.000000, reward total was -21.000000. running mean: -20.169789
resetting env. episode 348.000000, reward total was -19.000000. running mean: -20.158091
resetting env. episode 349.000000, reward total was -18.000000. running mean: -20.136510
resetting env. episode 350.000000, reward total was -20.000000. running mean: -20.135145
resetting env. episode 351.000000, reward total was -20.000000. running mean: -20.133794
resetting env. episode 352.000000, reward total was -21.000000. running mean: -20.142456
resetting env. episode 353.000000, reward total was -19.000000. running mean: -20.131031
resetting env. episode 354.000000, reward total was -19.000000. running mean: -20.119721
resetting env. episode 355.000000, reward total was -18.000000. running mean: -20.098524
resetting env. episode 356.000000, reward total was -21.000000. running mean: -20.107538
resetting env. episode 357.000000, reward total was -19.000000. running mean: -20.096463
resetting env. episode 358.000000, reward total was -20.000000. running mean: -20.095498
resetting env. episode 359.000000, reward total was -21.000000. running mean: -20.104543
resetting env. episode 360.000000, reward total was -19.000000. running mean: -20.093498
resetting env. episode 361.000000, reward total was -21.000000. running mean: -20.102563
resetting env. episode 362.000000, reward total was -21.000000. running mean: -20.111537
resetting env. episode 363.000000, reward total was -19.000000. running mean: -20.100422
resetting env. episode 364.000000, reward total was -21.000000. running mean: -20.109418
resetting env. episode 365.000000, reward total was -21.000000. running mean: -20.118323
resetting env. episode 366.000000, reward total was -20.000000. running mean: -20.117140
resetting env. episode 367.000000, reward total was -20.000000. running mean: -20.115969
resetting env. episode 368.000000, reward total was -20.000000. running mean: -20.114809
resetting env. episode 369.000000, reward total was -20.000000. running mean: -20.113661
resetting env. episode 370.000000, reward total was -21.000000. running mean: -20.122524
resetting env. episode 371.000000, reward total was -20.000000. running mean: -20.121299
resetting env. episode 372.000000, reward total was -21.000000. running mean: -20.130086
resetting env. episode 373.000000, reward total was -20.000000. running mean: -20.128785
resetting env. episode 374.000000, reward total was -20.000000. running mean: -20.127497
resetting env. episode 375.000000, reward total was -19.000000. running mean: -20.116223
resetting env. episode 376.000000, reward total was -21.000000. running mean: -20.125060
resetting env. episode 377.000000, reward total was -20.000000. running mean: -20.123810
resetting env. episode 378.000000, reward total was -20.000000. running mean: -20.122572
resetting env. episode 379.000000, reward total was -18.000000. running mean: -20.101346
resetting env. episode 380.000000, reward total was -19.000000. running mean: -20.090332
resetting env. episode 381.000000, reward total was -19.000000. running mean: -20.079429
resetting env. episode 382.000000, reward total was -19.000000. running mean: -20.068635
resetting env. episode 383.000000, reward total was -19.000000. running mean: -20.057948
resetting env. episode 384.000000, reward total was -20.000000. running mean: -20.057369
resetting env. episode 385.000000, reward total was -20.000000. running mean: -20.056795
resetting env. episode 386.000000, reward total was -21.000000. running mean: -20.066227
resetting env. episode 387.000000, reward total was -19.000000. running mean: -20.055565
resetting env. episode 388.000000, reward total was -21.000000. running mean: -20.065009
resetting env. episode 389.000000, reward total was -19.000000. running mean: -20.054359
resetting env. episode 390.000000, reward total was -21.000000. running mean: -20.063816
resetting env. episode 391.000000, reward total was -21.000000. running mean: -20.073178
resetting env. episode 392.000000, reward total was -20.000000. running mean: -20.072446
resetting env. episode 393.000000, reward total was -19.000000. running mean: -20.061721
resetting env. episode 394.000000, reward total was -20.000000. running mean: -20.061104
resetting env. episode 395.000000, reward total was -20.000000. running mean: -20.060493
resetting env. episode 396.000000, reward total was -20.000000. running mean: -20.059888
resetting env. episode 397.000000, reward total was -20.000000. running mean: -20.059289
resetting env. episode 398.000000, reward total was -20.000000. running mean: -20.058696
resetting env. episode 399.000000, reward total was -20.000000. running mean: -20.058109
resetting env. episode 400.000000, reward total was -21.000000. running mean: -20.067528
resetting env. episode 401.000000, reward total was -18.000000. running mean: -20.046853
resetting env. episode 402.000000, reward total was -20.000000. running mean: -20.046384
resetting env. episode 403.000000, reward total was -21.000000. running mean: -20.055921
resetting env. episode 404.000000, reward total was -19.000000. running mean: -20.045361
resetting env. episode 405.000000, reward total was -21.000000. running mean: -20.054908
resetting env. episode 406.000000, reward total was -21.000000. running mean: -20.064359
resetting env. episode 407.000000, reward total was -21.000000. running mean: -20.073715
resetting env. episode 408.000000, reward total was -19.000000. running mean: -20.062978
resetting env. episode 409.000000, reward total was -21.000000. running mean: -20.072348
resetting env. episode 410.000000, reward total was -20.000000. running mean: -20.071625
resetting env. episode 411.000000, reward total was -19.000000. running mean: -20.060909
resetting env. episode 412.000000, reward total was -19.000000. running mean: -20.050299
resetting env. episode 413.000000, reward total was -21.000000. running mean: -20.059796
resetting env. episode 414.000000, reward total was -18.000000. running mean: -20.039198
resetting env. episode 415.000000, reward total was -19.000000. running mean: -20.028806
resetting env. episode 416.000000, reward total was -19.000000. running mean: -20.018518
resetting env. episode 417.000000, reward total was -21.000000. running mean: -20.028333
resetting env. episode 418.000000, reward total was -21.000000. running mean: -20.038050
resetting env. episode 419.000000, reward total was -18.000000. running mean: -20.017669
resetting env. episode 420.000000, reward total was -20.000000. running mean: -20.017493
resetting env. episode 421.000000, reward total was -20.000000. running mean: -20.017318
resetting env. episode 422.000000, reward total was -21.000000. running mean: -20.027145
resetting env. episode 423.000000, reward total was -20.000000. running mean: -20.026873
resetting env. episode 424.000000, reward total was -21.000000. running mean: -20.036604
resetting env. episode 425.000000, reward total was -21.000000. running mean: -20.046238
resetting env. episode 426.000000, reward total was -21.000000. running mean: -20.055776
resetting env. episode 427.000000, reward total was -21.000000. running mean: -20.065218
resetting env. episode 428.000000, reward total was -19.000000. running mean: -20.054566
resetting env. episode 429.000000, reward total was -19.000000. running mean: -20.044020
resetting env. episode 430.000000, reward total was -21.000000. running mean: -20.053580
resetting env. episode 431.000000, reward total was -19.000000. running mean: -20.043044
resetting env. episode 432.000000, reward total was -21.000000. running mean: -20.052614
resetting env. episode 433.000000, reward total was -20.000000. running mean: -20.052088
resetting env. episode 434.000000, reward total was -21.000000. running mean: -20.061567
resetting env. episode 435.000000, reward total was -20.000000. running mean: -20.060951
resetting env. episode 436.000000, reward total was -19.000000. running mean: -20.050342
resetting env. episode 437.000000, reward total was -20.000000. running mean: -20.049838
resetting env. episode 438.000000, reward total was -21.000000. running mean: -20.059340
resetting env. episode 439.000000, reward total was -20.000000. running mean: -20.058747
resetting env. episode 440.000000, reward total was -21.000000. running mean: -20.068159
resetting env. episode 441.000000, reward total was -21.000000. running mean: -20.077477
resetting env. episode 442.000000, reward total was -20.000000. running mean: -20.076703
resetting env. episode 443.000000, reward total was -19.000000. running mean: -20.065936
resetting env. episode 444.000000, reward total was -19.000000. running mean: -20.055276
resetting env. episode 445.000000, reward total was -21.000000. running mean: -20.064724
resetting env. episode 446.000000, reward total was -21.000000. running mean: -20.074076
resetting env. episode 447.000000, reward total was -21.000000. running mean: -20.083336
resetting env. episode 448.000000, reward total was -19.000000. running mean: -20.072502
resetting env. episode 449.000000, reward total was -19.000000. running mean: -20.061777
resetting env. episode 450.000000, reward total was -21.000000. running mean: -20.071159
resetting env. episode 451.000000, reward total was -21.000000. running mean: -20.080448
resetting env. episode 452.000000, reward total was -21.000000. running mean: -20.089643
resetting env. episode 453.000000, reward total was -19.000000. running mean: -20.078747
resetting env. episode 454.000000, reward total was -20.000000. running mean: -20.077959
resetting env. episode 455.000000, reward total was -20.000000. running mean: -20.077180
resetting env. episode 456.000000, reward total was -20.000000. running mean: -20.076408
resetting env. episode 457.000000, reward total was -18.000000. running mean: -20.055644
resetting env. episode 458.000000, reward total was -21.000000. running mean: -20.065088
resetting env. episode 459.000000, reward total was -21.000000. running mean: -20.074437
resetting env. episode 460.000000, reward total was -20.000000. running mean: -20.073692
resetting env. episode 461.000000, reward total was -20.000000. running mean: -20.072955
resetting env. episode 462.000000, reward total was -20.000000. running mean: -20.072226
resetting env. episode 463.000000, reward total was -20.000000. running mean: -20.071504
resetting env. episode 464.000000, reward total was -21.000000. running mean: -20.080789
resetting env. episode 465.000000, reward total was -21.000000. running mean: -20.089981
resetting env. episode 466.000000, reward total was -20.000000. running mean: -20.089081
resetting env. episode 467.000000, reward total was -20.000000. running mean: -20.088190
resetting env. episode 468.000000, reward total was -19.000000. running mean: -20.077308
resetting env. episode 469.000000, reward total was -20.000000. running mean: -20.076535
resetting env. episode 470.000000, reward total was -19.000000. running mean: -20.065770
resetting env. episode 471.000000, reward total was -21.000000. running mean: -20.075112
resetting env. episode 472.000000, reward total was -21.000000. running mean: -20.084361
resetting env. episode 473.000000, reward total was -21.000000. running mean: -20.093517
resetting env. episode 474.000000, reward total was -21.000000. running mean: -20.102582
resetting env. episode 475.000000, reward total was -20.000000. running mean: -20.101556
resetting env. episode 476.000000, reward total was -20.000000. running mean: -20.100541
resetting env. episode 477.000000, reward total was -18.000000. running mean: -20.079535
resetting env. episode 478.000000, reward total was -20.000000. running mean: -20.078740
resetting env. episode 479.000000, reward total was -20.000000. running mean: -20.077953
resetting env. episode 480.000000, reward total was -19.000000. running mean: -20.067173
resetting env. episode 481.000000, reward total was -18.000000. running mean: -20.046501
resetting env. episode 482.000000, reward total was -19.000000. running mean: -20.036036
resetting env. episode 483.000000, reward total was -21.000000. running mean: -20.045676
resetting env. episode 484.000000, reward total was -20.000000. running mean: -20.045219
resetting env. episode 485.000000, reward total was -20.000000. running mean: -20.044767
resetting env. episode 486.000000, reward total was -21.000000. running mean: -20.054319
resetting env. episode 487.000000, reward total was -17.000000. running mean: -20.023776
resetting env. episode 488.000000, reward total was -21.000000. running mean: -20.033538
resetting env. episode 489.000000, reward total was -19.000000. running mean: -20.023203
resetting env. episode 490.000000, reward total was -21.000000. running mean: -20.032971
resetting env. episode 491.000000, reward total was -17.000000. running mean: -20.002641
resetting env. episode 492.000000, reward total was -19.000000. running mean: -19.992615
resetting env. episode 493.000000, reward total was -20.000000. running mean: -19.992689
resetting env. episode 494.000000, reward total was -19.000000. running mean: -19.982762
resetting env. episode 495.000000, reward total was -19.000000. running mean: -19.972934
resetting env. episode 496.000000, reward total was -18.000000. running mean: -19.953205
resetting env. episode 497.000000, reward total was -17.000000. running mean: -19.923673
resetting env. episode 498.000000, reward total was -21.000000. running mean: -19.934436
resetting env. episode 499.000000, reward total was -19.000000. running mean: -19.925092
resetting env. episode 500.000000, reward total was -19.000000. running mean: -19.915841
CPU times: user 32min 49s, sys: 15min 24s, total: 48min 13s
Wall time: 34min 24s

In [14]:
# Evaluate the current model by playing one episode in the Pong env.
# NOTE(review): play_game is defined earlier in the notebook (not visible here);
# presumably it renders/reports a single episode — confirm against its definition.
play_game(env, model)


Episode finished without success, accumulated reward = -8.0


Once Loop Reflect

In [0]:
# Continue training for another 500 episodes, keeping the history in hist2.
# %time reports the cell's CPU and wall-clock cost (the previous 500-episode
# run above took ~34 min wall time).
# NOTE(review): train_model is defined earlier in the notebook (not visible
# here); assumes it resumes from the model's current weights — confirm.
%time hist2 = train_model(env, model, total_episodes=500)


resetting env. episode 1.000000, reward total was -19.000000. running mean: -19.000000
resetting env. episode 2.000000, reward total was -19.000000. running mean: -19.000000
resetting env. episode 3.000000, reward total was -17.000000. running mean: -18.980000
resetting env. episode 4.000000, reward total was -20.000000. running mean: -18.990200
resetting env. episode 5.000000, reward total was -18.000000. running mean: -18.980298
resetting env. episode 6.000000, reward total was -21.000000. running mean: -19.000495
resetting env. episode 7.000000, reward total was -20.000000. running mean: -19.010490
resetting env. episode 8.000000, reward total was -21.000000. running mean: -19.030385
resetting env. episode 9.000000, reward total was -21.000000. running mean: -19.050081
resetting env. episode 10.000000, reward total was -18.000000. running mean: -19.039581
resetting env. episode 11.000000, reward total was -18.000000. running mean: -19.029185
resetting env. episode 12.000000, reward total was -19.000000. running mean: -19.028893
resetting env. episode 13.000000, reward total was -18.000000. running mean: -19.018604
resetting env. episode 14.000000, reward total was -17.000000. running mean: -18.998418
resetting env. episode 15.000000, reward total was -20.000000. running mean: -19.008434
resetting env. episode 16.000000, reward total was -19.000000. running mean: -19.008349
resetting env. episode 17.000000, reward total was -19.000000. running mean: -19.008266
resetting env. episode 18.000000, reward total was -19.000000. running mean: -19.008183
resetting env. episode 19.000000, reward total was -19.000000. running mean: -19.008101
resetting env. episode 20.000000, reward total was -15.000000. running mean: -18.968020
resetting env. episode 21.000000, reward total was -19.000000. running mean: -18.968340
resetting env. episode 22.000000, reward total was -21.000000. running mean: -18.988657
resetting env. episode 23.000000, reward total was -21.000000. running mean: -19.008770
resetting env. episode 24.000000, reward total was -18.000000. running mean: -18.998682
resetting env. episode 25.000000, reward total was -19.000000. running mean: -18.998696
resetting env. episode 26.000000, reward total was -20.000000. running mean: -19.008709
resetting env. episode 27.000000, reward total was -20.000000. running mean: -19.018622
resetting env. episode 28.000000, reward total was -20.000000. running mean: -19.028435
resetting env. episode 29.000000, reward total was -19.000000. running mean: -19.028151
resetting env. episode 30.000000, reward total was -21.000000. running mean: -19.047870
resetting env. episode 31.000000, reward total was -15.000000. running mean: -19.007391
resetting env. episode 32.000000, reward total was -20.000000. running mean: -19.017317
resetting env. episode 33.000000, reward total was -19.000000. running mean: -19.017144
resetting env. episode 34.000000, reward total was -18.000000. running mean: -19.006972
resetting env. episode 35.000000, reward total was -19.000000. running mean: -19.006903
resetting env. episode 36.000000, reward total was -21.000000. running mean: -19.026834
resetting env. episode 37.000000, reward total was -17.000000. running mean: -19.006565
resetting env. episode 38.000000, reward total was -18.000000. running mean: -18.996500
resetting env. episode 39.000000, reward total was -19.000000. running mean: -18.996535
resetting env. episode 40.000000, reward total was -19.000000. running mean: -18.996569
resetting env. episode 41.000000, reward total was -19.000000. running mean: -18.996604
resetting env. episode 42.000000, reward total was -17.000000. running mean: -18.976638
resetting env. episode 43.000000, reward total was -19.000000. running mean: -18.976871
resetting env. episode 44.000000, reward total was -18.000000. running mean: -18.967102
resetting env. episode 45.000000, reward total was -19.000000. running mean: -18.967431
resetting env. episode 46.000000, reward total was -17.000000. running mean: -18.947757
resetting env. episode 47.000000, reward total was -21.000000. running mean: -18.968280
resetting env. episode 48.000000, reward total was -19.000000. running mean: -18.968597
resetting env. episode 49.000000, reward total was -19.000000. running mean: -18.968911
resetting env. episode 50.000000, reward total was -18.000000. running mean: -18.959222
resetting env. episode 51.000000, reward total was -19.000000. running mean: -18.959629
resetting env. episode 52.000000, reward total was -20.000000. running mean: -18.970033
resetting env. episode 53.000000, reward total was -19.000000. running mean: -18.970333
resetting env. episode 54.000000, reward total was -20.000000. running mean: -18.980629
resetting env. episode 55.000000, reward total was -17.000000. running mean: -18.960823
resetting env. episode 56.000000, reward total was -19.000000. running mean: -18.961215
resetting env. episode 57.000000, reward total was -21.000000. running mean: -18.981603
resetting env. episode 58.000000, reward total was -19.000000. running mean: -18.981787
resetting env. episode 59.000000, reward total was -21.000000. running mean: -19.001969
resetting env. episode 60.000000, reward total was -19.000000. running mean: -19.001949
resetting env. episode 61.000000, reward total was -19.000000. running mean: -19.001930
resetting env. episode 62.000000, reward total was -18.000000. running mean: -18.991910
resetting env. episode 63.000000, reward total was -18.000000. running mean: -18.981991
resetting env. episode 64.000000, reward total was -20.000000. running mean: -18.992171
resetting env. episode 65.000000, reward total was -21.000000. running mean: -19.012250
resetting env. episode 66.000000, reward total was -20.000000. running mean: -19.022127
resetting env. episode 67.000000, reward total was -19.000000. running mean: -19.021906
resetting env. episode 68.000000, reward total was -20.000000. running mean: -19.031687
resetting env. episode 69.000000, reward total was -17.000000. running mean: -19.011370
resetting env. episode 70.000000, reward total was -20.000000. running mean: -19.021256
resetting env. episode 71.000000, reward total was -18.000000. running mean: -19.011044
resetting env. episode 72.000000, reward total was -19.000000. running mean: -19.010933
resetting env. episode 73.000000, reward total was -19.000000. running mean: -19.010824
resetting env. episode 74.000000, reward total was -19.000000. running mean: -19.010716
resetting env. episode 75.000000, reward total was -17.000000. running mean: -18.990609
resetting env. episode 76.000000, reward total was -21.000000. running mean: -19.010702
resetting env. episode 77.000000, reward total was -18.000000. running mean: -19.000595
resetting env. episode 78.000000, reward total was -18.000000. running mean: -18.990590
resetting env. episode 79.000000, reward total was -18.000000. running mean: -18.980684
resetting env. episode 80.000000, reward total was -18.000000. running mean: -18.970877
resetting env. episode 81.000000, reward total was -19.000000. running mean: -18.971168
resetting env. episode 82.000000, reward total was -17.000000. running mean: -18.951456
resetting env. episode 83.000000, reward total was -18.000000. running mean: -18.941942
resetting env. episode 84.000000, reward total was -18.000000. running mean: -18.932522
resetting env. episode 85.000000, reward total was -18.000000. running mean: -18.923197
resetting env. episode 86.000000, reward total was -18.000000. running mean: -18.913965
resetting env. episode 87.000000, reward total was -21.000000. running mean: -18.934826
resetting env. episode 88.000000, reward total was -21.000000. running mean: -18.955477
resetting env. episode 89.000000, reward total was -17.000000. running mean: -18.935922
resetting env. episode 90.000000, reward total was -19.000000. running mean: -18.936563
resetting env. episode 91.000000, reward total was -20.000000. running mean: -18.947198
resetting env. episode 92.000000, reward total was -19.000000. running mean: -18.947726
resetting env. episode 93.000000, reward total was -15.000000. running mean: -18.908248
resetting env. episode 94.000000, reward total was -18.000000. running mean: -18.899166
resetting env. episode 95.000000, reward total was -20.000000. running mean: -18.910174
resetting env. episode 96.000000, reward total was -17.000000. running mean: -18.891073
resetting env. episode 97.000000, reward total was -16.000000. running mean: -18.862162
resetting env. episode 98.000000, reward total was -17.000000. running mean: -18.843540
resetting env. episode 99.000000, reward total was -21.000000. running mean: -18.865105
resetting env. episode 100.000000, reward total was -18.000000. running mean: -18.856454
resetting env. episode 101.000000, reward total was -19.000000. running mean: -18.857889
resetting env. episode 102.000000, reward total was -21.000000. running mean: -18.879310
resetting env. episode 103.000000, reward total was -21.000000. running mean: -18.900517
resetting env. episode 104.000000, reward total was -20.000000. running mean: -18.911512
resetting env. episode 105.000000, reward total was -20.000000. running mean: -18.922397
resetting env. episode 106.000000, reward total was -19.000000. running mean: -18.923173
resetting env. episode 107.000000, reward total was -15.000000. running mean: -18.883941
resetting env. episode 108.000000, reward total was -20.000000. running mean: -18.895102
resetting env. episode 109.000000, reward total was -18.000000. running mean: -18.886151
resetting env. episode 110.000000, reward total was -20.000000. running mean: -18.897289
resetting env. episode 111.000000, reward total was -16.000000. running mean: -18.868316
resetting env. episode 112.000000, reward total was -21.000000. running mean: -18.889633
resetting env. episode 113.000000, reward total was -17.000000. running mean: -18.870737
resetting env. episode 114.000000, reward total was -12.000000. running mean: -18.802029
resetting env. episode 115.000000, reward total was -17.000000. running mean: -18.784009
resetting env. episode 116.000000, reward total was -21.000000. running mean: -18.806169
resetting env. episode 117.000000, reward total was -18.000000. running mean: -18.798107
resetting env. episode 118.000000, reward total was -17.000000. running mean: -18.780126
resetting env. episode 119.000000, reward total was -20.000000. running mean: -18.792325
resetting env. episode 120.000000, reward total was -19.000000. running mean: -18.794402
resetting env. episode 121.000000, reward total was -17.000000. running mean: -18.776458
resetting env. episode 122.000000, reward total was -17.000000. running mean: -18.758693
resetting env. episode 123.000000, reward total was -15.000000. running mean: -18.721106
resetting env. episode 124.000000, reward total was -14.000000. running mean: -18.673895
resetting env. episode 125.000000, reward total was -19.000000. running mean: -18.677156
resetting env. episode 126.000000, reward total was -17.000000. running mean: -18.660385
resetting env. episode 127.000000, reward total was -19.000000. running mean: -18.663781
resetting env. episode 128.000000, reward total was -20.000000. running mean: -18.677143
resetting env. episode 129.000000, reward total was -15.000000. running mean: -18.640372
resetting env. episode 130.000000, reward total was -19.000000. running mean: -18.643968
resetting env. episode 131.000000, reward total was -17.000000. running mean: -18.627528
resetting env. episode 132.000000, reward total was -19.000000. running mean: -18.631253
resetting env. episode 133.000000, reward total was -19.000000. running mean: -18.634940
resetting env. episode 134.000000, reward total was -17.000000. running mean: -18.618591
resetting env. episode 135.000000, reward total was -20.000000. running mean: -18.632405
resetting env. episode 136.000000, reward total was -20.000000. running mean: -18.646081
resetting env. episode 137.000000, reward total was -17.000000. running mean: -18.629620
resetting env. episode 138.000000, reward total was -19.000000. running mean: -18.633324
resetting env. episode 139.000000, reward total was -18.000000. running mean: -18.626991
resetting env. episode 140.000000, reward total was -19.000000. running mean: -18.630721
resetting env. episode 141.000000, reward total was -20.000000. running mean: -18.644414
resetting env. episode 142.000000, reward total was -17.000000. running mean: -18.627970
resetting env. episode 143.000000, reward total was -21.000000. running mean: -18.651690
resetting env. episode 144.000000, reward total was -19.000000. running mean: -18.655173
resetting env. episode 145.000000, reward total was -17.000000. running mean: -18.638621
resetting env. episode 146.000000, reward total was -16.000000. running mean: -18.612235
resetting env. episode 147.000000, reward total was -17.000000. running mean: -18.596113
resetting env. episode 148.000000, reward total was -17.000000. running mean: -18.580152
resetting env. episode 149.000000, reward total was -19.000000. running mean: -18.584350
resetting env. episode 150.000000, reward total was -17.000000. running mean: -18.568507
resetting env. episode 151.000000, reward total was -19.000000. running mean: -18.572821
resetting env. episode 152.000000, reward total was -19.000000. running mean: -18.577093
resetting env. episode 153.000000, reward total was -21.000000. running mean: -18.601322
resetting env. episode 154.000000, reward total was -15.000000. running mean: -18.565309
resetting env. episode 155.000000, reward total was -19.000000. running mean: -18.569656
resetting env. episode 156.000000, reward total was -17.000000. running mean: -18.553959
resetting env. episode 157.000000, reward total was -19.000000. running mean: -18.558420
resetting env. episode 158.000000, reward total was -19.000000. running mean: -18.562836
resetting env. episode 159.000000, reward total was -16.000000. running mean: -18.537207
resetting env. episode 160.000000, reward total was -19.000000. running mean: -18.541835
resetting env. episode 161.000000, reward total was -19.000000. running mean: -18.546417
resetting env. episode 162.000000, reward total was -18.000000. running mean: -18.540953
resetting env. episode 163.000000, reward total was -20.000000. running mean: -18.555543
resetting env. episode 164.000000, reward total was -17.000000. running mean: -18.539988
resetting env. episode 165.000000, reward total was -17.000000. running mean: -18.524588
resetting env. episode 166.000000, reward total was -19.000000. running mean: -18.529342
resetting env. episode 167.000000, reward total was -19.000000. running mean: -18.534049
resetting env. episode 168.000000, reward total was -17.000000. running mean: -18.518708
resetting env. episode 169.000000, reward total was -18.000000. running mean: -18.513521
resetting env. episode 170.000000, reward total was -16.000000. running mean: -18.488386
resetting env. episode 171.000000, reward total was -21.000000. running mean: -18.513502
resetting env. episode 172.000000, reward total was -19.000000. running mean: -18.518367
resetting env. episode 173.000000, reward total was -16.000000. running mean: -18.493183
resetting env. episode 174.000000, reward total was -17.000000. running mean: -18.478251
resetting env. episode 175.000000, reward total was -20.000000. running mean: -18.493469
resetting env. episode 176.000000, reward total was -18.000000. running mean: -18.488534
resetting env. episode 177.000000, reward total was -16.000000. running mean: -18.463649
resetting env. episode 178.000000, reward total was -21.000000. running mean: -18.489012
resetting env. episode 179.000000, reward total was -17.000000. running mean: -18.474122
resetting env. episode 180.000000, reward total was -20.000000. running mean: -18.489381
resetting env. episode 181.000000, reward total was -17.000000. running mean: -18.474487
resetting env. episode 182.000000, reward total was -18.000000. running mean: -18.469742
resetting env. episode 183.000000, reward total was -18.000000. running mean: -18.465045
resetting env. episode 184.000000, reward total was -12.000000. running mean: -18.400394
resetting env. episode 185.000000, reward total was -19.000000. running mean: -18.406391
resetting env. episode 186.000000, reward total was -18.000000. running mean: -18.402327
resetting env. episode 187.000000, reward total was -20.000000. running mean: -18.418303
resetting env. episode 188.000000, reward total was -17.000000. running mean: -18.404120
resetting env. episode 189.000000, reward total was -19.000000. running mean: -18.410079
resetting env. episode 190.000000, reward total was -14.000000. running mean: -18.365978
resetting env. episode 191.000000, reward total was -16.000000. running mean: -18.342319
resetting env. episode 192.000000, reward total was -17.000000. running mean: -18.328895
resetting env. episode 193.000000, reward total was -17.000000. running mean: -18.315606
resetting env. episode 194.000000, reward total was -19.000000. running mean: -18.322450
resetting env. episode 195.000000, reward total was -17.000000. running mean: -18.309226
resetting env. episode 196.000000, reward total was -20.000000. running mean: -18.326134
resetting env. episode 197.000000, reward total was -20.000000. running mean: -18.342872
resetting env. episode 198.000000, reward total was -19.000000. running mean: -18.349444
resetting env. episode 199.000000, reward total was -17.000000. running mean: -18.335949
resetting env. episode 200.000000, reward total was -17.000000. running mean: -18.322590
resetting env. episode 201.000000, reward total was -18.000000. running mean: -18.319364
resetting env. episode 202.000000, reward total was -17.000000. running mean: -18.306170
resetting env. episode 203.000000, reward total was -21.000000. running mean: -18.333108
resetting env. episode 204.000000, reward total was -21.000000. running mean: -18.359777
resetting env. episode 205.000000, reward total was -17.000000. running mean: -18.346180
resetting env. episode 206.000000, reward total was -20.000000. running mean: -18.362718
resetting env. episode 207.000000, reward total was -21.000000. running mean: -18.389091
resetting env. episode 208.000000, reward total was -17.000000. running mean: -18.375200
resetting env. episode 209.000000, reward total was -15.000000. running mean: -18.341448
resetting env. episode 210.000000, reward total was -17.000000. running mean: -18.328033
resetting env. episode 211.000000, reward total was -17.000000. running mean: -18.314753
resetting env. episode 212.000000, reward total was -19.000000. running mean: -18.321605
resetting env. episode 213.000000, reward total was -16.000000. running mean: -18.298389
resetting env. episode 214.000000, reward total was -15.000000. running mean: -18.265405
resetting env. episode 215.000000, reward total was -16.000000. running mean: -18.242751
resetting env. episode 216.000000, reward total was -19.000000. running mean: -18.250324
resetting env. episode 217.000000, reward total was -18.000000. running mean: -18.247821
resetting env. episode 218.000000, reward total was -15.000000. running mean: -18.215342
resetting env. episode 219.000000, reward total was -17.000000. running mean: -18.203189
resetting env. episode 220.000000, reward total was -18.000000. running mean: -18.201157
resetting env. episode 221.000000, reward total was -18.000000. running mean: -18.199145
resetting env. episode 222.000000, reward total was -17.000000. running mean: -18.187154
resetting env. episode 223.000000, reward total was -18.000000. running mean: -18.185282
resetting env. episode 224.000000, reward total was -19.000000. running mean: -18.193430
resetting env. episode 225.000000, reward total was -19.000000. running mean: -18.201495
resetting env. episode 226.000000, reward total was -19.000000. running mean: -18.209480
resetting env. episode 227.000000, reward total was -15.000000. running mean: -18.177386
resetting env. episode 228.000000, reward total was -20.000000. running mean: -18.195612
resetting env. episode 229.000000, reward total was -18.000000. running mean: -18.193656
resetting env. episode 230.000000, reward total was -18.000000. running mean: -18.191719
resetting env. episode 231.000000, reward total was -20.000000. running mean: -18.209802
resetting env. episode 232.000000, reward total was -17.000000. running mean: -18.197704
resetting env. episode 233.000000, reward total was -19.000000. running mean: -18.205727
resetting env. episode 234.000000, reward total was -19.000000. running mean: -18.213670
resetting env. episode 235.000000, reward total was -19.000000. running mean: -18.221533
resetting env. episode 236.000000, reward total was -19.000000. running mean: -18.229318
resetting env. episode 237.000000, reward total was -13.000000. running mean: -18.177024
resetting env. episode 238.000000, reward total was -21.000000. running mean: -18.205254
resetting env. episode 239.000000, reward total was -19.000000. running mean: -18.213202
resetting env. episode 240.000000, reward total was -18.000000. running mean: -18.211070
resetting env. episode 241.000000, reward total was -16.000000. running mean: -18.188959
resetting env. episode 242.000000, reward total was -21.000000. running mean: -18.217069
resetting env. episode 243.000000, reward total was -17.000000. running mean: -18.204899
resetting env. episode 244.000000, reward total was -18.000000. running mean: -18.202850
resetting env. episode 245.000000, reward total was -15.000000. running mean: -18.170821
resetting env. episode 246.000000, reward total was -21.000000. running mean: -18.199113
resetting env. episode 247.000000, reward total was -15.000000. running mean: -18.167122
resetting env. episode 248.000000, reward total was -19.000000. running mean: -18.175451
resetting env. episode 249.000000, reward total was -17.000000. running mean: -18.163696
resetting env. episode 250.000000, reward total was -16.000000. running mean: -18.142059
resetting env. episode 251.000000, reward total was -19.000000. running mean: -18.150638
resetting env. episode 252.000000, reward total was -19.000000. running mean: -18.159132
resetting env. episode 253.000000, reward total was -21.000000. running mean: -18.187541
resetting env. episode 254.000000, reward total was -17.000000. running mean: -18.175665
resetting env. episode 255.000000, reward total was -20.000000. running mean: -18.193909
resetting env. episode 256.000000, reward total was -20.000000. running mean: -18.211970
resetting env. episode 257.000000, reward total was -19.000000. running mean: -18.219850
resetting env. episode 258.000000, reward total was -21.000000. running mean: -18.247651
resetting env. episode 259.000000, reward total was -17.000000. running mean: -18.235175
resetting env. episode 260.000000, reward total was -16.000000. running mean: -18.212823
resetting env. episode 261.000000, reward total was -18.000000. running mean: -18.210695
resetting env. episode 262.000000, reward total was -18.000000. running mean: -18.208588
resetting env. episode 263.000000, reward total was -11.000000. running mean: -18.136502
resetting env. episode 264.000000, reward total was -17.000000. running mean: -18.125137
resetting env. episode 265.000000, reward total was -16.000000. running mean: -18.103886
resetting env. episode 266.000000, reward total was -21.000000. running mean: -18.132847
resetting env. episode 267.000000, reward total was -21.000000. running mean: -18.161518
resetting env. episode 268.000000, reward total was -18.000000. running mean: -18.159903
resetting env. episode 269.000000, reward total was -16.000000. running mean: -18.138304
resetting env. episode 270.000000, reward total was -16.000000. running mean: -18.116921
resetting env. episode 271.000000, reward total was -19.000000. running mean: -18.125752
resetting env. episode 272.000000, reward total was -19.000000. running mean: -18.134494
resetting env. episode 273.000000, reward total was -17.000000. running mean: -18.123149
resetting env. episode 274.000000, reward total was -20.000000. running mean: -18.141918
resetting env. episode 275.000000, reward total was -17.000000. running mean: -18.130499
resetting env. episode 276.000000, reward total was -18.000000. running mean: -18.129194
resetting env. episode 277.000000, reward total was -19.000000. running mean: -18.137902
resetting env. episode 278.000000, reward total was -17.000000. running mean: -18.126523
resetting env. episode 279.000000, reward total was -19.000000. running mean: -18.135258
resetting env. episode 280.000000, reward total was -19.000000. running mean: -18.143905
resetting env. episode 281.000000, reward total was -20.000000. running mean: -18.162466
resetting env. episode 282.000000, reward total was -17.000000. running mean: -18.150841
resetting env. episode 283.000000, reward total was -18.000000. running mean: -18.149333
resetting env. episode 284.000000, reward total was -18.000000. running mean: -18.147840
resetting env. episode 285.000000, reward total was -19.000000. running mean: -18.156361
resetting env. episode 286.000000, reward total was -19.000000. running mean: -18.164798
resetting env. episode 287.000000, reward total was -17.000000. running mean: -18.153150
resetting env. episode 288.000000, reward total was -21.000000. running mean: -18.181618
resetting env. episode 289.000000, reward total was -17.000000. running mean: -18.169802
resetting env. episode 290.000000, reward total was -18.000000. running mean: -18.168104
resetting env. episode 291.000000, reward total was -16.000000. running mean: -18.146423
resetting env. episode 292.000000, reward total was -17.000000. running mean: -18.134959
resetting env. episode 293.000000, reward total was -20.000000. running mean: -18.153609
resetting env. episode 294.000000, reward total was -19.000000. running mean: -18.162073
resetting env. episode 295.000000, reward total was -15.000000. running mean: -18.130452
resetting env. episode 296.000000, reward total was -21.000000. running mean: -18.159148
resetting env. episode 297.000000, reward total was -18.000000. running mean: -18.157556
resetting env. episode 298.000000, reward total was -18.000000. running mean: -18.155981
resetting env. episode 299.000000, reward total was -19.000000. running mean: -18.164421
resetting env. episode 300.000000, reward total was -17.000000. running mean: -18.152777
resetting env. episode 301.000000, reward total was -18.000000. running mean: -18.151249
resetting env. episode 302.000000, reward total was -14.000000. running mean: -18.109736
resetting env. episode 303.000000, reward total was -17.000000. running mean: -18.098639
resetting env. episode 304.000000, reward total was -19.000000. running mean: -18.107653
resetting env. episode 305.000000, reward total was -19.000000. running mean: -18.116576
resetting env. episode 306.000000, reward total was -17.000000. running mean: -18.105410
resetting env. episode 307.000000, reward total was -18.000000. running mean: -18.104356
resetting env. episode 308.000000, reward total was -12.000000. running mean: -18.043313
resetting env. episode 309.000000, reward total was -14.000000. running mean: -18.002880
resetting env. episode 310.000000, reward total was -16.000000. running mean: -17.982851
resetting env. episode 311.000000, reward total was -19.000000. running mean: -17.993022
resetting env. episode 312.000000, reward total was -19.000000. running mean: -18.003092
resetting env. episode 313.000000, reward total was -11.000000. running mean: -17.933061
resetting env. episode 314.000000, reward total was -15.000000. running mean: -17.903730
resetting env. episode 315.000000, reward total was -18.000000. running mean: -17.904693
resetting env. episode 316.000000, reward total was -21.000000. running mean: -17.935646
resetting env. episode 317.000000, reward total was -18.000000. running mean: -17.936290
resetting env. episode 318.000000, reward total was -19.000000. running mean: -17.946927
resetting env. episode 319.000000, reward total was -17.000000. running mean: -17.937458
resetting env. episode 320.000000, reward total was -19.000000. running mean: -17.948083
resetting env. episode 321.000000, reward total was -18.000000. running mean: -17.948602
resetting env. episode 322.000000, reward total was -19.000000. running mean: -17.959116
resetting env. episode 323.000000, reward total was -17.000000. running mean: -17.949525
resetting env. episode 324.000000, reward total was -18.000000. running mean: -17.950030
resetting env. episode 325.000000, reward total was -17.000000. running mean: -17.940529
resetting env. episode 326.000000, reward total was -19.000000. running mean: -17.951124
resetting env. episode 327.000000, reward total was -17.000000. running mean: -17.941613
resetting env. episode 328.000000, reward total was -19.000000. running mean: -17.952197
resetting env. episode 329.000000, reward total was -20.000000. running mean: -17.972675
resetting env. episode 330.000000, reward total was -16.000000. running mean: -17.952948
resetting env. episode 331.000000, reward total was -17.000000. running mean: -17.943419
resetting env. episode 332.000000, reward total was -17.000000. running mean: -17.933984
resetting env. episode 333.000000, reward total was -18.000000. running mean: -17.934645
resetting env. episode 334.000000, reward total was -21.000000. running mean: -17.965298
resetting env. episode 335.000000, reward total was -17.000000. running mean: -17.955645
resetting env. episode 336.000000, reward total was -17.000000. running mean: -17.946089
resetting env. episode 337.000000, reward total was -19.000000. running mean: -17.956628
resetting env. episode 338.000000, reward total was -16.000000. running mean: -17.937062
resetting env. episode 339.000000, reward total was -15.000000. running mean: -17.907691
resetting env. episode 340.000000, reward total was -13.000000. running mean: -17.858614
resetting env. episode 341.000000, reward total was -19.000000. running mean: -17.870028
resetting env. episode 342.000000, reward total was -17.000000. running mean: -17.861328
resetting env. episode 343.000000, reward total was -18.000000. running mean: -17.862714
resetting env. episode 344.000000, reward total was -21.000000. running mean: -17.894087
resetting env. episode 345.000000, reward total was -14.000000. running mean: -17.855146
resetting env. episode 346.000000, reward total was -17.000000. running mean: -17.846595
resetting env. episode 347.000000, reward total was -17.000000. running mean: -17.838129
resetting env. episode 348.000000, reward total was -18.000000. running mean: -17.839748
resetting env. episode 349.000000, reward total was -16.000000. running mean: -17.821350
resetting env. episode 350.000000, reward total was -19.000000. running mean: -17.833137
resetting env. episode 351.000000, reward total was -17.000000. running mean: -17.824805
resetting env. episode 352.000000, reward total was -19.000000. running mean: -17.836557
resetting env. episode 353.000000, reward total was -16.000000. running mean: -17.818192
resetting env. episode 354.000000, reward total was -14.000000. running mean: -17.780010
resetting env. episode 355.000000, reward total was -19.000000. running mean: -17.792210
resetting env. episode 356.000000, reward total was -20.000000. running mean: -17.814288
resetting env. episode 357.000000, reward total was -19.000000. running mean: -17.826145
resetting env. episode 358.000000, reward total was -18.000000. running mean: -17.827883
resetting env. episode 359.000000, reward total was -19.000000. running mean: -17.839604
resetting env. episode 360.000000, reward total was -19.000000. running mean: -17.851208
resetting env. episode 361.000000, reward total was -12.000000. running mean: -17.792696
resetting env. episode 362.000000, reward total was -17.000000. running mean: -17.784769
resetting env. episode 363.000000, reward total was -14.000000. running mean: -17.746922
resetting env. episode 364.000000, reward total was -18.000000. running mean: -17.749452
resetting env. episode 365.000000, reward total was -21.000000. running mean: -17.781958
resetting env. episode 366.000000, reward total was -19.000000. running mean: -17.794138
resetting env. episode 367.000000, reward total was -19.000000. running mean: -17.806197
resetting env. episode 368.000000, reward total was -21.000000. running mean: -17.838135
resetting env. episode 369.000000, reward total was -13.000000. running mean: -17.789754
resetting env. episode 370.000000, reward total was -21.000000. running mean: -17.821856
resetting env. episode 371.000000, reward total was -20.000000. running mean: -17.843637
resetting env. episode 372.000000, reward total was -19.000000. running mean: -17.855201
resetting env. episode 373.000000, reward total was -20.000000. running mean: -17.876649
resetting env. episode 374.000000, reward total was -17.000000. running mean: -17.867883
resetting env. episode 375.000000, reward total was -15.000000. running mean: -17.839204
resetting env. episode 376.000000, reward total was -20.000000. running mean: -17.860812
resetting env. episode 377.000000, reward total was -17.000000. running mean: -17.852204
resetting env. episode 378.000000, reward total was -12.000000. running mean: -17.793682
resetting env. episode 379.000000, reward total was -13.000000. running mean: -17.745745
resetting env. episode 380.000000, reward total was -19.000000. running mean: -17.758287
resetting env. episode 381.000000, reward total was -19.000000. running mean: -17.770704
resetting env. episode 382.000000, reward total was -19.000000. running mean: -17.782997
resetting env. episode 383.000000, reward total was -18.000000. running mean: -17.785167
resetting env. episode 384.000000, reward total was -19.000000. running mean: -17.797316
resetting env. episode 385.000000, reward total was -12.000000. running mean: -17.739343
resetting env. episode 386.000000, reward total was -20.000000. running mean: -17.761949
resetting env. episode 387.000000, reward total was -16.000000. running mean: -17.744330
resetting env. episode 388.000000, reward total was -18.000000. running mean: -17.746886
resetting env. episode 389.000000, reward total was -16.000000. running mean: -17.729418
resetting env. episode 390.000000, reward total was -17.000000. running mean: -17.722123
resetting env. episode 391.000000, reward total was -15.000000. running mean: -17.694902
resetting env. episode 392.000000, reward total was -20.000000. running mean: -17.717953
resetting env. episode 393.000000, reward total was -19.000000. running mean: -17.730774
resetting env. episode 394.000000, reward total was -15.000000. running mean: -17.703466
resetting env. episode 395.000000, reward total was -14.000000. running mean: -17.666431
resetting env. episode 396.000000, reward total was -15.000000. running mean: -17.639767
resetting env. episode 397.000000, reward total was -19.000000. running mean: -17.653369
resetting env. episode 398.000000, reward total was -13.000000. running mean: -17.606835
resetting env. episode 399.000000, reward total was -19.000000. running mean: -17.620767
resetting env. episode 400.000000, reward total was -17.000000. running mean: -17.614559
resetting env. episode 401.000000, reward total was -17.000000. running mean: -17.608414
resetting env. episode 402.000000, reward total was -19.000000. running mean: -17.622330
resetting env. episode 403.000000, reward total was -19.000000. running mean: -17.636106
resetting env. episode 404.000000, reward total was -20.000000. running mean: -17.659745
resetting env. episode 405.000000, reward total was -21.000000. running mean: -17.693148
resetting env. episode 406.000000, reward total was -15.000000. running mean: -17.666216
resetting env. episode 407.000000, reward total was -20.000000. running mean: -17.689554
resetting env. episode 408.000000, reward total was -18.000000. running mean: -17.692659
resetting env. episode 409.000000, reward total was -16.000000. running mean: -17.675732
resetting env. episode 410.000000, reward total was -20.000000. running mean: -17.698975
resetting env. episode 411.000000, reward total was -14.000000. running mean: -17.661985
resetting env. episode 412.000000, reward total was -19.000000. running mean: -17.675365
resetting env. episode 413.000000, reward total was -17.000000. running mean: -17.668612
resetting env. episode 414.000000, reward total was -18.000000. running mean: -17.671925
resetting env. episode 415.000000, reward total was -14.000000. running mean: -17.635206
resetting env. episode 416.000000, reward total was -18.000000. running mean: -17.638854
resetting env. episode 417.000000, reward total was -20.000000. running mean: -17.662466
resetting env. episode 418.000000, reward total was -14.000000. running mean: -17.625841
resetting env. episode 419.000000, reward total was -21.000000. running mean: -17.659583
resetting env. episode 420.000000, reward total was -20.000000. running mean: -17.682987
resetting env. episode 421.000000, reward total was -19.000000. running mean: -17.696157
resetting env. episode 422.000000, reward total was -16.000000. running mean: -17.679195
resetting env. episode 423.000000, reward total was -17.000000. running mean: -17.672403
resetting env. episode 424.000000, reward total was -17.000000. running mean: -17.665679
resetting env. episode 425.000000, reward total was -20.000000. running mean: -17.689022
resetting env. episode 426.000000, reward total was -17.000000. running mean: -17.682132
resetting env. episode 427.000000, reward total was -21.000000. running mean: -17.715311
resetting env. episode 428.000000, reward total was -18.000000. running mean: -17.718158
resetting env. episode 429.000000, reward total was -17.000000. running mean: -17.710976
resetting env. episode 430.000000, reward total was -18.000000. running mean: -17.713866
resetting env. episode 431.000000, reward total was -15.000000. running mean: -17.686728
resetting env. episode 432.000000, reward total was -13.000000. running mean: -17.639861
resetting env. episode 433.000000, reward total was -19.000000. running mean: -17.653462
resetting env. episode 434.000000, reward total was -17.000000. running mean: -17.646927
resetting env. episode 435.000000, reward total was -17.000000. running mean: -17.640458
resetting env. episode 436.000000, reward total was -21.000000. running mean: -17.674053
resetting env. episode 437.000000, reward total was -19.000000. running mean: -17.687313
resetting env. episode 438.000000, reward total was -18.000000. running mean: -17.690440
resetting env. episode 439.000000, reward total was -16.000000. running mean: -17.673535
resetting env. episode 440.000000, reward total was -19.000000. running mean: -17.686800
resetting env. episode 441.000000, reward total was -19.000000. running mean: -17.699932
resetting env. episode 442.000000, reward total was -17.000000. running mean: -17.692933
resetting env. episode 443.000000, reward total was -16.000000. running mean: -17.676003
resetting env. episode 444.000000, reward total was -12.000000. running mean: -17.619243
resetting env. episode 445.000000, reward total was -13.000000. running mean: -17.573051
resetting env. episode 446.000000, reward total was -14.000000. running mean: -17.537320
resetting env. episode 447.000000, reward total was -19.000000. running mean: -17.551947
resetting env. episode 448.000000, reward total was -16.000000. running mean: -17.536428
resetting env. episode 449.000000, reward total was -17.000000. running mean: -17.531063
resetting env. episode 450.000000, reward total was -16.000000. running mean: -17.515753
resetting env. episode 451.000000, reward total was -14.000000. running mean: -17.480595
resetting env. episode 452.000000, reward total was -15.000000. running mean: -17.455789
resetting env. episode 453.000000, reward total was -12.000000. running mean: -17.401231
resetting env. episode 454.000000, reward total was -21.000000. running mean: -17.437219
resetting env. episode 455.000000, reward total was -19.000000. running mean: -17.452847
resetting env. episode 456.000000, reward total was -17.000000. running mean: -17.448318
resetting env. episode 457.000000, reward total was -19.000000. running mean: -17.463835
resetting env. episode 458.000000, reward total was -16.000000. running mean: -17.449197
resetting env. episode 459.000000, reward total was -17.000000. running mean: -17.444705
resetting env. episode 460.000000, reward total was -18.000000. running mean: -17.450258
resetting env. episode 461.000000, reward total was -15.000000. running mean: -17.425755
resetting env. episode 462.000000, reward total was -17.000000. running mean: -17.421498
resetting env. episode 463.000000, reward total was -15.000000. running mean: -17.397283
resetting env. episode 464.000000, reward total was -11.000000. running mean: -17.333310
resetting env. episode 465.000000, reward total was -18.000000. running mean: -17.339977
resetting env. episode 466.000000, reward total was -18.000000. running mean: -17.346577
resetting env. episode 467.000000, reward total was -17.000000. running mean: -17.343111
resetting env. episode 468.000000, reward total was -14.000000. running mean: -17.309680
resetting env. episode 469.000000, reward total was -15.000000. running mean: -17.286583
resetting env. episode 470.000000, reward total was -19.000000. running mean: -17.303718
resetting env. episode 471.000000, reward total was -16.000000. running mean: -17.290680
resetting env. episode 472.000000, reward total was -17.000000. running mean: -17.287774
resetting env. episode 473.000000, reward total was -20.000000. running mean: -17.314896
resetting env. episode 474.000000, reward total was -15.000000. running mean: -17.291747
resetting env. episode 475.000000, reward total was -13.000000. running mean: -17.248829
resetting env. episode 476.000000, reward total was -17.000000. running mean: -17.246341
resetting env. episode 477.000000, reward total was -18.000000. running mean: -17.253878
resetting env. episode 478.000000, reward total was -15.000000. running mean: -17.231339
resetting env. episode 479.000000, reward total was -18.000000. running mean: -17.239026
resetting env. episode 480.000000, reward total was -17.000000. running mean: -17.236635
resetting env. episode 481.000000, reward total was -19.000000. running mean: -17.254269
resetting env. episode 482.000000, reward total was -20.000000. running mean: -17.281726
resetting env. episode 483.000000, reward total was -11.000000. running mean: -17.218909
resetting env. episode 484.000000, reward total was -19.000000. running mean: -17.236720
resetting env. episode 485.000000, reward total was -18.000000. running mean: -17.244353
resetting env. episode 486.000000, reward total was -19.000000. running mean: -17.261909
resetting env. episode 487.000000, reward total was -18.000000. running mean: -17.269290
resetting env. episode 488.000000, reward total was -15.000000. running mean: -17.246597
resetting env. episode 489.000000, reward total was -17.000000. running mean: -17.244131
resetting env. episode 490.000000, reward total was -18.000000. running mean: -17.251690
resetting env. episode 491.000000, reward total was -13.000000. running mean: -17.209173
resetting env. episode 492.000000, reward total was -17.000000. running mean: -17.207081
resetting env. episode 493.000000, reward total was -18.000000. running mean: -17.215010
resetting env. episode 494.000000, reward total was -19.000000. running mean: -17.232860
resetting env. episode 495.000000, reward total was -17.000000. running mean: -17.230532
resetting env. episode 496.000000, reward total was -17.000000. running mean: -17.228226
resetting env. episode 497.000000, reward total was -16.000000. running mean: -17.215944
resetting env. episode 498.000000, reward total was -19.000000. running mean: -17.233785
resetting env. episode 499.000000, reward total was -17.000000. running mean: -17.231447
resetting env. episode 500.000000, reward total was -18.000000. running mean: -17.239132
CPU times: user 49min 13s, sys: 17min 8s, total: 1h 6min 21s
Wall time: 33min 35s

In [0]:
play_game(env, model)


Episode finished without success, accumulated reward = -4.0


Once Loop Reflect

In [0]:
%time hist3 = train_model(env, model, total_episodes=1000)


resetting env. episode 1.000000, reward total was -18.000000. running mean: -18.000000
resetting env. episode 2.000000, reward total was -11.000000. running mean: -17.930000
resetting env. episode 3.000000, reward total was -11.000000. running mean: -17.860700
resetting env. episode 4.000000, reward total was -12.000000. running mean: -17.802093
resetting env. episode 5.000000, reward total was -17.000000. running mean: -17.794072
resetting env. episode 6.000000, reward total was -15.000000. running mean: -17.766131
resetting env. episode 7.000000, reward total was -14.000000. running mean: -17.728470
resetting env. episode 8.000000, reward total was -13.000000. running mean: -17.681185
resetting env. episode 9.000000, reward total was -19.000000. running mean: -17.694373
resetting env. episode 10.000000, reward total was -17.000000. running mean: -17.687430
resetting env. episode 11.000000, reward total was -5.000000. running mean: -17.560555
resetting env. episode 12.000000, reward total was -19.000000. running mean: -17.574950
resetting env. episode 13.000000, reward total was -13.000000. running mean: -17.529200
resetting env. episode 14.000000, reward total was -12.000000. running mean: -17.473908
resetting env. episode 15.000000, reward total was -12.000000. running mean: -17.419169
resetting env. episode 16.000000, reward total was -16.000000. running mean: -17.404978
resetting env. episode 17.000000, reward total was -7.000000. running mean: -17.300928
resetting env. episode 18.000000, reward total was -10.000000. running mean: -17.227919
resetting env. episode 19.000000, reward total was -12.000000. running mean: -17.175639
resetting env. episode 20.000000, reward total was -11.000000. running mean: -17.113883
resetting env. episode 21.000000, reward total was -11.000000. running mean: -17.052744
resetting env. episode 22.000000, reward total was -8.000000. running mean: -16.962217
resetting env. episode 23.000000, reward total was -16.000000. running mean: -16.952595
resetting env. episode 24.000000, reward total was -10.000000. running mean: -16.883069
resetting env. episode 25.000000, reward total was -5.000000. running mean: -16.764238
resetting env. episode 26.000000, reward total was -14.000000. running mean: -16.736596
resetting env. episode 27.000000, reward total was -13.000000. running mean: -16.699230
resetting env. episode 28.000000, reward total was -10.000000. running mean: -16.632237
resetting env. episode 29.000000, reward total was -17.000000. running mean: -16.635915
resetting env. episode 30.000000, reward total was -16.000000. running mean: -16.629556
resetting env. episode 31.000000, reward total was -13.000000. running mean: -16.593260
resetting env. episode 32.000000, reward total was -9.000000. running mean: -16.517328
resetting env. episode 33.000000, reward total was -11.000000. running mean: -16.462154
resetting env. episode 34.000000, reward total was -11.000000. running mean: -16.407533
resetting env. episode 35.000000, reward total was -11.000000. running mean: -16.353457
resetting env. episode 36.000000, reward total was -17.000000. running mean: -16.359923
resetting env. episode 37.000000, reward total was -5.000000. running mean: -16.246324
resetting env. episode 38.000000, reward total was -13.000000. running mean: -16.213860
resetting env. episode 39.000000, reward total was -7.000000. running mean: -16.121722
resetting env. episode 40.000000, reward total was -15.000000. running mean: -16.110505
resetting env. episode 41.000000, reward total was -16.000000. running mean: -16.109400
resetting env. episode 42.000000, reward total was -15.000000. running mean: -16.098306
resetting env. episode 43.000000, reward total was -9.000000. running mean: -16.027322
resetting env. episode 44.000000, reward total was -13.000000. running mean: -15.997049
resetting env. episode 45.000000, reward total was -19.000000. running mean: -16.027079
resetting env. episode 46.000000, reward total was -13.000000. running mean: -15.996808
resetting env. episode 47.000000, reward total was -14.000000. running mean: -15.976840
resetting env. episode 48.000000, reward total was -13.000000. running mean: -15.947072
resetting env. episode 49.000000, reward total was -8.000000. running mean: -15.867601
resetting env. episode 50.000000, reward total was -12.000000. running mean: -15.828925
resetting env. episode 51.000000, reward total was -12.000000. running mean: -15.790636
resetting env. episode 52.000000, reward total was -15.000000. running mean: -15.782729
resetting env. episode 53.000000, reward total was -16.000000. running mean: -15.784902
resetting env. episode 54.000000, reward total was -14.000000. running mean: -15.767053
resetting env. episode 55.000000, reward total was -11.000000. running mean: -15.719382
resetting env. episode 56.000000, reward total was -14.000000. running mean: -15.702189
resetting env. episode 57.000000, reward total was -10.000000. running mean: -15.645167
resetting env. episode 58.000000, reward total was -20.000000. running mean: -15.688715
resetting env. episode 59.000000, reward total was -5.000000. running mean: -15.581828
resetting env. episode 60.000000, reward total was -13.000000. running mean: -15.556010
resetting env. episode 61.000000, reward total was -11.000000. running mean: -15.510449
resetting env. episode 62.000000, reward total was -10.000000. running mean: -15.455345
resetting env. episode 63.000000, reward total was -6.000000. running mean: -15.360792
resetting env. episode 64.000000, reward total was -13.000000. running mean: -15.337184
resetting env. episode 65.000000, reward total was -12.000000. running mean: -15.303812
resetting env. episode 66.000000, reward total was -19.000000. running mean: -15.340774
resetting env. episode 67.000000, reward total was -18.000000. running mean: -15.367366
resetting env. episode 68.000000, reward total was -10.000000. running mean: -15.313692
resetting env. episode 69.000000, reward total was -17.000000. running mean: -15.330555
resetting env. episode 70.000000, reward total was -13.000000. running mean: -15.307250
resetting env. episode 71.000000, reward total was -7.000000. running mean: -15.224177
resetting env. episode 72.000000, reward total was -15.000000. running mean: -15.221935
resetting env. episode 73.000000, reward total was -11.000000. running mean: -15.179716
resetting env. episode 74.000000, reward total was -12.000000. running mean: -15.147919
resetting env. episode 75.000000, reward total was -13.000000. running mean: -15.126440
resetting env. episode 76.000000, reward total was -19.000000. running mean: -15.165175
resetting env. episode 77.000000, reward total was -8.000000. running mean: -15.093524
resetting env. episode 78.000000, reward total was -13.000000. running mean: -15.072588
resetting env. episode 79.000000, reward total was -12.000000. running mean: -15.041863
resetting env. episode 80.000000, reward total was -13.000000. running mean: -15.021444
resetting env. episode 81.000000, reward total was -15.000000. running mean: -15.021229
resetting env. episode 82.000000, reward total was -9.000000. running mean: -14.961017
resetting env. episode 83.000000, reward total was -9.000000. running mean: -14.901407
resetting env. episode 84.000000, reward total was -5.000000. running mean: -14.802393
resetting env. episode 85.000000, reward total was -11.000000. running mean: -14.764369
resetting env. episode 86.000000, reward total was -9.000000. running mean: -14.706725
resetting env. episode 87.000000, reward total was -12.000000. running mean: -14.679658
resetting env. episode 88.000000, reward total was -11.000000. running mean: -14.642861
resetting env. episode 89.000000, reward total was -11.000000. running mean: -14.606433
resetting env. episode 90.000000, reward total was -13.000000. running mean: -14.590369
resetting env. episode 91.000000, reward total was -14.000000. running mean: -14.584465
resetting env. episode 92.000000, reward total was -13.000000. running mean: -14.568620
resetting env. episode 93.000000, reward total was -13.000000. running mean: -14.552934
resetting env. episode 94.000000, reward total was -5.000000. running mean: -14.457405
resetting env. episode 95.000000, reward total was -17.000000. running mean: -14.482831
resetting env. episode 96.000000, reward total was -17.000000. running mean: -14.508002
resetting env. episode 97.000000, reward total was -11.000000. running mean: -14.472922
resetting env. episode 98.000000, reward total was -15.000000. running mean: -14.478193
resetting env. episode 99.000000, reward total was -11.000000. running mean: -14.443411
resetting env. episode 100.000000, reward total was -7.000000. running mean: -14.368977
resetting env. episode 101.000000, reward total was -13.000000. running mean: -14.355287
resetting env. episode 102.000000, reward total was -8.000000. running mean: -14.291734
resetting env. episode 103.000000, reward total was -11.000000. running mean: -14.258817
resetting env. episode 104.000000, reward total was -3.000000. running mean: -14.146229
resetting env. episode 105.000000, reward total was -15.000000. running mean: -14.154767
resetting env. episode 106.000000, reward total was -15.000000. running mean: -14.163219
resetting env. episode 107.000000, reward total was -13.000000. running mean: -14.151587
resetting env. episode 108.000000, reward total was -14.000000. running mean: -14.150071
resetting env. episode 109.000000, reward total was -13.000000. running mean: -14.138570
resetting env. episode 110.000000, reward total was -7.000000. running mean: -14.067184
resetting env. episode 111.000000, reward total was -16.000000. running mean: -14.086513
resetting env. episode 112.000000, reward total was -14.000000. running mean: -14.085647
resetting env. episode 113.000000, reward total was -14.000000. running mean: -14.084791
resetting env. episode 114.000000, reward total was -10.000000. running mean: -14.043943
resetting env. episode 115.000000, reward total was -14.000000. running mean: -14.043504
resetting env. episode 116.000000, reward total was -15.000000. running mean: -14.053069
resetting env. episode 117.000000, reward total was -8.000000. running mean: -13.992538
resetting env. episode 118.000000, reward total was -9.000000. running mean: -13.942613
resetting env. episode 119.000000, reward total was -12.000000. running mean: -13.923186
resetting env. episode 120.000000, reward total was -17.000000. running mean: -13.953955
resetting env. episode 121.000000, reward total was -14.000000. running mean: -13.954415
resetting env. episode 122.000000, reward total was -9.000000. running mean: -13.904871
resetting env. episode 123.000000, reward total was -17.000000. running mean: -13.935822
resetting env. episode 124.000000, reward total was -9.000000. running mean: -13.886464
resetting env. episode 125.000000, reward total was -14.000000. running mean: -13.887599
resetting env. episode 126.000000, reward total was -15.000000. running mean: -13.898723
resetting env. episode 127.000000, reward total was -17.000000. running mean: -13.929736
resetting env. episode 128.000000, reward total was -15.000000. running mean: -13.940439
resetting env. episode 129.000000, reward total was -6.000000. running mean: -13.861034
resetting env. episode 130.000000, reward total was -10.000000. running mean: -13.822424
resetting env. episode 131.000000, reward total was -17.000000. running mean: -13.854200
resetting env. episode 132.000000, reward total was -10.000000. running mean: -13.815658
resetting env. episode 133.000000, reward total was -9.000000. running mean: -13.767501
resetting env. episode 134.000000, reward total was -13.000000. running mean: -13.759826
resetting env. episode 135.000000, reward total was -16.000000. running mean: -13.782228
resetting env. episode 136.000000, reward total was -8.000000. running mean: -13.724406
resetting env. episode 137.000000, reward total was -13.000000. running mean: -13.717162
resetting env. episode 138.000000, reward total was -11.000000. running mean: -13.689990
resetting env. episode 139.000000, reward total was -12.000000. running mean: -13.673090
resetting env. episode 140.000000, reward total was -11.000000. running mean: -13.646359
resetting env. episode 141.000000, reward total was -13.000000. running mean: -13.639896
resetting env. episode 142.000000, reward total was -16.000000. running mean: -13.663497
resetting env. episode 143.000000, reward total was -11.000000. running mean: -13.636862
resetting env. episode 144.000000, reward total was -19.000000. running mean: -13.690493
resetting env. episode 145.000000, reward total was -6.000000. running mean: -13.613588
resetting env. episode 146.000000, reward total was -17.000000. running mean: -13.647452
resetting env. episode 147.000000, reward total was -11.000000. running mean: -13.620978
resetting env. episode 148.000000, reward total was -5.000000. running mean: -13.534768
resetting env. episode 149.000000, reward total was -12.000000. running mean: -13.519420
resetting env. episode 150.000000, reward total was -8.000000. running mean: -13.464226
resetting env. episode 151.000000, reward total was -11.000000. running mean: -13.439584
resetting env. episode 152.000000, reward total was -13.000000. running mean: -13.435188
resetting env. episode 153.000000, reward total was -12.000000. running mean: -13.420836
resetting env. episode 154.000000, reward total was -9.000000. running mean: -13.376628
resetting env. episode 155.000000, reward total was -15.000000. running mean: -13.392861
resetting env. episode 156.000000, reward total was -15.000000. running mean: -13.408933
resetting env. episode 157.000000, reward total was -16.000000. running mean: -13.434843
resetting env. episode 158.000000, reward total was -19.000000. running mean: -13.490495
resetting env. episode 159.000000, reward total was -4.000000. running mean: -13.395590
resetting env. episode 160.000000, reward total was -16.000000. running mean: -13.421634
resetting env. episode 161.000000, reward total was -13.000000. running mean: -13.417418
resetting env. episode 162.000000, reward total was -5.000000. running mean: -13.333244
resetting env. episode 163.000000, reward total was -12.000000. running mean: -13.319911
resetting env. episode 164.000000, reward total was -17.000000. running mean: -13.356712
resetting env. episode 165.000000, reward total was -14.000000. running mean: -13.363145
resetting env. episode 166.000000, reward total was -11.000000. running mean: -13.339514
resetting env. episode 167.000000, reward total was -15.000000. running mean: -13.356118
resetting env. episode 168.000000, reward total was -12.000000. running mean: -13.342557
resetting env. episode 169.000000, reward total was -6.000000. running mean: -13.269132
resetting env. episode 170.000000, reward total was -13.000000. running mean: -13.266440
resetting env. episode 171.000000, reward total was -16.000000. running mean: -13.293776
resetting env. episode 172.000000, reward total was -15.000000. running mean: -13.310838
resetting env. episode 173.000000, reward total was -13.000000. running mean: -13.307730
resetting env. episode 174.000000, reward total was -17.000000. running mean: -13.344652
resetting env. episode 175.000000, reward total was -13.000000. running mean: -13.341206
resetting env. episode 176.000000, reward total was -3.000000. running mean: -13.237794
resetting env. episode 177.000000, reward total was -7.000000. running mean: -13.175416
resetting env. episode 178.000000, reward total was -10.000000. running mean: -13.143662
resetting env. episode 179.000000, reward total was -8.000000. running mean: -13.092225
resetting env. episode 180.000000, reward total was -13.000000. running mean: -13.091303
resetting env. episode 181.000000, reward total was -15.000000. running mean: -13.110390
resetting env. episode 182.000000, reward total was -5.000000. running mean: -13.029286
resetting env. episode 183.000000, reward total was -16.000000. running mean: -13.058993
resetting env. episode 184.000000, reward total was -13.000000. running mean: -13.058403
resetting env. episode 185.000000, reward total was -16.000000. running mean: -13.087819
resetting env. episode 186.000000, reward total was -13.000000. running mean: -13.086941
resetting env. episode 187.000000, reward total was -9.000000. running mean: -13.046072
resetting env. episode 188.000000, reward total was -7.000000. running mean: -12.985611
resetting env. episode 189.000000, reward total was -13.000000. running mean: -12.985755
resetting env. episode 190.000000, reward total was -18.000000. running mean: -13.035897
resetting env. episode 191.000000, reward total was -14.000000. running mean: -13.045538
resetting env. episode 192.000000, reward total was -11.000000. running mean: -13.025083
resetting env. episode 193.000000, reward total was -17.000000. running mean: -13.064832
resetting env. episode 194.000000, reward total was -15.000000. running mean: -13.084184
resetting env. episode 195.000000, reward total was -15.000000. running mean: -13.103342
resetting env. episode 196.000000, reward total was -7.000000. running mean: -13.042308
resetting env. episode 197.000000, reward total was -9.000000. running mean: -13.001885
resetting env. episode 198.000000, reward total was -10.000000. running mean: -12.971867
resetting env. episode 199.000000, reward total was -3.000000. running mean: -12.872148
resetting env. episode 200.000000, reward total was -13.000000. running mean: -12.873426
resetting env. episode 201.000000, reward total was -11.000000. running mean: -12.854692
resetting env. episode 202.000000, reward total was -3.000000. running mean: -12.756145
resetting env. episode 203.000000, reward total was -15.000000. running mean: -12.778584
resetting env. episode 204.000000, reward total was -13.000000. running mean: -12.780798
resetting env. episode 205.000000, reward total was -11.000000. running mean: -12.762990
resetting env. episode 206.000000, reward total was -11.000000. running mean: -12.745360
resetting env. episode 207.000000, reward total was -15.000000. running mean: -12.767906
resetting env. episode 208.000000, reward total was -10.000000. running mean: -12.740227
resetting env. episode 209.000000, reward total was -17.000000. running mean: -12.782825
resetting env. episode 210.000000, reward total was -15.000000. running mean: -12.804997
resetting env. episode 211.000000, reward total was -11.000000. running mean: -12.786947
resetting env. episode 212.000000, reward total was -3.000000. running mean: -12.689077
resetting env. episode 213.000000, reward total was -13.000000. running mean: -12.692187
resetting env. episode 214.000000, reward total was -14.000000. running mean: -12.705265
resetting env. episode 215.000000, reward total was -13.000000. running mean: -12.708212
resetting env. episode 216.000000, reward total was -5.000000. running mean: -12.631130
resetting env. episode 217.000000, reward total was -10.000000. running mean: -12.604819
resetting env. episode 218.000000, reward total was -12.000000. running mean: -12.598770
resetting env. episode 219.000000, reward total was -15.000000. running mean: -12.622783
resetting env. episode 220.000000, reward total was -10.000000. running mean: -12.596555
resetting env. episode 221.000000, reward total was -6.000000. running mean: -12.530589
resetting env. episode 222.000000, reward total was -7.000000. running mean: -12.475284
resetting env. episode 223.000000, reward total was -16.000000. running mean: -12.510531
resetting env. episode 224.000000, reward total was -19.000000. running mean: -12.575425
resetting env. episode 225.000000, reward total was -10.000000. running mean: -12.549671
resetting env. episode 226.000000, reward total was -12.000000. running mean: -12.544174
resetting env. episode 227.000000, reward total was -11.000000. running mean: -12.528733
resetting env. episode 228.000000, reward total was -7.000000. running mean: -12.473445
resetting env. episode 229.000000, reward total was -13.000000. running mean: -12.478711
resetting env. episode 230.000000, reward total was -14.000000. running mean: -12.493924
resetting env. episode 231.000000, reward total was -5.000000. running mean: -12.418985
resetting env. episode 232.000000, reward total was -11.000000. running mean: -12.404795
resetting env. episode 233.000000, reward total was -12.000000. running mean: -12.400747
resetting env. episode 234.000000, reward total was -13.000000. running mean: -12.406739
resetting env. episode 235.000000, reward total was -15.000000. running mean: -12.432672
resetting env. episode 236.000000, reward total was -19.000000. running mean: -12.498345
resetting env. episode 237.000000, reward total was -14.000000. running mean: -12.513362
resetting env. episode 238.000000, reward total was -17.000000. running mean: -12.558228
resetting env. episode 239.000000, reward total was -16.000000. running mean: -12.592646
resetting env. episode 240.000000, reward total was -9.000000. running mean: -12.556719
resetting env. episode 241.000000, reward total was -10.000000. running mean: -12.531152
resetting env. episode 242.000000, reward total was -5.000000. running mean: -12.455841
resetting env. episode 243.000000, reward total was -9.000000. running mean: -12.421282
resetting env. episode 244.000000, reward total was -18.000000. running mean: -12.477069
resetting env. episode 245.000000, reward total was -13.000000. running mean: -12.482299
resetting env. episode 246.000000, reward total was -8.000000. running mean: -12.437476
resetting env. episode 247.000000, reward total was -9.000000. running mean: -12.403101
resetting env. episode 248.000000, reward total was -8.000000. running mean: -12.359070
resetting env. episode 249.000000, reward total was -19.000000. running mean: -12.425479
resetting env. episode 250.000000, reward total was -9.000000. running mean: -12.391224
resetting env. episode 251.000000, reward total was -14.000000. running mean: -12.407312
resetting env. episode 252.000000, reward total was -6.000000. running mean: -12.343239
resetting env. episode 253.000000, reward total was -16.000000. running mean: -12.379807
resetting env. episode 254.000000, reward total was -10.000000. running mean: -12.356009
resetting env. episode 255.000000, reward total was -11.000000. running mean: -12.342449
resetting env. episode 256.000000, reward total was -11.000000. running mean: -12.329024
resetting env. episode 257.000000, reward total was -18.000000. running mean: -12.385734
resetting env. episode 258.000000, reward total was -11.000000. running mean: -12.371876
resetting env. episode 259.000000, reward total was -12.000000. running mean: -12.368158
resetting env. episode 260.000000, reward total was -15.000000. running mean: -12.394476
resetting env. episode 261.000000, reward total was -10.000000. running mean: -12.370531
resetting env. episode 262.000000, reward total was -12.000000. running mean: -12.366826
resetting env. episode 263.000000, reward total was -17.000000. running mean: -12.413158
resetting env. episode 264.000000, reward total was -9.000000. running mean: -12.379026
resetting env. episode 265.000000, reward total was -15.000000. running mean: -12.405236
resetting env. episode 266.000000, reward total was -14.000000. running mean: -12.421184
resetting env. episode 267.000000, reward total was -12.000000. running mean: -12.416972
resetting env. episode 268.000000, reward total was -11.000000. running mean: -12.402802
resetting env. episode 269.000000, reward total was -8.000000. running mean: -12.358774
resetting env. episode 270.000000, reward total was -14.000000. running mean: -12.375186
resetting env. episode 271.000000, reward total was -15.000000. running mean: -12.401434
resetting env. episode 272.000000, reward total was -10.000000. running mean: -12.377420
resetting env. episode 273.000000, reward total was -11.000000. running mean: -12.363646
resetting env. episode 274.000000, reward total was -18.000000. running mean: -12.420009
resetting env. episode 275.000000, reward total was -17.000000. running mean: -12.465809
resetting env. episode 276.000000, reward total was -10.000000. running mean: -12.441151
resetting env. episode 277.000000, reward total was -11.000000. running mean: -12.426740
resetting env. episode 278.000000, reward total was -9.000000. running mean: -12.392472
resetting env. episode 279.000000, reward total was -8.000000. running mean: -12.348548
resetting env. episode 280.000000, reward total was -5.000000. running mean: -12.275062
resetting env. episode 281.000000, reward total was -15.000000. running mean: -12.302312
resetting env. episode 282.000000, reward total was -12.000000. running mean: -12.299288
resetting env. episode 283.000000, reward total was -9.000000. running mean: -12.266296
resetting env. episode 284.000000, reward total was -13.000000. running mean: -12.273633
resetting env. episode 285.000000, reward total was -13.000000. running mean: -12.280896
resetting env. episode 286.000000, reward total was -4.000000. running mean: -12.198087
resetting env. episode 287.000000, reward total was -19.000000. running mean: -12.266106
resetting env. episode 288.000000, reward total was -10.000000. running mean: -12.243445
resetting env. episode 289.000000, reward total was -14.000000. running mean: -12.261011
resetting env. episode 290.000000, reward total was -5.000000. running mean: -12.188401
resetting env. episode 291.000000, reward total was -11.000000. running mean: -12.176517
resetting env. episode 292.000000, reward total was -12.000000. running mean: -12.174752
resetting env. episode 293.000000, reward total was -8.000000. running mean: -12.133004
resetting env. episode 294.000000, reward total was -12.000000. running mean: -12.131674
resetting env. episode 295.000000, reward total was -17.000000. running mean: -12.180357
resetting env. episode 296.000000, reward total was -8.000000. running mean: -12.138554
resetting env. episode 297.000000, reward total was -16.000000. running mean: -12.177168
resetting env. episode 298.000000, reward total was -13.000000. running mean: -12.185397
resetting env. episode 299.000000, reward total was -15.000000. running mean: -12.213543
resetting env. episode 300.000000, reward total was -13.000000. running mean: -12.221407
resetting env. episode 301.000000, reward total was -20.000000. running mean: -12.299193
resetting env. episode 302.000000, reward total was -15.000000. running mean: -12.326201
resetting env. episode 303.000000, reward total was -16.000000. running mean: -12.362939
resetting env. episode 304.000000, reward total was -11.000000. running mean: -12.349310
resetting env. episode 305.000000, reward total was -13.000000. running mean: -12.355817
resetting env. episode 306.000000, reward total was -1.000000. running mean: -12.242258
resetting env. episode 307.000000, reward total was -3.000000. running mean: -12.149836
resetting env. episode 308.000000, reward total was -11.000000. running mean: -12.138338
resetting env. episode 309.000000, reward total was -15.000000. running mean: -12.166954
resetting env. episode 310.000000, reward total was -17.000000. running mean: -12.215285
resetting env. episode 311.000000, reward total was -13.000000. running mean: -12.223132
resetting env. episode 312.000000, reward total was -11.000000. running mean: -12.210900
resetting env. episode 313.000000, reward total was -9.000000. running mean: -12.178791
resetting env. episode 314.000000, reward total was -11.000000. running mean: -12.167004
resetting env. episode 315.000000, reward total was -12.000000. running mean: -12.165333
resetting env. episode 316.000000, reward total was -12.000000. running mean: -12.163680
resetting env. episode 317.000000, reward total was -7.000000. running mean: -12.112043
resetting env. episode 318.000000, reward total was -9.000000. running mean: -12.080923
resetting env. episode 319.000000, reward total was -15.000000. running mean: -12.110114
resetting env. episode 320.000000, reward total was -15.000000. running mean: -12.139013
resetting env. episode 321.000000, reward total was -8.000000. running mean: -12.097622
resetting env. episode 322.000000, reward total was -14.000000. running mean: -12.116646
resetting env. episode 323.000000, reward total was -6.000000. running mean: -12.055480
resetting env. episode 324.000000, reward total was -16.000000. running mean: -12.094925
resetting env. episode 325.000000, reward total was -12.000000. running mean: -12.093976
resetting env. episode 326.000000, reward total was -13.000000. running mean: -12.103036
resetting env. episode 327.000000, reward total was -7.000000. running mean: -12.052006
resetting env. episode 328.000000, reward total was -11.000000. running mean: -12.041486
resetting env. episode 329.000000, reward total was -14.000000. running mean: -12.061071
resetting env. episode 330.000000, reward total was -11.000000. running mean: -12.050460
resetting env. episode 331.000000, reward total was -15.000000. running mean: -12.079955
resetting env. episode 332.000000, reward total was -11.000000. running mean: -12.069156
resetting env. episode 333.000000, reward total was -12.000000. running mean: -12.068464
resetting env. episode 334.000000, reward total was -15.000000. running mean: -12.097780
resetting env. episode 335.000000, reward total was -18.000000. running mean: -12.156802
resetting env. episode 336.000000, reward total was -17.000000. running mean: -12.205234
resetting env. episode 337.000000, reward total was -9.000000. running mean: -12.173181
resetting env. episode 338.000000, reward total was -16.000000. running mean: -12.211450
resetting env. episode 339.000000, reward total was -12.000000. running mean: -12.209335
resetting env. episode 340.000000, reward total was -16.000000. running mean: -12.247242
resetting env. episode 341.000000, reward total was -16.000000. running mean: -12.284769
resetting env. episode 342.000000, reward total was -14.000000. running mean: -12.301922
resetting env. episode 343.000000, reward total was -11.000000. running mean: -12.288902
resetting env. episode 344.000000, reward total was -9.000000. running mean: -12.256013
resetting env. episode 345.000000, reward total was -14.000000. running mean: -12.273453
resetting env. episode 346.000000, reward total was -13.000000. running mean: -12.280719
resetting env. episode 347.000000, reward total was -7.000000. running mean: -12.227912
resetting env. episode 348.000000, reward total was -12.000000. running mean: -12.225632
resetting env. episode 349.000000, reward total was -8.000000. running mean: -12.183376
resetting env. episode 350.000000, reward total was -7.000000. running mean: -12.131542
resetting env. episode 351.000000, reward total was -19.000000. running mean: -12.200227
resetting env. episode 352.000000, reward total was -11.000000. running mean: -12.188225
resetting env. episode 353.000000, reward total was -10.000000. running mean: -12.166342
resetting env. episode 354.000000, reward total was -9.000000. running mean: -12.134679
resetting env. episode 355.000000, reward total was -11.000000. running mean: -12.123332
resetting env. episode 356.000000, reward total was -12.000000. running mean: -12.122099
resetting env. episode 357.000000, reward total was -15.000000. running mean: -12.150878
resetting env. episode 358.000000, reward total was -4.000000. running mean: -12.069369
resetting env. episode 359.000000, reward total was -9.000000. running mean: -12.038675
resetting env. episode 360.000000, reward total was -19.000000. running mean: -12.108289
resetting env. episode 361.000000, reward total was -18.000000. running mean: -12.167206
resetting env. episode 362.000000, reward total was -9.000000. running mean: -12.135534
resetting env. episode 363.000000, reward total was -16.000000. running mean: -12.174178
resetting env. episode 364.000000, reward total was -4.000000. running mean: -12.092437
resetting env. episode 365.000000, reward total was -6.000000. running mean: -12.031512
resetting env. episode 366.000000, reward total was -14.000000. running mean: -12.051197
resetting env. episode 367.000000, reward total was -11.000000. running mean: -12.040685
resetting env. episode 368.000000, reward total was -10.000000. running mean: -12.020278
resetting env. episode 369.000000, reward total was -17.000000. running mean: -12.070076
resetting env. episode 370.000000, reward total was -9.000000. running mean: -12.039375
resetting env. episode 371.000000, reward total was -11.000000. running mean: -12.028981
resetting env. episode 372.000000, reward total was -10.000000. running mean: -12.008691
resetting env. episode 373.000000, reward total was -14.000000. running mean: -12.028604
resetting env. episode 374.000000, reward total was -13.000000. running mean: -12.038318
resetting env. episode 375.000000, reward total was -10.000000. running mean: -12.017935
resetting env. episode 376.000000, reward total was -11.000000. running mean: -12.007756
resetting env. episode 377.000000, reward total was -10.000000. running mean: -11.987678
resetting env. episode 378.000000, reward total was -9.000000. running mean: -11.957801
resetting env. episode 379.000000, reward total was -11.000000. running mean: -11.948223
resetting env. episode 380.000000, reward total was -11.000000. running mean: -11.938741
resetting env. episode 381.000000, reward total was -15.000000. running mean: -11.969354
resetting env. episode 382.000000, reward total was -7.000000. running mean: -11.919660
resetting env. episode 383.000000, reward total was -5.000000. running mean: -11.850464
resetting env. episode 384.000000, reward total was -15.000000. running mean: -11.881959
resetting env. episode 385.000000, reward total was -13.000000. running mean: -11.893139
resetting env. episode 386.000000, reward total was -12.000000. running mean: -11.894208
resetting env. episode 387.000000, reward total was -13.000000. running mean: -11.905266
resetting env. episode 388.000000, reward total was -13.000000. running mean: -11.916213
resetting env. episode 389.000000, reward total was -7.000000. running mean: -11.867051
resetting env. episode 390.000000, reward total was -6.000000. running mean: -11.808381
resetting env. episode 391.000000, reward total was -5.000000. running mean: -11.740297
resetting env. episode 392.000000, reward total was -8.000000. running mean: -11.702894
resetting env. episode 393.000000, reward total was -12.000000. running mean: -11.705865
resetting env. episode 394.000000, reward total was -13.000000. running mean: -11.718806
resetting env. episode 395.000000, reward total was -9.000000. running mean: -11.691618
resetting env. episode 396.000000, reward total was -9.000000. running mean: -11.664702
resetting env. episode 397.000000, reward total was -11.000000. running mean: -11.658055
resetting env. episode 398.000000, reward total was -16.000000. running mean: -11.701474
resetting env. episode 399.000000, reward total was -12.000000. running mean: -11.704460
resetting env. episode 400.000000, reward total was -9.000000. running mean: -11.677415
resetting env. episode 401.000000, reward total was -16.000000. running mean: -11.720641
resetting env. episode 402.000000, reward total was -8.000000. running mean: -11.683435
resetting env. episode 403.000000, reward total was -14.000000. running mean: -11.706600
resetting env. episode 404.000000, reward total was -9.000000. running mean: -11.679534
resetting env. episode 405.000000, reward total was -11.000000. running mean: -11.672739
resetting env. episode 406.000000, reward total was -10.000000. running mean: -11.656011
resetting env. episode 407.000000, reward total was -9.000000. running mean: -11.629451
resetting env. episode 408.000000, reward total was -11.000000. running mean: -11.623157
resetting env. episode 409.000000, reward total was -8.000000. running mean: -11.586925
resetting env. episode 410.000000, reward total was -7.000000. running mean: -11.541056
resetting env. episode 411.000000, reward total was -6.000000. running mean: -11.485645
resetting env. episode 412.000000, reward total was -11.000000. running mean: -11.480789
resetting env. episode 413.000000, reward total was -10.000000. running mean: -11.465981
resetting env. episode 414.000000, reward total was -17.000000. running mean: -11.521321
resetting env. episode 415.000000, reward total was -15.000000. running mean: -11.556108
resetting env. episode 416.000000, reward total was -9.000000. running mean: -11.530547
resetting env. episode 417.000000, reward total was -9.000000. running mean: -11.505242
resetting env. episode 418.000000, reward total was -12.000000. running mean: -11.510189
resetting env. episode 419.000000, reward total was -16.000000. running mean: -11.555087
resetting env. episode 420.000000, reward total was -11.000000. running mean: -11.549536
resetting env. episode 421.000000, reward total was -12.000000. running mean: -11.554041
resetting env. episode 422.000000, reward total was -9.000000. running mean: -11.528501
resetting env. episode 423.000000, reward total was -10.000000. running mean: -11.513216
resetting env. episode 424.000000, reward total was -11.000000. running mean: -11.508083
resetting env. episode 425.000000, reward total was -3.000000. running mean: -11.423003
resetting env. episode 426.000000, reward total was -12.000000. running mean: -11.428773
resetting env. episode 427.000000, reward total was -11.000000. running mean: -11.424485
resetting env. episode 428.000000, reward total was -15.000000. running mean: -11.460240
resetting env. episode 429.000000, reward total was -12.000000. running mean: -11.465638
resetting env. episode 430.000000, reward total was -12.000000. running mean: -11.470981
resetting env. episode 431.000000, reward total was -9.000000. running mean: -11.446271
resetting env. episode 432.000000, reward total was -15.000000. running mean: -11.481809
resetting env. episode 433.000000, reward total was -12.000000. running mean: -11.486991
resetting env. episode 434.000000, reward total was -18.000000. running mean: -11.552121
resetting env. episode 435.000000, reward total was -15.000000. running mean: -11.586599
resetting env. episode 436.000000, reward total was -15.000000. running mean: -11.620733
resetting env. episode 437.000000, reward total was -13.000000. running mean: -11.634526
resetting env. episode 438.000000, reward total was -7.000000. running mean: -11.588181
resetting env. episode 439.000000, reward total was -12.000000. running mean: -11.592299
resetting env. episode 440.000000, reward total was -10.000000. running mean: -11.576376
resetting env. episode 441.000000, reward total was -15.000000. running mean: -11.610612
resetting env. episode 442.000000, reward total was -15.000000. running mean: -11.644506
resetting env. episode 443.000000, reward total was -20.000000. running mean: -11.728061
resetting env. episode 444.000000, reward total was -15.000000. running mean: -11.760781
resetting env. episode 445.000000, reward total was -12.000000. running mean: -11.763173
resetting env. episode 446.000000, reward total was -5.000000. running mean: -11.695541
resetting env. episode 447.000000, reward total was -17.000000. running mean: -11.748586
resetting env. episode 448.000000, reward total was -12.000000. running mean: -11.751100
resetting env. episode 449.000000, reward total was -14.000000. running mean: -11.773589
resetting env. episode 450.000000, reward total was -8.000000. running mean: -11.735853
resetting env. episode 451.000000, reward total was -17.000000. running mean: -11.788494
resetting env. episode 452.000000, reward total was -8.000000. running mean: -11.750609
resetting env. episode 453.000000, reward total was -13.000000. running mean: -11.763103
resetting env. episode 454.000000, reward total was -10.000000. running mean: -11.745472
resetting env. episode 455.000000, reward total was -15.000000. running mean: -11.778018
resetting env. episode 456.000000, reward total was -7.000000. running mean: -11.730237
resetting env. episode 457.000000, reward total was -9.000000. running mean: -11.702935
resetting env. episode 458.000000, reward total was -15.000000. running mean: -11.735906
resetting env. episode 459.000000, reward total was -12.000000. running mean: -11.738547
resetting env. episode 460.000000, reward total was -11.000000. running mean: -11.731161
resetting env. episode 461.000000, reward total was -15.000000. running mean: -11.763849
resetting env. episode 462.000000, reward total was -12.000000. running mean: -11.766211
resetting env. episode 463.000000, reward total was -6.000000. running mean: -11.708549
resetting env. episode 464.000000, reward total was -12.000000. running mean: -11.711463
resetting env. episode 465.000000, reward total was -13.000000. running mean: -11.724349
resetting env. episode 466.000000, reward total was -15.000000. running mean: -11.757105
resetting env. episode 467.000000, reward total was -8.000000. running mean: -11.719534
resetting env. episode 468.000000, reward total was -13.000000. running mean: -11.732339
resetting env. episode 469.000000, reward total was -13.000000. running mean: -11.745015
resetting env. episode 470.000000, reward total was -13.000000. running mean: -11.757565
resetting env. episode 471.000000, reward total was -10.000000. running mean: -11.739990
resetting env. episode 472.000000, reward total was -13.000000. running mean: -11.752590
resetting env. episode 473.000000, reward total was -12.000000. running mean: -11.755064
resetting env. episode 474.000000, reward total was -7.000000. running mean: -11.707513
resetting env. episode 475.000000, reward total was -8.000000. running mean: -11.670438
resetting env. episode 476.000000, reward total was -15.000000. running mean: -11.703734
resetting env. episode 477.000000, reward total was -8.000000. running mean: -11.666696
resetting env. episode 478.000000, reward total was -9.000000. running mean: -11.640029
resetting env. episode 479.000000, reward total was -17.000000. running mean: -11.693629
resetting env. episode 480.000000, reward total was 2.000000. running mean: -11.556693
resetting env. episode 481.000000, reward total was -16.000000. running mean: -11.601126
resetting env. episode 482.000000, reward total was -17.000000. running mean: -11.655115
resetting env. episode 483.000000, reward total was -5.000000. running mean: -11.588564
resetting env. episode 484.000000, reward total was -15.000000. running mean: -11.622678
resetting env. episode 485.000000, reward total was -15.000000. running mean: -11.656451
resetting env. episode 486.000000, reward total was -18.000000. running mean: -11.719887
resetting env. episode 487.000000, reward total was -15.000000. running mean: -11.752688
resetting env. episode 488.000000, reward total was -16.000000. running mean: -11.795161
resetting env. episode 489.000000, reward total was -17.000000. running mean: -11.847209
resetting env. episode 490.000000, reward total was -15.000000. running mean: -11.878737
resetting env. episode 491.000000, reward total was -12.000000. running mean: -11.879950
resetting env. episode 492.000000, reward total was -13.000000. running mean: -11.891150
resetting env. episode 493.000000, reward total was -2.000000. running mean: -11.792239
resetting env. episode 494.000000, reward total was -10.000000. running mean: -11.774316
resetting env. episode 495.000000, reward total was -8.000000. running mean: -11.736573
resetting env. episode 496.000000, reward total was -13.000000. running mean: -11.749207
resetting env. episode 497.000000, reward total was -9.000000. running mean: -11.721715
resetting env. episode 498.000000, reward total was -12.000000. running mean: -11.724498
resetting env. episode 499.000000, reward total was -15.000000. running mean: -11.757253
resetting env. episode 500.000000, reward total was -15.000000. running mean: -11.789681
CPU times: user 1h 25min 29s, sys: 29min 24s, total: 1h 54min 54s
Wall time: 58min 19s

In [0]:
# Evaluate the trained policy: presumably plays one episode of Pong with `model`
# and prints the accumulated reward (output below shows -1.0). `play_game` is
# defined earlier in the notebook — confirm it renders frames / resets the env.
play_game(env, model)


Episode finished without success, accumulated reward = -1.0


Once Loop Reflect

In [0]:
# Optional: persist the trained model to disk for later reuse.
# NOTE(review): if re-enabled, prefer a context manager so the file handle is
# closed deterministically:
#     with open('model.pkl', 'wb') as f:
#         pickle.dump(model, f)
# SECURITY: never pickle.load() a model file from an untrusted source —
# unpickling can execute arbitrary code.
# import pickle
# pickle.dump(model, open('model.pkl', 'wb'))

In [0]: