异步调参-checkpoint



In [2]:
import multiprocessing
import threading
import numpy as np
import tensorflow as tf
from emulator.main import Account
from agent.access import Access
from agent.main import ExplorerFramework

In [3]:
# Number of CPUs reported by the machine; one Worker is created per CPU.
NUMS_CPU = multiprocessing.cpu_count()

# Sizes handed to Access and to every Worker when they are constructed.
state_size, action_size = [50, 58, 5], 3

# Number of episodes each worker runs before its thread finishes.
max_episodes = 100

# Global results table: worker name -> list of per-episode scores.
GD = dict()

In [4]:
class Worker(ExplorerFramework):
    """Explorer that runs a fixed number of episodes and logs its scores.

    After every episode the worker publishes its running score list in the
    module-level ``GD`` dict under its own name.
    """

    def __init__(self, access, name, observation, action_size):
        # Pure pass-through; kept so the constructor signature stays explicit.
        super().__init__(access, name, observation, action_size)

    def run(self, sess, max_episodes, t_max=32):
        """Run ``max_episodes`` episodes and record each episode's score.

        Args:
            sess: session object forwarded unchanged to ``run_episode``.
            max_episodes: number of episodes to run.
            t_max: forwarded unchanged to ``run_episode`` (presumably the
                rollout length -- confirm against ExplorerFramework).

        Returns:
            The list of per-episode scores; the same list object that is
            stored in ``GD[str(self.name)]``.
        """
        episode_score_list = []
        for episode in range(1, max_episodes + 1):
            episode_score = self.run_episode(sess, t_max)
            episode_score_list.append(episode_score)
            GD[str(self.name)] = episode_score_list
            # Only the worker named 'W0' reports progress, so the output is
            # not interleaved across threads.
            if self.name == 'W0':
                # The episode counter is an integer: format it with %d so it
                # prints as "1" instead of "1.000000".
                print('Episode: %d, score: %f' % (episode, episode_score))
                print('\n')
        return episode_score_list

In [5]:
# Build the shared Access object and one Worker per CPU, run every worker in
# its own thread, then save once all threads have finished.
with tf.Session() as sess:
    with tf.device("/cpu:0"):
        # A single Access instance is handed to every worker.
        A = Access(state_size, action_size)
        F_list = [
            Worker(A, 'W%i' % i, state_size, action_size)
            for i in range(NUMS_CPU)
        ]
        COORD = tf.train.Coordinator()
        sess.run(tf.global_variables_initializer())
        # Make the graph read-only so no thread can add ops to it later.
        sess.graph.finalize()

        threads_list = []
        for ac in F_list:
            # Pass the bound method and its arguments directly. A
            # `lambda: ac.run(...)` would look `ac` up only when the new
            # thread actually runs, by which time the loop may already have
            # rebound it to the next worker.
            t = threading.Thread(target=ac.run, args=(sess, max_episodes))
            t.start()
            threads_list.append(t)
        # Block until every worker thread has terminated.
        COORD.join(threads_list)
        A.save(sess, 'model/saver_1.ckpt')


graph W0
graph W1
graph W2
graph W3
graph W4
graph W5
graph W6
graph W7
actor: 0.090625, actor_grad: 1.362861, policy mean: 0.893512, policy: 0.118618, entropy: -0.077558, actor_norm: 0.049565, critic: 4.667977, critic_grad: 3.430487, value: 0.042507, critic_norm: 4.625470, value_mean: -0.150253, advantage: 0.148631
Episode: 1.000000, score: -0.038315


actor: -0.003320, actor_grad: 0.374540, policy mean: 0.895223, policy: 0.043664, entropy: -0.096179, actor_norm: 0.049195, critic: 3.585539, critic_grad: 0.765224, value: 0.003829, critic_norm: 3.581710, value_mean: -0.044602, advantage: 0.055342
Episode: 2.000000, score: -0.092119


actor: -0.014347, actor_grad: 0.307685, policy mean: 1.207541, policy: 0.042874, entropy: -0.105553, actor_norm: 0.048333, critic: 2.414108, critic_grad: 0.378342, value: 0.002138, critic_norm: 2.411970, value_mean: -0.040465, advantage: 0.037759
Episode: 3.000000, score: -0.130915


actor: -0.029574, actor_grad: 0.110584, policy mean: 0.866080, policy: 0.030808, entropy: -0.106616, actor_norm: 0.046234, critic: 1.364013, critic_grad: 0.231983, value: 0.001505, critic_norm: 1.362509, value_mean: -0.039831, advantage: 0.036117
Episode: 4.000000, score: 0.021072


actor: -0.064033, actor_grad: 0.045339, policy mean: 1.059762, policy: 0.002519, entropy: -0.109302, actor_norm: 0.042751, critic: 0.648284, critic_grad: 0.115139, value: 0.000106, critic_norm: 0.648179, value_mean: -0.007665, advantage: 0.002344
Episode: 5.000000, score: -0.026771


actor: -0.083422, actor_grad: 0.041921, policy mean: 1.136698, policy: -0.012068, entropy: -0.109517, actor_norm: 0.038163, critic: 0.261566, critic_grad: 0.076375, value: 0.000204, critic_norm: 0.261362, value_mean: 0.000022, advantage: -0.010622
Episode: 6.000000, score: 0.045137


actor: -0.098065, actor_grad: 0.033546, policy mean: 1.119958, policy: -0.020578, entropy: -0.109656, actor_norm: 0.032169, critic: 0.085231, critic_grad: 0.056549, value: 0.000383, critic_norm: 0.084848, value_mean: 0.015533, advantage: -0.018208
Episode: 7.000000, score: 0.062518


actor: -0.090054, actor_grad: 0.009999, policy mean: 1.131094, policy: -0.005937, entropy: -0.109748, actor_norm: 0.025631, critic: 0.019245, critic_grad: 0.022052, value: 0.000043, critic_norm: 0.019201, value_mean: 0.007863, advantage: -0.005230
Episode: 8.000000, score: -0.081231


actor: -0.100973, actor_grad: 0.009670, policy mean: 1.118159, policy: -0.010677, entropy: -0.109807, actor_norm: 0.019511, critic: 0.003040, critic_grad: 0.020638, value: 0.000094, critic_norm: 0.002946, value_mean: 0.006565, advantage: -0.009542
Episode: 9.000000, score: 0.046182


actor: -0.113456, actor_grad: 0.011693, policy mean: 1.115513, policy: -0.017993, entropy: -0.109826, actor_norm: 0.014363, critic: 0.000682, critic_grad: 0.032488, value: 0.000275, critic_norm: 0.000408, value_mean: 0.007028, advantage: -0.016091
Episode: 10.000000, score: -0.100948


actor: -0.111334, actor_grad: 0.006125, policy mean: 1.122158, policy: -0.011550, entropy: -0.109829, actor_norm: 0.010045, critic: 0.000205, critic_grad: 0.020550, value: 0.000121, critic_norm: 0.000084, value_mean: 0.000377, advantage: -0.010251
Episode: 11.000000, score: 0.041007


actor: -0.097188, actor_grad: 0.003278, policy mean: 1.086937, policy: 0.005714, entropy: -0.109834, actor_norm: 0.006932, critic: 0.000063, critic_grad: 0.010636, value: 0.000049, critic_norm: 0.000014, value_mean: -0.000104, advantage: 0.005306
Episode: 12.000000, score: -0.053010


actor: -0.109619, actor_grad: 0.004121, policy mean: 1.112065, policy: -0.004401, entropy: -0.109837, actor_norm: 0.004618, critic: 0.000051, critic_grad: 0.007758, value: 0.000051, critic_norm: 0.000000, value_mean: -0.000695, advantage: -0.003882
Episode: 13.000000, score: 0.047703


actor: -0.116173, actor_grad: 0.004152, policy mean: 1.114945, policy: -0.009393, entropy: -0.109844, actor_norm: 0.003064, critic: 0.000098, critic_grad: 0.016748, value: 0.000098, critic_norm: 0.000000, value_mean: 0.000198, advantage: -0.008373
Episode: 14.000000, score: -0.091210


actor: -0.105367, actor_grad: 0.001131, policy mean: 1.099029, policy: 0.002501, entropy: -0.109855, actor_norm: 0.001988, critic: 0.000022, critic_grad: 0.004596, value: 0.000022, critic_norm: 0.000000, value_mean: -0.000469, advantage: 0.002296
Episode: 15.000000, score: 0.002441


actor: -0.111205, actor_grad: 0.000795, policy mean: 1.098648, policy: -0.002613, entropy: -0.109858, actor_norm: 0.001266, critic: 0.000026, critic_grad: 0.004721, value: 0.000026, critic_norm: 0.000000, value_mean: -0.000521, advantage: -0.002363
Episode: 16.000000, score: 0.114238


actor: -0.107344, actor_grad: 0.001372, policy mean: 1.095976, policy: 0.001735, entropy: -0.109859, actor_norm: 0.000780, critic: 0.000028, critic_grad: 0.003206, value: 0.000028, critic_norm: 0.000000, value_mean: -0.000804, advantage: 0.001599
Episode: 17.000000, score: -0.153728


actor: -0.107230, actor_grad: 0.000470, policy mean: 1.096913, policy: 0.002150, entropy: -0.109860, actor_norm: 0.000480, critic: 0.000009, critic_grad: 0.003913, value: 0.000009, critic_norm: 0.000000, value_mean: 0.000860, advantage: 0.001961
Episode: 18.000000, score: 0.122254


actor: -0.097284, actor_grad: 0.000763, policy mean: 1.097986, policy: 0.012281, entropy: -0.109861, actor_norm: 0.000296, critic: 0.000151, critic_grad: 0.022378, value: 0.000151, critic_norm: 0.000000, value_mean: -0.000280, advantage: 0.011188
Episode: 19.000000, score: 0.086509


actor: -0.108588, actor_grad: 0.001108, policy mean: 1.097704, policy: 0.001095, entropy: -0.109861, actor_norm: 0.000178, critic: 0.000043, critic_grad: 0.001997, value: 0.000043, critic_norm: 0.000000, value_mean: 0.000336, advantage: 0.001000
Episode: 20.000000, score: -0.009732


actor: -0.111335, actor_grad: 0.001163, policy mean: 1.099067, policy: -0.001581, entropy: -0.109861, actor_norm: 0.000106, critic: 0.000015, critic_grad: 0.002867, value: 0.000015, critic_norm: 0.000000, value_mean: -0.000325, advantage: -0.001435
Episode: 21.000000, score: 0.070722


actor: -0.112314, actor_grad: 0.001755, policy mean: 1.097527, policy: -0.002518, entropy: -0.109861, actor_norm: 0.000064, critic: 0.000019, critic_grad: 0.004575, value: 0.000019, critic_norm: 0.000000, value_mean: -0.001282, advantage: -0.002294
Episode: 22.000000, score: -0.117025


actor: -0.111387, actor_grad: 0.000430, policy mean: 1.098313, policy: -0.001566, entropy: -0.109861, actor_norm: 0.000039, critic: 0.000016, critic_grad: 0.002854, value: 0.000016, critic_norm: 0.000000, value_mean: 0.000368, advantage: -0.001425
Episode: 23.000000, score: 0.234179


actor: -0.101199, actor_grad: 0.002176, policy mean: 1.098879, policy: 0.008638, entropy: -0.109861, actor_norm: 0.000024, critic: 0.000099, critic_grad: 0.015725, value: 0.000099, critic_norm: 0.000000, value_mean: -0.000302, advantage: 0.007861
Episode: 24.000000, score: -0.076099


actor: -0.110954, actor_grad: 0.000620, policy mean: 1.098174, policy: -0.001108, entropy: -0.109861, actor_norm: 0.000015, critic: 0.000002, critic_grad: 0.002024, value: 0.000002, critic_norm: 0.000000, value_mean: 0.000442, advantage: -0.001010
Episode: 25.000000, score: 0.052271


actor: -0.098429, actor_grad: 0.001182, policy mean: 1.098822, policy: 0.011423, entropy: -0.109861, actor_norm: 0.000009, critic: 0.000132, critic_grad: 0.020789, value: 0.000132, critic_norm: 0.000000, value_mean: 0.000311, advantage: 0.010396
Episode: 26.000000, score: 0.067228


actor: -0.119894, actor_grad: 0.003107, policy mean: 1.098836, policy: -0.010039, entropy: -0.109861, actor_norm: 0.000006, critic: 0.000101, critic_grad: 0.018265, value: 0.000101, critic_norm: 0.000000, value_mean: -0.000519, advantage: -0.009135
Episode: 27.000000, score: 0.024435


actor: -0.106872, actor_grad: 0.001228, policy mean: 1.098447, policy: 0.002985, entropy: -0.109861, actor_norm: 0.000004, critic: 0.000017, critic_grad: 0.005445, value: 0.000017, critic_norm: 0.000000, value_mean: -0.000536, advantage: 0.002720
Episode: 28.000000, score: 0.028256


actor: -0.105251, actor_grad: 0.000839, policy mean: 1.098120, policy: 0.004607, entropy: -0.109861, actor_norm: 0.000003, critic: 0.000049, critic_grad: 0.008376, value: 0.000049, critic_norm: 0.000000, value_mean: 0.001055, advantage: 0.004193
Episode: 29.000000, score: 0.019067


actor: -0.114023, actor_grad: 0.002207, policy mean: 1.098320, policy: -0.004165, entropy: -0.109861, actor_norm: 0.000002, critic: 0.000073, critic_grad: 0.007565, value: 0.000073, critic_norm: 0.000000, value_mean: -0.001395, advantage: -0.003790
Episode: 30.000000, score: -0.174625


actor: -0.106601, actor_grad: 0.000944, policy mean: 1.097628, policy: 0.003258, entropy: -0.109861, actor_norm: 0.000002, critic: 0.000031, critic_grad: 0.005941, value: 0.000031, critic_norm: 0.000000, value_mean: -0.000480, advantage: 0.002968
Episode: 31.000000, score: -0.051041


actor: -0.117062, actor_grad: 0.002202, policy mean: 1.098984, policy: -0.007203, entropy: -0.109861, actor_norm: 0.000002, critic: 0.000053, critic_grad: 0.013100, value: 0.000053, critic_norm: 0.000000, value_mean: -0.000350, advantage: -0.006552
Episode: 32.000000, score: -0.264264


actor: -0.111070, actor_grad: 0.001499, policy mean: 1.098882, policy: -0.001211, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000023, critic_grad: 0.002191, value: 0.000023, critic_norm: 0.000000, value_mean: -0.000551, advantage: -0.001098
Episode: 33.000000, score: -0.067908


actor: -0.101469, actor_grad: 0.000279, policy mean: 1.098647, policy: 0.008391, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000071, critic_grad: 0.015273, value: 0.000071, critic_norm: 0.000000, value_mean: 0.000221, advantage: 0.007637
Episode: 34.000000, score: 0.061542


actor: -0.116780, actor_grad: 0.002231, policy mean: 1.098713, policy: -0.006921, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000066, critic_grad: 0.012591, value: 0.000066, critic_norm: 0.000000, value_mean: -0.000495, advantage: -0.006298
Episode: 35.000000, score: -0.189293


actor: -0.108452, actor_grad: 0.000960, policy mean: 1.098487, policy: 0.001408, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000044, critic_grad: 0.002576, value: 0.000044, critic_norm: 0.000000, value_mean: -0.001116, advantage: 0.001282
Episode: 36.000000, score: -0.218858


actor: -0.109246, actor_grad: 0.000751, policy mean: 1.097474, policy: 0.000614, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000019, critic_grad: 0.001115, value: 0.000019, critic_norm: 0.000000, value_mean: 0.000243, advantage: 0.000559
Episode: 37.000000, score: -0.010917


actor: -0.110934, actor_grad: 0.003317, policy mean: 1.098463, policy: -0.001074, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000062, critic_grad: 0.001952, value: 0.000062, critic_norm: 0.000000, value_mean: -0.000154, advantage: -0.000977
Episode: 38.000000, score: -0.020311


actor: -0.109961, actor_grad: 0.000342, policy mean: 1.098625, policy: -0.000101, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000005, critic_grad: 0.000189, value: 0.000005, critic_norm: 0.000000, value_mean: 0.000534, advantage: -0.000092
Episode: 39.000000, score: 0.065591


actor: -0.104962, actor_grad: 0.001002, policy mean: 1.098762, policy: 0.004898, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000037, critic_grad: 0.008922, value: 0.000037, critic_norm: 0.000000, value_mean: -0.000362, advantage: 0.004459
Episode: 40.000000, score: -0.035784


actor: -0.114373, actor_grad: 0.001475, policy mean: 1.098479, policy: -0.004513, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000022, critic_grad: 0.008218, value: 0.000022, critic_norm: 0.000000, value_mean: -0.000226, advantage: -0.004110
Episode: 41.000000, score: -0.028043


actor: -0.102715, actor_grad: 0.002246, policy mean: 1.098444, policy: 0.007144, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000067, critic_grad: 0.013020, value: 0.000067, critic_norm: 0.000000, value_mean: -0.001179, advantage: 0.006504
Episode: 42.000000, score: -0.057307


actor: -0.120382, actor_grad: 0.001208, policy mean: 1.098519, policy: -0.010523, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000114, critic_grad: 0.019132, value: 0.000114, critic_norm: 0.000000, value_mean: -0.001906, advantage: -0.009576
Episode: 43.000000, score: -0.024327


actor: -0.110295, actor_grad: 0.001687, policy mean: 1.098611, policy: -0.000436, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000021, critic_grad: 0.000774, value: 0.000021, critic_norm: 0.000000, value_mean: -0.000912, advantage: -0.000391
Episode: 44.000000, score: 0.078726


actor: -0.112649, actor_grad: 0.001351, policy mean: 1.098368, policy: -0.002789, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000022, critic_grad: 0.005071, value: 0.000022, critic_norm: 0.000000, value_mean: -0.000243, advantage: -0.002537
Episode: 45.000000, score: 0.085748


actor: -0.109934, actor_grad: 0.001732, policy mean: 1.098503, policy: -0.000074, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000014, critic_grad: 0.000131, value: 0.000014, critic_norm: 0.000000, value_mean: -0.000207, advantage: -0.000067
Episode: 46.000000, score: -0.003181


actor: -0.109341, actor_grad: 0.001170, policy mean: 1.098771, policy: 0.000519, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000022, critic_grad: 0.000945, value: 0.000022, critic_norm: 0.000000, value_mean: 0.000076, advantage: 0.000473
Episode: 47.000000, score: 0.118758


actor: -0.110400, actor_grad: 0.000978, policy mean: 1.098404, policy: -0.000540, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000022, critic_grad: 0.000984, value: 0.000022, critic_norm: 0.000000, value_mean: -0.000163, advantage: -0.000493
Episode: 48.000000, score: -0.006583


actor: -0.111844, actor_grad: 0.000972, policy mean: 1.098664, policy: -0.001984, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000020, critic_grad: 0.003607, value: 0.000020, critic_norm: 0.000000, value_mean: -0.000569, advantage: -0.001806
Episode: 49.000000, score: -0.029312


actor: -0.104940, actor_grad: 0.001968, policy mean: 1.098361, policy: 0.004919, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000036, critic_grad: 0.008956, value: 0.000036, critic_norm: 0.000000, value_mean: 0.000003, advantage: 0.004478
Episode: 50.000000, score: -0.083589


actor: -0.115840, actor_grad: 0.001499, policy mean: 1.098495, policy: -0.005980, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000050, critic_grad: 0.010883, value: 0.000050, critic_norm: 0.000000, value_mean: -0.000489, advantage: -0.005444
Episode: 51.000000, score: -0.131779


actor: -0.112767, actor_grad: 0.001141, policy mean: 1.098716, policy: -0.002907, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000025, critic_grad: 0.005285, value: 0.000025, critic_norm: 0.000000, value_mean: -0.000475, advantage: -0.002645
Episode: 52.000000, score: -0.029785


actor: -0.112899, actor_grad: 0.000433, policy mean: 1.098657, policy: -0.003039, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000014, critic_grad: 0.005534, value: 0.000014, critic_norm: 0.000000, value_mean: 0.000074, advantage: -0.002767
Episode: 53.000000, score: 0.021675


actor: -0.113804, actor_grad: 0.001691, policy mean: 1.098922, policy: -0.003944, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000046, critic_grad: 0.007163, value: 0.000046, critic_norm: 0.000000, value_mean: -0.000796, advantage: -0.003586
Episode: 54.000000, score: 0.112462


actor: -0.105444, actor_grad: 0.001438, policy mean: 1.098840, policy: 0.004416, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000026, critic_grad: 0.008046, value: 0.000026, critic_norm: 0.000000, value_mean: -0.000717, advantage: 0.004019
Episode: 55.000000, score: -0.061438


actor: -0.111201, actor_grad: 0.000653, policy mean: 1.099082, policy: -0.001342, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000011, critic_grad: 0.002432, value: 0.000011, critic_norm: 0.000000, value_mean: -0.000807, advantage: -0.001220
Episode: 56.000000, score: 0.014535


actor: -0.110883, actor_grad: 0.000309, policy mean: 1.098593, policy: -0.001023, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000006, critic_grad: 0.001864, value: 0.000006, critic_norm: 0.000000, value_mean: 0.000090, advantage: -0.000931
Episode: 57.000000, score: 0.137157


actor: -0.104750, actor_grad: 0.000481, policy mean: 1.098316, policy: 0.005110, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000031, critic_grad: 0.009314, value: 0.000031, critic_norm: 0.000000, value_mean: -0.000845, advantage: 0.004653
Episode: 58.000000, score: -0.198937


actor: -0.096076, actor_grad: 0.003454, policy mean: 1.098959, policy: 0.013783, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000191, critic_grad: 0.025087, value: 0.000191, critic_norm: 0.000000, value_mean: -0.000590, advantage: 0.012541
Episode: 59.000000, score: -0.027661


actor: -0.107911, actor_grad: 0.001587, policy mean: 1.097980, policy: 0.001949, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000026, critic_grad: 0.003557, value: 0.000026, critic_norm: 0.000000, value_mean: 0.000027, advantage: 0.001779
Episode: 60.000000, score: -0.011603


actor: -0.111786, actor_grad: 0.000805, policy mean: 1.098691, policy: -0.001926, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000021, critic_grad: 0.003507, value: 0.000021, critic_norm: 0.000000, value_mean: 0.000703, advantage: -0.001750
Episode: 61.000000, score: -0.078751


actor: -0.104125, actor_grad: 0.001721, policy mean: 1.097852, policy: 0.005734, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000047, critic_grad: 0.010450, value: 0.000047, critic_norm: 0.000000, value_mean: -0.000294, advantage: 0.005224
Episode: 62.000000, score: -0.001721


actor: -0.108500, actor_grad: 0.000324, policy mean: 1.099066, policy: 0.001360, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000012, critic_grad: 0.002473, value: 0.000012, critic_norm: 0.000000, value_mean: 0.000423, advantage: 0.001238
Episode: 63.000000, score: -0.172519


actor: -0.115317, actor_grad: 0.002216, policy mean: 1.100138, policy: -0.005457, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000030, critic_grad: 0.009928, value: 0.000030, critic_norm: 0.000000, value_mean: 0.000595, advantage: -0.004961
Episode: 64.000000, score: 0.118222


actor: -0.110430, actor_grad: 0.000997, policy mean: 1.098683, policy: -0.000570, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000019, critic_grad: 0.001041, value: 0.000019, critic_norm: 0.000000, value_mean: 0.000516, advantage: -0.000518
Episode: 65.000000, score: -0.011608


actor: -0.108101, actor_grad: 0.000878, policy mean: 1.098614, policy: 0.001759, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000016, critic_grad: 0.003198, value: 0.000016, critic_norm: 0.000000, value_mean: 0.000425, advantage: 0.001601
Episode: 66.000000, score: -0.118618


actor: -0.113371, actor_grad: 0.000290, policy mean: 1.098932, policy: -0.003511, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000016, critic_grad: 0.006378, value: 0.000016, critic_norm: 0.000000, value_mean: -0.001394, advantage: -0.003196
Episode: 67.000000, score: -0.107813


actor: -0.120266, actor_grad: 0.000283, policy mean: 1.098659, policy: -0.010406, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000110, critic_grad: 0.018937, value: 0.000110, critic_norm: 0.000000, value_mean: -0.000646, advantage: -0.009472
Episode: 68.000000, score: 0.002184


actor: -0.109005, actor_grad: 0.001762, policy mean: 1.098535, policy: 0.000855, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000020, critic_grad: 0.001557, value: 0.000020, critic_norm: 0.000000, value_mean: 0.000404, advantage: 0.000781
Episode: 69.000000, score: 0.028318


actor: -0.102241, actor_grad: 0.001063, policy mean: 1.098717, policy: 0.007619, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000063, critic_grad: 0.013877, value: 0.000063, critic_norm: 0.000000, value_mean: -0.000770, advantage: 0.006935
Episode: 70.000000, score: 0.004425


actor: -0.104143, actor_grad: 0.001010, policy mean: 1.098918, policy: 0.005717, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000053, critic_grad: 0.010407, value: 0.000053, critic_norm: 0.000000, value_mean: 0.000216, advantage: 0.005205
Episode: 71.000000, score: -0.026283


actor: -0.107227, actor_grad: 0.001340, policy mean: 1.098659, policy: 0.002633, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000020, critic_grad: 0.004788, value: 0.000020, critic_norm: 0.000000, value_mean: 0.000631, advantage: 0.002397
Episode: 72.000000, score: 0.081358


actor: -0.104126, actor_grad: 0.000748, policy mean: 1.098419, policy: 0.005734, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000049, critic_grad: 0.010437, value: 0.000049, critic_norm: 0.000000, value_mean: 0.000316, advantage: 0.005220
Episode: 73.000000, score: 0.047008


actor: -0.116782, actor_grad: 0.002880, policy mean: 1.098402, policy: -0.006922, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000067, critic_grad: 0.012604, value: 0.000067, critic_norm: 0.000000, value_mean: -0.000060, advantage: -0.006302
Episode: 74.000000, score: 0.016060


actor: -0.101296, actor_grad: 0.003135, policy mean: 1.098493, policy: 0.008564, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000083, critic_grad: 0.015599, value: 0.000083, critic_norm: 0.000000, value_mean: -0.000689, advantage: 0.007796
Episode: 75.000000, score: -0.126999


actor: -0.110208, actor_grad: 0.000253, policy mean: 1.098819, policy: -0.000348, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000002, critic_grad: 0.000628, value: 0.000002, critic_norm: 0.000000, value_mean: -0.000555, advantage: -0.000317
Episode: 76.000000, score: -0.204502


actor: -0.105105, actor_grad: 0.000445, policy mean: 1.098609, policy: 0.004755, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000035, critic_grad: 0.008658, value: 0.000035, critic_norm: 0.000000, value_mean: -0.000225, advantage: 0.004328
Episode: 77.000000, score: 0.081165


actor: -0.111759, actor_grad: 0.002006, policy mean: 1.098528, policy: -0.001899, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000025, critic_grad: 0.003461, value: 0.000025, critic_norm: 0.000000, value_mean: 0.000119, advantage: -0.001730
Episode: 78.000000, score: 0.033838


actor: -0.110928, actor_grad: 0.000836, policy mean: 1.098140, policy: -0.001068, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000014, critic_grad: 0.001933, value: 0.000014, critic_norm: 0.000000, value_mean: -0.000849, advantage: -0.000971
Episode: 79.000000, score: 0.040269


actor: -0.096554, actor_grad: 0.003120, policy mean: 1.098778, policy: 0.013306, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000186, critic_grad: 0.024220, value: 0.000186, critic_norm: 0.000000, value_mean: 0.000261, advantage: 0.012112
Episode: 80.000000, score: 0.019160


actor: -0.118507, actor_grad: 0.004113, policy mean: 1.098172, policy: -0.008647, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000090, critic_grad: 0.015747, value: 0.000090, critic_norm: 0.000000, value_mean: -0.000042, advantage: -0.007873
Episode: 81.000000, score: 0.041398


actor: -0.108791, actor_grad: 0.001689, policy mean: 1.098770, policy: 0.001069, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000028, critic_grad: 0.001953, value: 0.000028, critic_norm: 0.000000, value_mean: -0.000860, advantage: 0.000972
Episode: 82.000000, score: -0.027606


actor: -0.117999, actor_grad: 0.001365, policy mean: 1.098617, policy: -0.008139, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000087, critic_grad: 0.014809, value: 0.000087, critic_norm: 0.000000, value_mean: -0.000747, advantage: -0.007408
Episode: 83.000000, score: -0.053554


actor: -0.111485, actor_grad: 0.000337, policy mean: 1.098209, policy: -0.001625, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000009, critic_grad: 0.002947, value: 0.000009, critic_norm: 0.000000, value_mean: -0.001268, advantage: -0.001480
Episode: 84.000000, score: -0.224663


actor: -0.111787, actor_grad: 0.001387, policy mean: 1.098702, policy: -0.001927, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000034, critic_grad: 0.003515, value: 0.000034, critic_norm: 0.000000, value_mean: 0.000578, advantage: -0.001755
Episode: 85.000000, score: -0.011791


actor: -0.106720, actor_grad: 0.002009, policy mean: 1.099003, policy: 0.003141, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000046, critic_grad: 0.005712, value: 0.000046, critic_norm: 0.000000, value_mean: 0.000177, advantage: 0.002857
Episode: 86.000000, score: -0.160227


actor: -0.123860, actor_grad: 0.005419, policy mean: 1.098781, policy: -0.014000, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000197, critic_grad: 0.025488, value: 0.000197, critic_norm: 0.000000, value_mean: 0.000892, advantage: -0.012740
Episode: 87.000000, score: -0.028916


actor: -0.109825, actor_grad: 0.000764, policy mean: 1.098712, policy: 0.000035, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000009, critic_grad: 0.000055, value: 0.000009, critic_norm: 0.000000, value_mean: 0.000726, advantage: 0.000031
Episode: 88.000000, score: 0.125197


actor: -0.107545, actor_grad: 0.001632, policy mean: 1.098666, policy: 0.002316, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000040, critic_grad: 0.004229, value: 0.000040, critic_norm: 0.000000, value_mean: -0.001413, advantage: 0.002107
Episode: 89.000000, score: -0.112442


actor: -0.097322, actor_grad: 0.003270, policy mean: 1.098759, policy: 0.012538, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000160, critic_grad: 0.022828, value: 0.000160, critic_norm: 0.000000, value_mean: -0.000674, advantage: 0.011411
Episode: 90.000000, score: -0.076686


actor: -0.106065, actor_grad: 0.001327, policy mean: 1.098170, policy: 0.003795, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000031, critic_grad: 0.006910, value: 0.000031, critic_norm: 0.000000, value_mean: -0.000083, advantage: 0.003455
Episode: 91.000000, score: -0.132038


actor: -0.103674, actor_grad: 0.000376, policy mean: 1.098644, policy: 0.006187, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000041, critic_grad: 0.011267, value: 0.000041, critic_norm: 0.000000, value_mean: -0.000429, advantage: 0.005631
Episode: 92.000000, score: 0.134345


actor: -0.110360, actor_grad: 0.000570, policy mean: 1.098609, policy: -0.000500, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000003, critic_grad: 0.000914, value: 0.000003, critic_norm: 0.000000, value_mean: 0.000513, advantage: -0.000454
Episode: 93.000000, score: 0.012099


actor: -0.109639, actor_grad: 0.000683, policy mean: 1.098223, policy: 0.000221, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000008, critic_grad: 0.000406, value: 0.000008, critic_norm: 0.000000, value_mean: -0.000131, advantage: 0.000202
Episode: 94.000000, score: -0.023067


actor: -0.105525, actor_grad: 0.002549, policy mean: 1.099086, policy: 0.004335, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000029, critic_grad: 0.007891, value: 0.000029, critic_norm: 0.000000, value_mean: -0.000475, advantage: 0.003943
Episode: 95.000000, score: 0.034441


actor: -0.115369, actor_grad: 0.001771, policy mean: 1.099104, policy: -0.005508, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000042, critic_grad: 0.010023, value: 0.000042, critic_norm: 0.000000, value_mean: 0.000070, advantage: -0.005011
Episode: 96.000000, score: -0.097904


actor: -0.113831, actor_grad: 0.000798, policy mean: 1.098880, policy: -0.003970, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000024, critic_grad: 0.007222, value: 0.000024, critic_norm: 0.000000, value_mean: -0.000372, advantage: -0.003613
Episode: 97.000000, score: 0.149293


actor: -0.101732, actor_grad: 0.001512, policy mean: 1.098362, policy: 0.008128, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000074, critic_grad: 0.014798, value: 0.000074, critic_norm: 0.000000, value_mean: 0.000347, advantage: 0.007400
Episode: 98.000000, score: 0.024589


actor: -0.115677, actor_grad: 0.002468, policy mean: 1.099227, policy: -0.005817, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000049, critic_grad: 0.010582, value: 0.000049, critic_norm: 0.000000, value_mean: 0.000182, advantage: -0.005290
Episode: 99.000000, score: 0.114627


actor: -0.113314, actor_grad: 0.001595, policy mean: 1.098539, policy: -0.003454, entropy: -0.109861, actor_norm: 0.000001, critic: 0.000038, critic_grad: 0.006274, value: 0.000038, critic_norm: 0.000000, value_mean: -0.000798, advantage: -0.003141
Episode: 100.000000, score: -0.127978



In [6]:
import pandas as pd
import seaborn as sns
%matplotlib inline

# Collect the per-worker score lists into one table (one column per worker).
tmp = pd.DataFrame(GD)
# Plot only the first column, limited to at most the first 500 rows.
first_column = tmp.iloc[:500, :1]
first_column.plot(figsize=(16, 6), legend=False)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ca10252e10>

In [7]:
# Plot every column of tmp (one line per column) on a single set of axes.
tmp.plot(figsize=(16,6))


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c9c6311a90>

In [ ]: