估值修正-checkpoint



In [1]:
import numpy as np
import pandas as pd
from emulator.main import Account

In [2]:
# Create the Account object (imported from emulator.main) that the later
# cells drive through A.step(...).
A = Account()

In [3]:
# Sample 32 random integer actions; `high` is exclusive, so values are 0, 1 or 2.
actions = np.random.randint(low=0, high=3, size=32)

In [4]:
# Feed each sampled action to the account, in order, and keep only the first
# of the three values returned by A.step (the reward).
# NOTE(review): the other two values are discarded here; presumably they are
# the next state and a done flag -- confirm against emulator.main.
# NOTE: `rewrads_list` is a misspelling of "rewards"; the name is kept because
# the following cells reference it.
rewrads_list = [
    reward for reward, _second, _third in map(A.step, actions)
]

In [5]:
# Discount factor applied to the return of the following step.
GAMMA = 0.9

# Fill the returns from the last step backwards so each entry can reuse the
# one after it:  R_t = r_t + GAMMA * R_{t+1}, with the return past the final
# step taken as 0.
R = 0
R_list = [None] * len(rewrads_list)
for t in range(len(rewrads_list) - 1, -1, -1):
    R = rewrads_list[t] + GAMMA * R
    R_list[t] = R

In [6]:
# Put the per-step rewards next to their discounted returns, one row per step.
tmp = pd.DataFrame(
    {
        'reward': rewrads_list,
        'discount': R_list,
    }
)

In [7]:
# Bare expression: the notebook renders the reward / discount table as the cell output.
tmp


Out[7]:
reward discount
0 0.006588 -0.011630
1 -0.009998 -0.020242
2 -0.000981 -0.011382
3 -0.003235 -0.011556
4 -0.001571 -0.009246
5 -0.003855 -0.008527
6 0.002484 -0.005191
7 0.000000 -0.008528
8 0.004114 -0.009475
9 0.000672 -0.015099
10 0.001422 -0.017523
11 -0.006369 -0.021049
12 0.000362 -0.016311
13 -0.009769 -0.018525
14 0.000150 -0.009730
15 -0.003153 -0.010978
16 -0.001789 -0.008694
17 -0.003495 -0.007672
18 -0.008901 -0.004641
19 0.009744 0.004733
20 0.003367 -0.005568
21 0.001658 -0.009927
22 0.000683 -0.012872
23 -0.003734 -0.015061
24 0.001064 -0.012586
25 -0.001857 -0.015167
26 -0.002752 -0.014789
27 -0.004749 -0.013374
28 -0.006626 -0.009584
29 -0.000160 -0.003286
30 0.000511 -0.003474
31 -0.004427 -0.004427

In [ ]: