At each timestep $t$:
the state is Markov, i.e.: $$ P(x_{t+1} \mid x_t) = P(x_{t+1} \mid x_1, x_2, \dots, x_t) $$
This means the state captures everything you need to know about the system.
Therefore we can model the system as a Markov decision process (MDP).
We don't want to choose actions deterministically, since we want to explore different ways of playing.
The policy must satisfy $P(a_L \mid x) + P(a_R \mid x) = 1$ for all observations, i.e. the agent must choose exactly one of $a_L$ or $a_R$.
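One way to satisfy this constraint (just an illustration here; the scores $s_L$ and $s_R$ are placeholder names, but it is the same softmax construction the policy network below uses) is to squash two action scores through a softmax:
$$ P(a_L \mid x) = \frac{e^{s_L}}{e^{s_L} + e^{s_R}}, \qquad P(a_R \mid x) = \frac{e^{s_R}}{e^{s_L} + e^{s_R}} $$
Both probabilities are positive and sum to 1 by construction.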
Definitions:
Draw $\theta$ from a Gaussian distribution with mean $\mathbf{\mu} = \{ \mu_1, \mu_2, \dots, \mu_K\}$ and axis-aligned (diagonal) variance $\mathbf{\sigma} = \{\sigma_1, \sigma_2, \dots, \sigma_K\}$.
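As a minimal sketch of this draw in PyTorch (the value of $K$ and the choices of $\mathbf{\mu}$ and $\mathbf{\sigma}$ below are made up, and each $\sigma_k$ is treated as a per-parameter standard deviation):
import torch

K = 4                          # number of policy parameters (made-up value)
mu = torch.zeros(K)            # per-parameter means
sigma = torch.full((K,), 0.1)  # per-parameter spread (axis-aligned)

# theta_k ~ N(mu_k, sigma_k^2), drawn independently for each dimension
theta = mu + sigma * torch.randn(K)
print(theta)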
We continue to use a parametrised policy:
$$ \pi(a \mid x; \theta) $$How to compute $\nabla_\theta \E_\tau[R_\tau]$? We have:
$$ \E_\tau[R_\tau \mid \pi; \theta] = \int_\tau R_\tau P(\tau \mid \pi;\theta)\,d\tau $$Take the derivative with respect to $\theta$, moving the gradient inside the integral:
$$ \nabla_\theta \E_\tau[R_\tau \mid \pi; \theta] = \nabla_\theta \int_\tau P(\tau \mid \pi; \theta)R_\tau \, d\tau \\ = \int_\tau \nabla_\theta P(\tau \mid \pi; \theta)R_\tau \, d\tau $$Let's compute the gradient for a single concrete trajectory $\tau = (x_0, a_0, r_1, x_1, \dots, x_{T-1}, a_{T-1}, r_T, x_T)$. Its probability factorises as:
$$ P(\tau \mid \pi; \theta) = p(x_0) \, \prod_{t=0}^{T-1} p(a_t \mid x_t; \theta)\, p(r_{t+1} \mid a_t, x_t)\, p(x_{t+1} \mid a_t, x_t) $$
The only terms that depend on $\theta$ are the action terms, so we have:
$$ \nabla_\theta P(\tau \mid \pi; \theta) = p(x_0) \, \prod_{t=0}^{T-1} \left( p(r_{t+1} \mid a_t, x_t)\, p(x_{t+1} \mid a_t, x_t) \right) \, \nabla_\theta \prod_{t=0}^{T-1} p(a_t \mid x_t; \theta) $$Similarly, the reward for the trajectory is:
$$ R_\tau = \sum_{t=1}^T r_t $$Using the log-derivative trick, $\nabla_\theta P(\tau \mid \pi; \theta) = P(\tau \mid \pi; \theta)\, \nabla_\theta \log P(\tau \mid \pi; \theta)$, define the gradient estimate for this trajectory as:
$$ \hat{g}(\tau) := R_\tau \nabla_\theta \log P(\tau \mid \pi; \theta ) $$Substituting back into the integral above shows that this estimate is unbiased:
$$ \E_\tau[\hat{g}(\tau) \mid \pi; \theta] = \int_\tau P(\tau \mid \pi; \theta)\, R_\tau\, \nabla_\theta \log P(\tau \mid \pi; \theta)\, d\tau = \nabla_\theta \E_\tau[ R_\tau \mid \pi; \theta ] $$We are using a neural net to represent the policy, i.e. to represent $P(a \mid x; \theta)$.
The gradient $\nabla_\theta \log P(a \mid x; \theta)$ can therefore be obtained by back-propagating the log-probability of the sampled action through the network. Overall, we want to do a gradient ascent update:
$$ \theta \leftarrow \theta + \alpha \nabla_\theta \E_\tau[R_\tau] $$So, we want to draw a sample of:
$$ \nabla_\theta \E_\tau[R_\tau] $$This sample is $\hat{g}(\tau)$, i.e.:
$$ \hat{g}(\tau) = R_\tau \nabla_\theta \log P(\tau \mid \pi; \theta) \\ = \left( \sum_{t'=1}^T r_{t'} \right) \sum_{t=0}^{T-1} \nabla_\theta \log P(a_t \mid x_t; \theta) $$
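As a rough sketch of how this estimator maps onto code (a one-step "trajectory" with a made-up observation, reward, and learning rate; it uses torch.distributions.Categorical and a hand-written update rather than the older .reinforce() call in the cells below):
import torch
from torch import nn
import torch.nn.functional as F
from torch.distributions import Categorical

torch.manual_seed(123)
policy_net = nn.Linear(2, 2)             # tiny policy: 2 features -> 2 action scores
alpha = 0.1                              # made-up learning rate

x = torch.FloatTensor([[1.0, 0.0]])      # made-up observation
probs = F.softmax(policy_net(x), dim=1)  # pi(a | x; theta)
dist = Categorical(probs)
a = dist.sample()                        # sample an action from the policy
log_prob = dist.log_prob(a)              # log pi(a | x; theta)

R = 1.0                                  # made-up return R_tau for this one-step trajectory
loss = -(R * log_prob).sum()             # surrogate loss whose gradient is -g_hat(tau)
policy_net.zero_grad()
loss.backward()
with torch.no_grad():
    for param in policy_net.parameters():
        param -= alpha * param.grad      # theta <- theta + alpha * g_hat(tau)
Descending the surrogate loss $-R_\tau \log \pi(a \mid x; \theta)$ is the same as ascending along $\hat{g}(\tau)$, which is exactly the update we wanted.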
In [73]:
import torch
from torch import autograd, nn, optim
import torch.nn.functional as F


class Policy(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.h1 = nn.Linear(input_size, output_size)

    def forward(self, x):
        x = self.h1(x)
        p = F.softmax(x)
        # sample an action index from the softmax distribution; with the old
        # (pre-0.4) stochastic-Variable API this sampling node supports .reinforce()
        a = torch.multinomial(p)
        return a


torch.manual_seed(123)
policy = Policy(2, 2)

# let's imagine that if the input is [1, 0], action should be 0
# if the input is [0, 1], action should be 1
# just something simple for now...
# we can do this supervised...
training = [
    {'input': [1, 0], 'target': 0},
    {'input': [0, 1], 'target': 1},
]

opt = optim.Adam(params=policy.parameters(), lr=0.1)
correct_since_last_print = 0
num_epochs = 100
print_every = num_epochs // 10
for epoch in range(num_epochs):
    for ex in training:
        policy.zero_grad()
        input = torch.FloatTensor(ex['input']).view(1, -1)
        target = ex['target']
        a = policy(autograd.Variable(input))
        reward = 0
        if a.data[0][0] == target:
            reward = 1
            correct_since_last_print += 1
        if epoch % print_every == 0:
            print('input', ex['input'], 'target', target, 'a', a.data[0][0], a.data[0][0] == target)
        # reward the sampled action, then backprop: this produces
        # reward * grad log p(a | x; theta), i.e. the REINFORCE gradient estimate
        a.reinforce(torch.FloatTensor([[reward]]))
        autograd.backward([a], [None])
        opt.step()
    if epoch % print_every == 0:
        print('acc', correct_since_last_print / print_every / 2)
        correct_since_last_print = 0
In [25]:
import torch
from torch import autograd

# gradient of (a + 1) with respect to a is 1
a = autograd.Variable(torch.FloatTensor([1.5]), requires_grad=True)
print('a', a)
b = a + 1
b.backward()
print('a.grad', a.grad)

# gradient of log(a) with respect to a is 1 / a
a = autograd.Variable(torch.FloatTensor([1.5]), requires_grad=True)
b = torch.log(a)
b.backward()
print('a.grad', a.grad)

# indexing before the log: only the selected element gets a gradient
a = autograd.Variable(torch.FloatTensor([1.5, 2.5]), requires_grad=True)
b = torch.log(a[0])
b.backward()
print('a.grad', a.grad)

a = autograd.Variable(torch.FloatTensor([1.5, 2.5]), requires_grad=True)
b = torch.log(a[1])
b.backward()
print('a.grad', a.grad)
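For reference, these gradients follow directly from the derivative of the logarithm, and only the indexed element receives any gradient:
$$ \frac{\partial}{\partial a}(a + 1) = 1, \qquad \frac{\partial}{\partial a} \log a = \frac{1}{a}, \quad \text{so } \frac{1}{1.5} \approx 0.667 \text{ and } \frac{1}{2.5} = 0.4 $$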