Federated learning - Train a reinforcement learning agent in a CartPole environment

This tutorial demonstrates how to train a reinforcement learning agent using federated learning in a CartPole environment. Before running this program you need to install OpenAI Gym.

To train our agent we will use a policy built on a simple neural network that maps the CartPole environment's state space to an action space. This policy is trained using federated learning with the help of the PySyft library. The program simulates the policy being trained on a remote machine (represented by the remote worker Bob).

Author: Amit Rastogi Github: @amit-rastogi Twitter: @amitrastogi

Import Dependencies

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.distributions import Categorical
import gym
import numpy as np
import syft as sy

Create CartPole environment

# Build the CartPole-v0 environment that the agent interacts with.
env = gym.make('CartPole-v0')

Hook Torch and create a virtual remote worker

# Hook torch with PySyft, then create the virtual worker "bob" that stands in
# for the remote machine in this simulation.
hook = sy.TorchHook(torch)
bob = sy.VirtualWorker(hook, id="bob")

Implement our neural network policy

class Policy(nn.Module):
    """Small feed-forward policy: 4 state features in, 2 action probabilities out.

    Besides the two linear layers, the module carries two per-episode buffers
    that the training code fills and later clears:

    * ``episode_log_probs``   - log-probabilities of the actions taken.
    * ``episode_raw_rewards`` - undiscounted rewards received.
    """

    def __init__(self):
        super().__init__()
        # Per-episode bookkeeping; both start empty.
        self.episode_log_probs = []
        self.episode_raw_rewards = []

        # Layers are created in input -> output order.
        self.input = nn.Linear(4, 4)
        self.output = nn.Linear(4, 2)

    def forward(self, x):
        """Return action probabilities for a batch of states.

        ``x`` must be 2-D (batch, 4) because the softmax is taken over dim 1.
        """
        hidden = F.relu(self.input(x))
        logits = self.output(hidden)
        return F.softmax(logits, dim=1)

# Instantiate the policy network and a plain SGD optimizer over its parameters.
policy = Policy()
optimizer = optim.SGD(params=policy.parameters(), lr=0.03)
# Discount rate applied to future rewards when computing each action's score.
discount_rate = 0.95

def select_action(state):
    """Sample an action for ``state`` and record its log-probability.

    The state is sent to the remote worker ``bob``, the policy is evaluated
    there, and the resulting probabilities are fetched back so the action can
    be sampled locally.

    Args:
        state: Environment observation as a NumPy array (it is converted with
            ``torch.from_numpy``).

    Returns:
        The sampled action as a plain Python number (``action.item()``).
    """
    # Add a leading batch dimension: the policy applies softmax over dim=1.
    state = torch.from_numpy(state).float().unsqueeze(0)
    # send the environment state to bob
    state = state.send(bob)
    probs = policy(state)
    # we need to get the estimated probabilities back to sample the action
    # since Categorical does not yet support remote tensor operations
    probs = probs.get()
    m = Categorical(probs)
    action = m.sample()
    # Remember the log-probability of the chosen action; update_policy()
    # builds the policy-gradient loss from these entries. Without this the
    # buffer stays empty and there is nothing to train on.
    policy.episode_log_probs.append(m.log_prob(action))
    # get the state back as we would be sending the new state to bob
    state.get()
    return action.item()

def discount_and_normailze_rewards():
    """Return the episode's discounted, normalized rewards as a 1-D tensor.

    Reads ``policy.episode_raw_rewards`` and the module-level
    ``discount_rate``. Each entry is the reward at that step plus the
    discounted sum of all later rewards; the result is then shifted to zero
    mean and scaled to unit standard deviation.

    If the standard deviation is undefined (fewer than two steps) or zero,
    the centered values are returned unscaled instead of NaN/inf, so a
    degenerate episode cannot poison the loss.
    """
    discounted_rewards = []
    cumulative_rewards = 0
    # Accumulate from the last step backwards; appending and reversing once
    # avoids the quadratic cost of repeated insert(0, ...).
    for reward in reversed(policy.episode_raw_rewards):
        cumulative_rewards = reward + discount_rate * cumulative_rewards
        discounted_rewards.append(cumulative_rewards)
    discounted_rewards.reverse()

    discounted_rewards = torch.tensor(discounted_rewards)
    centered = discounted_rewards - discounted_rewards.mean()
    # The default (unbiased) std of fewer than two values is NaN.
    if discounted_rewards.numel() < 2:
        return centered
    spread = discounted_rewards.std()
    if spread == 0:
        return centered
    return centered / spread

def update_policy():
    """Apply one policy-gradient step using the episode that just finished.

    Builds the loss ``sum(-log_prob * action_score)`` from
    ``policy.episode_log_probs`` and the discounted, normalized rewards,
    backpropagates it, steps the module-level ``optimizer``, and finally
    empties both per-episode buffers on ``policy``.
    """
    discounted_rewards = discount_and_normailze_rewards()
    policy_loss = [
        -log_prob * action_score
        for log_prob, action_score in zip(policy.episode_log_probs, discounted_rewards)
    ]
    # Clear stale gradients, backpropagate the episode loss and update the
    # weights; without these three calls the loss is computed and thrown away.
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    # Reset the per-episode buffers in place for the next episode.
    del policy.episode_log_probs[:]
    del policy.episode_raw_rewards[:]

Train our Policy

# Total (undiscounted) reward collected in each episode.
total_rewards = []
# send the policy to bob for training
policy.send(bob)
for episode in range(500):
    state = env.reset()
    episode_rewards = 0
    for step in range(1000):
        action = select_action(state)
        state, reward, done, _ = env.step(action)
        #env.render()  #uncomment to render the current environment
        # Raw rewards are consumed by discount_and_normailze_rewards().
        policy.episode_raw_rewards.append(reward)
        episode_rewards += reward
        if done:
            break
    #to keep track of rewards earned in each episode
    total_rewards.append(episode_rewards)
    # One gradient step per finished episode; this also clears the buffers.
    update_policy()

print('Average reward: {:.2f}\tMax reward: {:.2f}'.format(np.mean(total_rewards), np.max(total_rewards)))

Average reward: 19.17	Max reward: 83.00

Well Done!

Our agent managed to keep the pole upright for a maximum of 83 consecutive steps using a very simple neural network policy trained with federated learning via PySyft.


In the select_action function we have to bring the estimated probabilities back to our local worker to sample the action, since Categorical does not support remote tensor operations as of now.

