Based on the paper "Reinforcement Driven Information Acquisition in Non-Deterministic Environments" (Storck, Hochreiter and Schmidhuber, 1995).

Create the Agent that will explore the MDP!
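The agent keeps a table of counts $c_{ijk}$: how many times action $a_i$, taken in state $s_j$, led to state $s_k$. From these counts it estimates the transition probabilities and, at each step, is rewarded by how much the new observation changes that estimate (the information gain). Summarising the update rules implemented below (with $\alpha = 0.5$ and $\gamma = 0.45$, the values hard-coded in the class):

$$p_{ijk}(t) = \frac{c_{ijk}(t)}{\sum_l c_{ijl}(t)}, \qquad D(t) = \sum_k \big| p_{ijk}(t+1) - p_{ijk}(t) \big|$$

$$Q(a_i, s_j) \leftarrow (1-\alpha)\,Q(a_i, s_j) + \alpha\,\big(D(t) + \gamma \max_a Q(a, s_{t+1})\big)$$

The agent is therefore driven towards the transitions it knows least about, and at the end the accumulated counts are renormalised into an estimate of the true transition matrix.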


In [51]:
#==================================================================
# 
# Author: Luca Celotti, 
# Supervisor: Jean Rouat,
# PhD student at Université de Sherbrooke (Canada)
# Electrical Engineering Department
# funded by CHIST-ERA/FRQNT Québec for IGLU
# 
# work finished on 22.02.2017 in the context of a course on
# Reinforcement Learning
#
# based on the paper "Reinforcement driven information acquisition in
# non-deterministic environments" (1995 Storck, Hochreiter, Schmidhuber)
# published in Proc. ICANN'95 vol.2
#
#==================================================================

from __future__ import division   # must come before any other statement
from itertools import product
import numpy as np


class RDIA:
    """Reinforcement Driven Information Acquisition (Storck et al., 1995)."""

    def __init__(self, n_states, n_actions):
        self.n_s = n_states; self.n_a = n_actions
        self.Q = np.zeros((self.n_a, self.n_s))                        # Q values
        self.c_ijk = np.zeros((self.n_a, self.n_s, self.n_s))          # transition counts
        self.P_experimental = np.zeros((self.n_a, self.n_s, self.n_s)) # estimated P

        # start from a random state
        self.St = np.random.randint(0, self.n_s)
        self.Stplus = np.random.randint(0, self.n_s)
 
    def prob_ijk(self, at, c_ijk):
        # estimated transition probabilities p_ijk = c_ijk / c_ij for the
        # current state and the chosen action (all zeros if never visited)
        c_ij = np.sum(c_ijk[at, self.St, :])
        if c_ij == 0:
            return np.zeros(self.n_s)
        return c_ijk[at, self.St, :] / c_ij

    def information_gain_D(self, at, c_ijk):
        # information gain D(t): how much the estimated transition
        # probabilities change once the new transition is counted
        p_ijk_t = self.prob_ijk(at, c_ijk)

        c_trial = c_ijk.copy()              # do not modify the real counts here
        c_trial[at, self.St, self.Stplus] += 1
        p_ijk_tplus = self.prob_ijk(at, c_trial)
        return np.sum(np.abs(p_ijk_tplus - p_ijk_t))

    def update_Q(self, at, c_ijk):
        # Q-learning update with the information gain D as intrinsic reward:
        # Q(a,s) <- (1-alpha)*Q(a,s) + alpha*(D + gamma*max_a' Q(a',s'))
        alpha = .5
        gamma = .45
        D = self.information_gain_D(at, c_ijk)

        maxQ = max(self.Q[:, self.Stplus])
        self.Q[at, self.St] = (1-alpha)*self.Q[at, self.St] + alpha*(D + gamma*maxQ)
    
    def reconstruct_P(self):
        # reconstruct the final experimental transition matrix from the counts
        for i, j in product(np.arange(self.n_a), np.arange(self.n_s)):
            c_ij = np.sum(self.c_ijk[i, j, :])
            if c_ij == 0:
                self.P_experimental[i, j, :] = 0
            else:
                self.P_experimental[i, j, :] = self.c_ijk[i, j, :] / c_ij
                    
    
    def learner_life(self, transition_M, terminal_time=1000, epsilon=.5):

        for t in np.arange(terminal_time):

            # 1. pick a_t with an epsilon-greedy exploration policy
            if np.random.rand() < epsilon:
                at = np.random.randint(0, self.n_a)
            else:
                at = np.argmax(self.Q[:, self.St])

            # 2. execute a_t and sample the next state S(t+1)
            self.Stplus = np.random.choice(self.n_s, 1,
                                           p=transition_M[at, self.St, :])[0]

            # 3. update the Q value, then the counts, then move to S(t+1)
            self.update_Q(at, self.c_ijk)
            self.c_ijk[at, self.St, self.Stplus] += 1
            self.St = self.Stplus

        self.reconstruct_P()

Create the MDP that the Agent will explore!


In [58]:
# generate an MDP
import mdptoolbox, mdptoolbox.example

n_states = 4; n_actions = 3
P, R = mdptoolbox.example.rand(n_states, n_actions)
fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 4)
fh.run()

print '________________________value function'
print fh.V
print
print '________________________optimal policy'
print fh.policy
print
print '________________________transition matrix'
print P
print
print '________________________reward matrix shape'
print R.shape


________________________value function
[[ 1.5408514   1.12557695  0.71262179  0.33257408  0.        ]
 [ 1.35104173  0.93635475  0.49994657  0.15174813  0.        ]
 [ 2.06243583  1.62524022  1.13946731  0.59971964  0.        ]
 [ 2.22742992  1.7902343   1.3044614   0.76471372  0.        ]]

________________________optimal policy
[[2 2 2 2]
 [1 1 1 2]
 [0 0 0 0]
 [2 2 2 2]]

________________________transition matrix
[[[ 0.          1.          0.          0.        ]
  [ 0.          1.          0.          0.        ]
  [ 0.          0.          1.          0.        ]
  [ 0.22316471  0.40515529  0.11081943  0.26086057]]

 [[ 0.          0.38463411  0.61536589  0.        ]
  [ 0.34345463  0.          0.65654537  0.        ]
  [ 0.65828347  0.34171653  0.          0.        ]
  [ 0.          0.          1.          0.        ]]

 [[ 0.11072487  0.33007622  0.55919891  0.        ]
  [ 0.          1.          0.          0.        ]
  [ 0.46263903  0.          0.53736097  0.        ]
  [ 0.          0.          1.          0.        ]]]

________________________reward matrix shape
(3, 4, 4)
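
A quick sanity check (not in the original notebook, just a suggestion): mdptoolbox.example.rand returns P with shape (n_actions, n_states, n_states), so every row P[a, s, :] should be a probability distribution over next states.

In [ ]:
# sanity check on the generated MDP: shape and row-stochasticity of P
print P.shape                         # expected: (n_actions, n_states, n_states)
print np.allclose(P.sum(axis=2), 1.)  # every row should sum to 1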

Run the Agent! Start the exploration!


In [56]:
MyAgent = RDIA(n_states = n_states, n_actions = n_actions)
MyAgent.learner_life(transition_M=P)


# check whether the algorithm was successful
print '_______original______________'
print
print P
print '_____________________________'
print
print
print '_______reconstruction________'
print
print MyAgent.P_experimental
print '_____________________________'


_______original______________

[[[ 0.          1.          0.          0.        ]
  [ 0.          0.47334267  0.49178996  0.03486738]
  [ 0.          0.69933796  0.30066204  0.        ]
  [ 0.35021047  0.03046641  0.38850726  0.23081586]]

 [[ 0.31455356  0.29418361  0.05678908  0.33447376]
  [ 0.          0.34927368  0.17393582  0.4767905 ]
  [ 0.36538103  0.02244378  0.45029022  0.16188496]
  [ 0.          0.          0.          1.        ]]

 [[ 0.12296407  0.31172657  0.56530935  0.        ]
  [ 0.          0.          0.45556413  0.54443587]
  [ 0.          0.          1.          0.        ]
  [ 0.30654109  0.29942345  0.19841147  0.19562399]]]
_____________________________


_______reconstruction________

[[[ 0.          1.          0.          0.        ]
  [ 0.          0.53608247  0.42268041  0.04123711]
  [ 0.          0.76699029  0.23300971  0.        ]
  [ 0.45121951  0.01219512  0.36585366  0.17073171]]

 [[ 0.26582278  0.3164557   0.07594937  0.34177215]
  [ 0.          0.37037037  0.13888889  0.49074074]
  [ 0.40677966  0.01129944  0.43502825  0.14689266]
  [ 0.          0.          0.          1.        ]]

 [[ 0.07843137  0.25490196  0.66666667  0.        ]
  [ 0.          0.          0.52857143  0.47142857]
  [ 0.          0.          1.          0.        ]
  [ 0.27956989  0.35483871  0.16129032  0.20430108]]]
_____________________________
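
To put a number on how good the reconstruction is, a possible follow-up (a sketch, not part of the original notebook) is to compare the two tensors element-wise:

In [ ]:
# absolute error between the true and the reconstructed transition matrices
abs_err = np.abs(P - MyAgent.P_experimental)
print 'mean absolute error:', np.mean(abs_err)
print 'max absolute error: ', np.max(abs_err)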