Based on the paper "Reinforcement driven information acquisition in non-deterministic environments" (Storck, Hochreiter, Schmidhuber, 1995).
In [51]:
#==================================================================
#
# Author: Luca Celotti,
# Supervisor: Jean Rouat,
# PhD student at Université de Sherbrooke (Canada)
# Electrical Engineering Department
# funded by CHIST-ERA/FRQNT Québec for IGLU
#
# work finished on 22.02.2017 in the context of a course on
# Reinforcement Learning
#
# based on the paper "Reinforcement driven information acquisition in
# non-deterministic environments" (1995 Storck, Hochreiter, Schmidhuber)
# published in Proc. ICANN'95 vol.2
#
#==================================================================
from __future__ import division, print_function
from itertools import product
import numpy as np


class RDIA:
    def __init__(self, n_states, n_actions):
        self.n_s = n_states
        self.n_a = n_actions
        self.Q = np.zeros((self.n_a, self.n_s))
        # c_ijk counts observed transitions: action i taken in state j led to state k
        self.c_ijk = np.zeros((self.n_a, self.n_s, self.n_s))
        self.P_experimental = np.zeros((self.n_a, self.n_s, self.n_s))
        # start from a random state
        self.St = np.random.randint(0, self.n_s)
        self.Stplus = np.random.randint(0, self.n_s)

    def prob_ijk(self, at, c_ijk):
        # empirical transition probabilities p_ijk = c_ijk / c_ij for the current state
        c_ij = np.sum(c_ijk[at, self.St, :])
        if c_ij == 0:
            return np.zeros(self.n_s)
        return c_ijk[at, self.St, :] / c_ij

    def information_gain_D(self, at, c_ijk):
        # D(t) = sum_k |p_ijk(t+1) - p_ijk(t)|: how much the empirical model
        # changes after observing the transition (St, at) -> Stplus
        p_before = self.prob_ijk(at, c_ijk)
        c_after = c_ijk.copy()
        c_after[at, self.St, self.Stplus] += 1
        p_after = self.prob_ijk(at, c_after)
        return np.sum(np.abs(p_after - p_before))

    def update_Q(self, at, c_ijk):
        # Q-learning update with the information gain D as the intrinsic reward
        alpha = .5
        gamma = .45
        D = self.information_gain_D(at, c_ijk)
        maxQ = np.max(self.Q[:, self.Stplus])
        self.Q[at, self.St] = (1 - alpha) * self.Q[at, self.St] + alpha * (D + gamma * maxQ)

    def reconstruct_P(self):
        # reconstruct the final experimental transition matrix from the counters
        for i, j in product(np.arange(self.n_a), np.arange(self.n_s)):
            c_ij = np.sum(self.c_ijk[i, j, :])
            if c_ij == 0:
                self.P_experimental[i, j, :] = 0
            else:
                self.P_experimental[i, j, :] = self.c_ijk[i, j, :] / c_ij

    def learner_life(self, transition_M, terminal_time=1000, epsilon=.5):
        count = 0  # number of exploratory (random) actions taken
        for t in np.arange(terminal_time):
            # 1. pick at with an epsilon-greedy policy
            if np.random.rand() < epsilon:
                count += 1
                at = np.random.randint(0, self.n_a)
            else:
                at = np.argmax(self.Q[:, self.St])
            # 2. execute at and sample the successor state S(t+1)
            self.Stplus = np.random.choice(self.n_s, p=transition_M[at, self.St, :])
            # 3. update the Q value, then record the observed transition
            self.update_Q(at, self.c_ijk)
            self.c_ijk[at, self.St, self.Stplus] += 1
            self.St = self.Stplus
        self.reconstruct_P()
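A quick sanity check (my addition, not part of the original notebook; the names agent, counts and visit are only illustrative): the information gain D should shrink as the counters for a given (state, action) pair accumulate, since each new observation perturbs the empirical distribution less and less.

agent = RDIA(n_states=4, n_actions=3)
counts = np.zeros((3, 4, 4))
agent.St = 0
for visit in range(6):
    agent.Stplus = visit % 2 + 1            # alternate successor states 1 and 2
    print(agent.information_gain_D(0, counts))
    counts[0, agent.St, agent.Stplus] += 1  # record the transition just "observed"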
In [58]:
# generate a random MDP to explore
import mdptoolbox, mdptoolbox.example

n_states = 4; n_actions = 3
P, R = mdptoolbox.example.rand(n_states, n_actions)

# solve it with a standard finite-horizon solver, for reference
fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 4)
fh.run()

print('________________________value function')
print(fh.V)
print()
print('________________________optimal policy')
print(fh.policy)
print()
print('________________________transition matrix')
print(P)
print()
print('________________________reward matrix shape')
print(R.shape)
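A small check on the generated MDP (my addition, not in the original notebook): mdptoolbox returns the transition tensor with shape (n_actions, n_states, n_states), so every slice P[a, s, :] should be a probability distribution summing to 1.

print(P.shape)                        # expected: (3, 4, 4)
print(np.allclose(P.sum(axis=2), 1))  # expected: True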
In [56]:
MyAgent = RDIA(n_states=n_states, n_actions=n_actions)
MyAgent.learner_life(transition_M=P)

# compare the true transition matrix with the learned reconstruction
# to see whether the algorithm was successful
print('_______original______________')
print()
print(P)
print('_____________________________')
print()
print()
print('_______reconstruction________')
print()
print(MyAgent.P_experimental)
print('_____________________________')
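To put a number on how close the reconstruction is (my own rough check, not part of the original notebook), one can print the mean absolute difference between the true and the estimated transition probabilities; with the default 1000 steps it is typically small, and it shrinks further as terminal_time grows.

print('mean absolute reconstruction error:')
print(np.mean(np.abs(P - MyAgent.P_experimental)))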