In [1]:
import pprint
import random
In [2]:
# Initialisation
# Q matrix: learned state-action values (6 states x 6 actions), all zero to start
q = [[0, 0, 0, 0, 0, 0],
     [0, 0, 0, 0, 0, 0],
     [0, 0, 0, 0, 0, 0],
     [0, 0, 0, 0, 0, 0],
     [0, 0, 0, 0, 0, 0],
     [0, 0, 0, 0, 0, 0]]
# R matrix: immediate rewards; -1 marks an impossible transition,
# 100 marks a move into the goal state
r = [[-1, -1, -1, -1,  0,  -1],
     [-1, -1, -1,  0, -1, 100],
     [-1, -1, -1,  0, -1,  -1],
     [-1,  0,  0, -1,  0,  -1],
     [ 0, -1, -1,  0, -1, 100],
     [-1,  0, -1, -1,  0, 100]]
goal_state = 5
current_state = None
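Each row of r lists the reward for moving from that state to every other state, so the valid moves out of a state are exactly the columns that are not -1. As a quick sanity check, a small helper can list them (available_actions is a hypothetical name, not part of the original notebook):

def available_actions(state):
    # Indices of all columns in r[state] that are legal moves
    return [a for a, reward in enumerate(r[state]) if reward != -1]

print(available_actions(0))  # [4]: from state 0 the only move is to state 4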
In [3]:
# Util functions
def max_q(state):
    # Training version: explore by choosing a random valid action.
    # (A greedy version that maximises over Q is defined later, in the testing cell.)
    possibilities = []
    for index, val in enumerate(q[state]):
        if r[state][index] != -1:
            possibilities.append(index)
    ind = random.choice(possibilities)
    val = q[state][ind]
    return val, ind

def inc_q(state, action):
    # Q-learning update: Q(s, a) = R(s, a) + gamma * max(Q(s', .))
    gamma = 0.8
    q[state][action] = r[state][action] + gamma * max(q[action])

def do_action(max_action):
    # Taking an action moves the agent into the state with that index
    global current_state
    current_state = max_action
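During training, max_q above explores purely at random and never uses the Q values it has learned; the greedy rule only appears later, in the testing cell. A common compromise between the two is epsilon-greedy selection. The sketch below is an illustration, not part of the original notebook: select_action and epsilon are hypothetical names, and it assumes the same global q and r matrices.

def select_action(state, epsilon=0.2):
    # With probability epsilon explore a random valid action,
    # otherwise exploit the valid action with the highest Q value
    valid = [a for a, reward in enumerate(r[state]) if reward != -1]
    if random.random() < epsilon:
        ind = random.choice(valid)
    else:
        ind = max(valid, key=lambda a: q[state][a])
    return q[state][ind], ind

Decaying epsilon towards 0 over the epochs would shift training gradually from exploration to exploitation.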
In [4]:
# Training
def train():
    global current_state
    epochs = 100
    for epoch in range(epochs):
        # Start each episode in a random state and run until the goal is reached
        current_state = random.choice([0, 1, 2, 3, 4, 5])
        while current_state != goal_state:
            s = current_state
            # 1. Choose an action (random valid action during training)
            max_val, max_action = max_q(s)
            # 2. Take that action
            do_action(max_action)
            # 3. Update Q
            inc_q(s, max_action)
    print('Q matrix:')
    pprint.pprint(q)

train()
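The learned Q values inherit the scale of the reward matrix. Presentations of this example often rescale the matrix relative to its largest entry before printing; this is purely cosmetic and does not change the policy. A minimal sketch, assuming train() has already run so the matrix is non-zero:

# Rescale Q to 0-100 for readability (does not affect which action is best)
max_entry = max(max(row) for row in q)
normalized = [[round(val / max_entry * 100) for val in row] for row in q]
pprint.pprint(normalized)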
In [5]:
# Testing
def max_q(state):
    # Greedy version: pick the valid action with the highest Q value
    max_val = None
    max_action = None
    for index, val in enumerate(q[state]):
        if r[state][index] != -1:
            if max_action is None or max_val < val:
                max_val = val
                max_action = index
    return max_val, max_action

current_state = 2
print('Best path:', current_state, end='')
while current_state != goal_state:
    max_val, max_action = max_q(current_state)
    do_action(max_action)
    print(' -->', current_state, end='')
print()
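The same greedy walk works from any starting room, not just state 2. As a usage example, here is a small wrapper that collects the route as a list instead of printing as it goes; best_path is a hypothetical name, and it assumes training has converged so the greedy walk always reaches the goal.

def best_path(start):
    # Follow the greedy policy from `start` until the goal state is reached
    path = [start]
    state = start
    while state != goal_state:
        _, state = max_q(state)  # greedy max_q from the testing cell
        path.append(state)
    return path

for s in range(6):
    print(s, ':', ' --> '.join(str(x) for x in best_path(s)))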