In [1]:
import pprint
import random

In [2]:
#Initialisation

# Q table: one row per state, one column per action; all zeros before training.
q = [[0] * 6 for _ in range(6)]

# Reward graph: r[s][a] is the immediate reward for moving from state s to
# state a; -1 marks transitions that do not exist.  Every edge into state 5
# pays 100, making it the attractor the agent learns to reach.
r = [[-1,-1,-1,-1, 0, -1],
     [-1,-1,-1, 0,-1,100],
     [-1,-1,-1, 0,-1, -1],
     [-1, 0, 0,-1, 0, -1],
     [ 0,-1,-1, 0,-1,100],
     [-1, 0,-1,-1, 0,100]]

# Terminal state, and the agent's position (assigned during training/testing).
goal_state = 5
current_state = None

In [3]:
#Util functions

def max_q(state):
	"""Exploration policy used during training.

	Collects every valid action from `state` (valid means the reward
	r[state][action] is not -1) and picks one uniformly at random.
	Returns (q_value, action) for the chosen action.

	NOTE: despite the name, this version does not maximise Q — it
	explores randomly; the greedy version is redefined for testing.
	Raises IndexError if the state has no valid action (never happens
	with the r matrix above: every state has at least one exit).
	"""
	possibilities = [a for a in range(len(q[state])) if r[state][a] != -1]
	action = random.choice(possibilities)
	return q[state][action], action

def inc_q(state, action, gamma=0.8):
	"""Q-learning update for the pair (state, action).

	q[state][action] = r[state][action] + gamma * max_a q[next][a],
	where the next state equals `action` in this graph encoding
	(taking action a moves the agent to state a).

	gamma: discount factor; defaults to 0.8, the previously
	hard-coded value, so existing callers are unaffected.
	"""
	q[state][action] = r[state][action] + gamma * max(q[action])

def do_action(max_action):
	"""Move the agent: the chosen action index becomes the new current state."""
	global current_state
	current_state = max_action

In [4]:
#Training

def train(epochs=100):
    """Run `epochs` Q-learning episodes over the global q/r tables.

    Each episode starts in a random state and repeatedly (1) picks a
    random valid action via max_q, (2) moves there, (3) applies the
    Q update, until the goal state is reached.  Prints the learned
    Q matrix when done.

    epochs: number of episodes; defaults to 100, the previously
    hard-coded value, so the bare call below behaves as before.
    """
    global current_state

    for epoch in range(epochs):
        current_state = random.choice([0, 1, 2, 3, 4, 5])

        while current_state != goal_state:
            s = current_state
            # 1. Choose an action (random valid action during training)
            max_val, max_action = max_q(s)
            # 2. Do that action: the action index is the next state
            do_action(max_action)
            # 3. Update Q for the state we just left
            inc_q(s, max_action)

    # print() with a single argument behaves identically under
    # Python 2 and 3; the old `print 'Q matrix:'` statement is a
    # syntax error on Python 3.
    print('Q matrix:')
    pprint.pprint(q)

train()


Q matrix:
[[0, 0, 0, 0, 80.0, 0],
 [0, 0, 0, 64.0, 0, 100.0],
 [0, 0, 0, 64.0, 0, 0],
 [0, 80.0, 51.2, 0, 80.0, 0],
 [64.0, 0, 0, 64.0, 0, 100.0],
 [0, 0, 0, 0, 0, 0]]

In [5]:
#Testing

def max_q(state):
    """Greedy action selection used at test time.

    Scans the valid actions from `state` (those with r[state][action]
    != -1) and returns (value, action) for the one with the highest
    Q value.  Ties keep the lowest action index (strict comparison);
    returns (None, None) if the state has no valid action.
    """
    best_val = None
    best_action = None
    for action, value in enumerate(q[state]):
        if r[state][action] == -1:
            continue  # no edge from `state` to `action`
        if best_action is None or value > best_val:
            best_val = value
            best_action = action
    return best_val, best_action

current_state = 2  # start state for the greedy walk

# Follow the greedy policy (max_q as redefined above) from the start
# state to the goal, printing each visited state.  The trailing commas
# keep the Python 2 prints on one line.
# NOTE(review): loops forever if the goal is unreachable from the start
# under the learned Q values — fine for this fully-connected toy graph.
print 'Best Path: ',current_state,'-->',
while current_state != goal_state:
        max_val, max_action = max_q(current_state)
        do_action(max_action)
        print current_state,'-->',


Best Path:  2 --> 3 --> 1 --> 5 -->