In [6]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import json

import matplotlib as mpl
# enlarge tick labels on all plots
label_size = 20
mpl.rcParams['xtick.labelsize'] = label_size
mpl.rcParams['ytick.labelsize'] = label_size

Plot results for all environments except reward corruption


In [ ]:
# map raw agent keys in the results files to (legend label, plot colour)
agent_labels = {'BayesAgent': (r'AI$\xi$', 'red'),
                'MC-AIXI': ('MC-AIXI', 'red'),
                'MC-AIMU': ('MC-AIMU', 'blue'),
                'MDL Agent': ('MDL', 'blue'),
                'MC-AIXI-Dirichlet': ('MC-AIXI-Dirichlet', 'blue'),
                'Knowledge-seeking agent': ('Kullback-Leibler', 'blue'),
                'KullbackLeiblerKSA': ('Kullback-Leibler', 'blue'),
                'ShannonKSA': ('Shannon', 'green'),
                'SquareKSA': ('Square', 'red'),
                'Shannon KSA': ('Shannon', 'orange'),
                'Square KSA': ('Square', 'red'),
                'ThompsonAgent': ('Thompson Sampling', 'blue'),
                'Thompson Sampling': ('Thompson Sampling', 'blue'),
                'QLearn': ('Q-Learning', 'black'),
                'Q-Learning': ('Q-Learning', 'black'),
                'KSA-Dirichlet': ('Kullback-Leibler', 'blue'),
                'Entropy-seeking agent': ('Shannon', 'orange'),
                'Square KSA-Dirichlet': ('Square', 'red')}

def plot_results(directory,
                 filename='results-1',
                 objective=None,
                 outfile=None,
                 show_optimal=False,
                 show_variance=True,
                 show_maxmin=False):
    
    # choose a sensible default objective and axis label
    if not objective:
        objective = 'explored' if 'ksa' in directory else 'rewards'
    if objective == 'rewards':
        y_axis = 'Average Reward'
    elif objective == 'explored':
        y_axis = 'Exploration (%)'
    else:
        y_axis = objective

    with open(directory + '/' + filename + '.json') as file:
        data = json.load(file)

    fig = plt.figure(figsize=(12,8),dpi=200)
    # iterate over agent configurations in the results file
    for k, d in data.items():
        cycles = d[0]['cycles']
        runs = len(d)

        A = np.zeros((cycles,runs))
        for j in range(runs):
            A[:,j] = np.array(d[j][objective][:cycles])
        mu = np.mean(A,1)
        sigma = np.std(A,1)
        a = np.max(np.vstack((mu-sigma,np.min(A,1))),0)
        b = np.min(np.vstack((mu+sigma,np.array(cycles*[100]))),0)

        # fall back to the raw key and a neutral colour for unrecognised agents
        lab, color = agent_labels.get(k, (k, 'gray'))
        alpha = 0.1
        
        if show_variance:
            plt.plot(a,color=color,alpha=alpha)
            plt.plot(b,color=color,alpha=alpha)
            plt.fill_between(np.arange(cycles),a,b,alpha=alpha,color=color)
        
        if show_maxmin:
            plt.plot(np.max(A,axis=1),color=color,linestyle='-.')
            plt.plot(np.min(A,axis=1),color=color,linestyle='-.')
        
        plt.plot(mu,label=lab,color=color,lw=3)
        

    if objective == 'rewards' and show_optimal:
        # NOTE: hardcoded for the optimal policy in one gridworld:
        # reward -1 for the first 11 cycles, then +75 per cycle thereafter
        xs = np.arange(cycles)
        ys = np.zeros(cycles)
        ys[:11] = -1.
        ys[11:] = 75.
        ys = np.cumsum(ys)
        ys[1:] /= xs[1:]  # cumulative reward -> average reward per cycle
        plt.plot(xs, ys, 'k--', lw=3, label='Optimal')
        
    plt.xlabel('Cycles',fontsize=30)
    plt.ylabel(y_axis,fontsize=30)
    plt.legend(fontsize=25,loc='lower right')
    plt.margins(0.01,0)
    #plt.ylim([-1,100])
       
    if outfile:
        plt.savefig(directory + '/' + outfile + '.png', bbox_inches='tight')
        plt.close()

plot_results('aixi-models','results-3',show_optimal=True)
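
The layout of a results file can be sketched as follows (inferred from the loop above; the directory name and numbers here are purely illustrative): a dict keyed by agent name, each value a list of runs, and each run a dict containing 'cycles' and a per-cycle array under the objective key ('rewards' or 'explored').


In [ ]:
# Hypothetical sanity check of the assumed results format (illustrative only)
import os
os.makedirs('demo', exist_ok=True)
demo = {'Q-Learning': [{'cycles': 100, 'rewards': list(np.linspace(0, 50, 100))},
                       {'cycles': 100, 'rewards': list(np.linspace(0, 60, 100))}]}
with open('demo/results-1.json', 'w') as f:
    json.dump(demo, f)
plot_results('demo', 'results-1', objective='rewards')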

Plot results for reward corruption


In [13]:
def plot_rc_results(directory,
                    filename='results',
                    objective='rewards',
                    outfile=None,
                    show_variance=True,
                    runs=5,
                    cycles=1000000,
                    color='red',
                    ls='solid',
                    label='Q-learning'):
    if objective == 'rewards':
        y_axis = 'Average Observed Reward'
    elif objective == 'corrupt_rewards':
        y_axis = 'Average Corrupt Reward'
    elif objective == 'true_rewards':
        y_axis = 'Average True Reward'
    else:
        y_axis = objective
    
    A = np.zeros((cycles,runs))
    for j in range(runs):
        # one results file per run: <filename>-1.json, <filename>-2.json, ...
        with open(directory + '/' + filename + '-' + str(j+1) + '.json') as file:
            data = json.load(file)
        A[:,j] = np.array(data['Reward Corruption'][0][objective])

    mu = np.mean(A,1)
    sigma = np.std(A,1)
    a = np.max(np.vstack((mu-sigma,np.min(A,1))),0)
    b = np.min(np.vstack((mu+sigma,np.array(cycles*[100]))),0)

    alpha = 0.1
    if show_variance:
        plt.plot(a, color=color, alpha=alpha, ls=ls)
        plt.plot(b, color=color, alpha=alpha, ls=ls)
        plt.fill_between(np.arange(cycles), a, b, alpha=alpha, color=color)

    plt.plot(mu, color=color, label=label, lw=3, ls=ls)
    
    plt.xscale('log')
    plt.xlabel('Cycles', fontsize=20)
    plt.ylabel(y_axis, fontsize=20)
    
    if outfile:
        plt.savefig(directory + '/' + outfile + '.png', bbox_inches='tight')
        plt.close()
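
Because plot_rc_results draws onto the current axes, the ls argument can be used to overlay observed and true reward for the same agent on one figure. A hypothetical usage sketch (the directory name merely follows the pattern used in the next cell):


In [ ]:
# Hypothetical overlay of observed vs. true reward for one agent
plot_rc_results('reward-corruption/goals4_qlearning', 'results', 'rewards',
                runs=5, color='red', ls='solid', label='Q-learning (observed)')
plot_rc_results('reward-corruption/goals4_qlearning', 'results', 'true_rewards',
                runs=5, color='red', ls='dashed', label='Q-learning (true)')
plt.legend(loc='lower right')
plt.show()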

In [14]:
### plot results for different agents on the same plot
runs = 100   # number of runs
goals = '4'  # number of goal tiles
for rew in ('true_', ''):  # one figure for true reward, one for observed reward
    plot_rc_results('reward-corruption/goals' + goals + '_qlearning', 'results', rew+'rewards', runs=runs, color='red')
    plot_rc_results('reward-corruption/goals' + goals + '_softmax', 'results', rew+'rewards', runs=runs, color='orange', label='Softmax')
    plot_rc_results('reward-corruption/goals' + goals + '_quantiliser_delta.2', 'results', rew+'rewards', runs=runs, color='black', label='Quantiliser (.2)')
    plot_rc_results('reward-corruption/goals' + goals + '_quantiliser_delta.5', 'results', rew+'rewards', runs=runs, color='blue', label='Quantiliser (.5)')
    plot_rc_results('reward-corruption/goals' + goals + '_quantiliser_delta.8', 'results', rew+'rewards', runs=runs, color='green', label='Quantiliser (.8)')
    plt.ylim([0, 1])
    plt.legend(loc=2)
    plt.savefig('reward-corruption/goals' + goals + '_' + rew + '.png', bbox_inches='tight', format='png')
    plt.close()

In [23]:
# compute average observed and true rewards
def round_to_n(x, n):
    # round x to n significant figures
    return round(x, -int(np.floor(np.log10(x))) + (n - 1))

def comb(a, b):
    # format a mean and standard deviation as 'mean +- std'
    return str(round_to_n(a,3)) + ' +- ' + str(round_to_n(b,2))

combine = np.vectorize(comb)
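# Worked example with illustrative numbers (not taken from the results files):
# round_to_n(0.91892, 3) -> 0.919 and round_to_n(0.0412, 2) -> 0.041,
# so comb(0.91892, 0.0412) -> '0.919 +- 0.041'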

def average_results(directory,
                 filename='results',
                 runs=100,
                 cycles=1000000):
    A = np.zeros((2,runs))
    for j in range(runs):
        with open(directory + '/' + filename + '-' + str(j+1) + '.json') as file:
            data = json.load(file)
        # running averages of observed and true reward at the final cycle
        A[0,j] = data['Reward Corruption'][0]['rewards'][cycles-1]
        A[1,j] = data['Reward Corruption'][0]['true_rewards'][cycles-1]
    res = combine(np.mean(A,1), np.std(A,1))
    print(directory + '\t observed rewards ' + res[0] + '\t true rewards ' + res[1])

average_results('reward-corruption/goals' + goals + '_qlearning')
average_results('reward-corruption/goals' + goals + '_softmax')
average_results('reward-corruption/goals' + goals + '_quantiliser_delta.2')
average_results('reward-corruption/goals' + goals + '_quantiliser_delta.5')
average_results('reward-corruption/goals' + goals + '_quantiliser_delta.8')


reward-corruption/goals4_quantiliser_delta.5	 observed rewards 0.919 +- 0.04	 true rewards 0.719 +- 0.36
