In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(42)
import gym
from gym import wrappers
In [2]:
# Create the environment
#env = gym.make('CartPole-v0')
env = gym.make('FrozenLake-v0')
env.seed(42)
env = wrappers.Monitor(env, './frozenlake-v0-q-learning-softmax-01', force=True)
In [3]:
env.render()
Out[3]:
In [4]:
def softmax(q_vector, beta=1.0):
    assert beta >= 0.0
    # Subtract the max for numerical stability before exponentiating
    q_tilde = q_vector - np.max(q_vector)
    factors = np.exp(beta * q_tilde)
    return factors / np.sum(factors)
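The softmax turns the Q-values of a state into a probability distribution over actions; subtracting the maximum keeps the exponentials numerically stable, and the inverse temperature beta interpolates between uniform random selection (beta = 0) and greedy selection (large beta). A quick illustrative check (the q_demo values below are made up):
# Illustrative only: a hypothetical action-value vector
q_demo = np.array([0.1, 0.5, 0.2, 0.4])
for b in [0.0, 1.0, 10.0]:
    # beta = 0 gives a uniform distribution; large beta approaches greedy
    print(b, softmax(q_demo, beta=b))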
In [5]:
def select_a_with_softmax(q_vector, beta=1.0):
    # Sample an action index from the softmax distribution over Q-values
    prob_a = softmax(q_vector, beta=beta)
    cumsum_a = np.cumsum(prob_a)
    return np.where(np.random.rand() < cumsum_a)[0][0]
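The action is drawn by comparing a uniform random number against the cumulative distribution, which is equivalent to np.random.choice(len(q_vector), p=prob_a). A minimal sanity check that the empirical selection frequencies match the softmax probabilities (the sample size is arbitrary):
# Sanity check: empirical frequencies should be close to the softmax probabilities
q_demo = np.array([0.1, 0.5, 0.2, 0.4])
samples = [select_a_with_softmax(q_demo, beta=5.0) for _ in range(10000)]
print(np.bincount(samples, minlength=len(q_demo)) / 10000.0)
print(softmax(q_demo, beta=5.0))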
In [6]:
nb_episode = 20000
In [7]:
nb_o = env.observation_space.n
nb_a = env.action_space.n
# Initialize the state-action value estimates (Q-table)
q_value = np.zeros([nb_o, nb_a])
# Optimistic initialization (alternative)
#q_value = 0.5 + 0.1 * np.random.randn(nb_o, nb_a)
# Learning rate
alpha = 0.1
# Inverse temperature (softmax exploration parameter)
beta = 0.
inc_beta = 250./nb_episode
# Discount rate
gamma = 0.99
returns = []
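With inc_beta = 250./nb_episode, beta is annealed linearly from 0 (uniform exploration) to roughly 250 (nearly greedy) over the run. A quick illustration of the schedule:
# Illustrative: value of beta at a few points of the linear annealing schedule
for ep in [0, 5000, 10000, nb_episode - 1]:
    print(ep, ep * inc_beta)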
In [8]:
for episode in range(nb_episode):
    # Initialize the cumulative reward
    cum_r = 0
    # Reset the environment and get the first observation
    curr_o = env.reset()
    # Softmax action selection
    curr_a = select_a_with_softmax(q_value[curr_o, :], beta)
    while True:
        # Execute the selected action
        next_o, reward, done, info = env.step(curr_a)
        # Softmax action selection for the next step
        next_a = select_a_with_softmax(q_value[next_o, :], beta)
        # Q-learning update
        if done:
            q_target = reward
        else:
            q_target = reward + gamma * np.max(q_value[next_o, :])
        delta = q_target - q_value[curr_o, curr_a]
        q_value[curr_o, curr_a] += alpha * delta
        # Accumulate the discounted reward
        cum_r = reward + gamma * cum_r
        # Prepare for the next step
        curr_o = next_o
        curr_a = next_a
        if done:
            break
    # Record the cumulative reward
    returns.append(cum_r)
    # Anneal: increase the inverse temperature
    beta += inc_beta
print(np.mean(returns))
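To judge the learned Q-table independently of the exploration noise, the greedy policy can be evaluated on a fresh environment. A minimal sketch, assuming a separate unmonitored FrozenLake-v0 instance (the 1000-episode evaluation budget is arbitrary):
# Evaluate the greedy policy derived from q_value on a fresh environment
eval_env = gym.make('FrozenLake-v0')
eval_env.seed(0)
n_eval = 1000
successes = 0
for _ in range(n_eval):
    o = eval_env.reset()
    done = False
    while not done:
        o, r, done, _ = eval_env.step(np.argmax(q_value[o, :]))
    successes += r
print(successes / float(n_eval))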
In [9]:
# Plot a moving average of the cumulative reward
window = 100
plt.plot(np.convolve(np.array(returns), np.ones((window,))/window, mode='valid'))
plt.xlabel('Episode')
plt.ylabel('Cumulative reward')
Out[9]:
In [10]:
q_value
Out[10]:
In [11]:
#q_table = np.array([agent.q[ii] for ii in range(env.observation_space.n)])
plt.figure(figsize=(5, 5))
#plt.imshow(q_value, interpolation='nearest', aspect='auto', cmap='bwr_r')
plt.imshow(q_value, interpolation='nearest', aspect='auto', cmap='gray')
#plt.clim(0.0, 1.0)
Out[11]:
In [12]:
plt.figure(figsize=(10, 5))
#plt.imshow(q_table, interpolation='nearest', aspect='auto', cmap='bwr_r')
#plt.clim(-1.0, 1.0)
#plt.clim(0.0, 1.0)
plt.plot(q_value, 'o-');
plt.legend([0, 1, 2, 3])  # action indices (FrozenLake: 0=Left, 1=Down, 2=Right, 3=Up)
Out[12]:
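Since FrozenLake-v0 is a 4x4 grid, the greedy action for each state can also be laid out spatially. A small sketch (action indices: 0=Left, 1=Down, 2=Right, 3=Up):
# Greedy action per state, reshaped onto the 4x4 FrozenLake grid
greedy_actions = np.argmax(q_value, axis=1).reshape(4, 4)
print(greedy_actions)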
In [13]:
env.render()
Out[13]:
In [14]:
env.close()
In [15]:
#api_key = '********'
#gym.upload('./frozenlake-v0-q-learning-softmax-01/', api_key=api_key)
In [ ]: