In [1]:
import gym
import keras
import numpy as np
In [2]:
# Hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # number of episodes per parameter update
learning_rate = 1e-3
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False
D = 80 * 80 # input dimensionality: 80x80 grid
running_reward = 21.0 # starting value for the running reward estimate
def prepro(I):
""" prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
I = I[35:195] # crop
I = I[::2,::2,0] # downsample by factor of 2
I[I == 144] = 0 # erase background (background type 1)
I[I == 109] = 0 # erase background (background type 2)
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I.astype(np.float32).ravel()
def discount_rewards(r):
""" take 1D float array of rewards and compute discounted reward """
discounted_r = np.zeros_like(r)
running_add = 0
    for t in reversed(range(r.size)):
if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
running_add = running_add * gamma + r[t]
discounted_r[t] = running_add
return discounted_r
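# discount_rewards is defined above but never called later in this notebook; a minimal
# sketch of its intended use (my assumption, following the usual policy-gradient recipe)
# is to discount each rally's rewards and normalize them before using them as per-frame
# loss weights:
toy_r = np.array([0., 0., 1., 0., 0., -1.])       # two short rallies: win, then lose
toy_dr = discount_rewards(toy_r)                  # running sum resets at each rally end
toy_dr = (toy_dr - toy_dr.mean()) / toy_dr.std()  # normalize to zero mean, unit variance
print(toy_dr)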
In [3]:
def prepro(I):
""" prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
I = I[35:195] # crop
I = I[::2,::2,0] # downsample by factor of 2
I[I == 144] = 0 # erase background (background type 1)
I[I == 109] = 0 # erase background (background type 2)
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I.astype(np.float32).ravel()
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import RMSprop
def get_dense_model():
"""Make keras model"""
learning_rate=1e-4
inp = Input(shape=(80*80,))
h = Dense(200, activation='relu')(inp)
out = Dense(1, activation='sigmoid')(h)
model = Model(inp, out)
optim = RMSprop(learning_rate)
model.compile(optim, 'binary_crossentropy')
    try:
        model.load_weights('mod_weights_binary.h5')
        print('weights loaded')
    except (IOError, OSError):
        pass  # no saved weights yet; start from the fresh initialization
return model
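# A quick shape sanity check for the two helpers above (a sketch, not part of the
# original cells): the preprocessed frame should flatten to 80*80 = 6400 floats and the
# policy head should output a single probability of moving the paddle up.
chk_env = gym.make("Pong-v0")
chk_frame = prepro(chk_env.reset()).ravel()
print(chk_frame.shape)                                # (6400,)
chk_model = get_dense_model()
print(chk_model.predict(chk_frame.reshape((1, -1))))  # e.g. [[ 0.49...]]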
In [4]:
from keras.models import Sequential
from keras.layers import Dense
kmodel = Sequential()
kmodel.add(Dense(200, input_dim=6400, activation="relu"))
kmodel.add(Dense(1, activation="sigmoid"))
kmodel.compile(loss='binary_crossentropy',
               optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08, decay=0.0))
import os
if not os.path.isdir("model"):
    os.makedirs("model")  # save() below assumes a model/ directory exists
kmodel.save("model/model.h5")
from keras.models import load_model
def get_dense_model():
return load_model('model/model.h5')
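# The fitting cells below feed (X, ACTION, REWARD) tuples to fit_generator; Keras treats
# the third element of such a tuple as per-sample weights, so each frame's cross-entropy
# on the chosen action is scaled by its reward. A single-batch sketch of the same update
# (shapes and values here are illustrative assumptions):
fake_x = np.random.randn(4, 6400).astype(np.float32)  # 4 fake difference frames
fake_a = np.array([1, 0, 1, 0])                       # "fake labels": 1 if the action was up
fake_w = np.array([1., 1., -1., -1.])                 # e.g. discounted returns as weights
kmodel.train_on_batch(fake_x, fake_a, sample_weight=fake_w)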
In [5]:
game = "Pong-v0"
def run_parallel_episodes(pp, n_episodes=3):
    """Collect episodes from a pool of workers `pp` (e.g. a multiprocessing Pool;
    see the sketch at the end of this cell)."""
    X, ACTION, REWARD = [], [], []
    outs = pp.map(run_episodes, n_episodes * [0])
for o in outs:
X.extend(o[0])
ACTION.extend(o[1])
REWARD.extend(o[2])
X = np.vstack(X)
ACTION = np.vstack(ACTION)
REWARD = np.vstack(REWARD)
return X,ACTION,REWARD
def run_episodes(thr=0):
    """Play n_episodes games of Pong and return stacked frames, actions and rewards.
    `thr` is a dummy argument so this function can also be used with Pool.map."""
    n_episodes = 3
D=80*80
model=get_dense_model()
env=gym.make(game)
observation = env.reset()
prev_x = None # used in computing the difference frame
X,ACTION,REWARD = [],[],[]
running_reward = None
reward_sum = 0
episode_number = 0
loc_len=0
while True:
# preprocess the observation, set input to network to be difference image
cur_x = prepro(observation)
x = cur_x - prev_x if prev_x is not None else np.zeros(D)
prev_x = cur_x
# forward the policy network and sample an action from the returned probability
        aprob = model.predict(x.reshape((1, -1)))[0, 0]   # probability of moving up
        action = 2 if np.random.uniform() < aprob else 3  # roll the dice! (2 = up, 3 = down)
# record various intermediates (needed later for backprop)
X.append(x.reshape((1, -1))) # observation
# y = 1 if action == 2 else 0 # a "fake label" giving the action chosen
ACTION.append(1 if action == 2 else 0) # a "fake label" giving the action chosen
# step the environment and get new measurements
observation, reward, done, info = env.step(action)
reward_sum += reward
REWARD.append(reward) # record reward (has to be done after we call step() to get reward for previous action)
loc_len+=1
if done: # an episode finished (one player has reached a score of 21)
episode_number += 1
#print(episode_number,reward_sum,loc_len)
reward_sum=0
loc_len=0
if episode_number>(n_episodes-1):
X = np.vstack(X)
ACTION = np.array(ACTION)
REWARD = np.array(REWARD)
return X,ACTION,REWARD
            observation = env.reset()
            prev_x = None  # start the next game from a fresh difference frame
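# run_parallel_episodes above expects an already-built worker pool `pp`, which is never
# constructed in this notebook; a minimal sketch of wiring it up (an assumption, not part
# of the original run; each task loads its own model copy and plays 3 full games, so this
# is slow):
from multiprocessing import Pool
pp = Pool(processes=2)
X_par, ACTION_par, REWARD_par = run_parallel_episodes(pp, n_episodes=2)
print(X_par.shape)
pp.close(); pp.join()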
In [6]:
import threading
class threadsafe_iter:
    """Takes an iterator/generator and makes it thread-safe by
    serializing calls to the `next` method of the given iterator/generator.
    """
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()
    def __iter__(self):
        return self
    def __next__(self):
        with self.lock:
            return next(self.it)
    next = __next__  # Python 2 compatibility
def threadsafe_generator(f):
"""A decorator that takes a generator function and makes it thread-safe.
"""
def g(*a, **kw):
return threadsafe_iter(f(*a, **kw))
return g
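# A small demonstration of why the wrapper is needed (illustrative, not part of the
# original notebook): several threads pulling from one raw generator can raise
# "ValueError: generator already executing"; the lock serializes those calls.
@threadsafe_generator
def count_up():
    i = 0
    while True:
        i += 1
        yield i
safe_counter = count_up()
seen = []
def pull():
    for _ in range(1000):
        seen.append(next(safe_counter))
pull_threads = [threading.Thread(target=pull) for _ in range(4)]
for t in pull_threads: t.start()
for t in pull_threads: t.join()
print(len(set(seen)))  # 4000: every value was handed out exactly once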
In [7]:
import threading
@threadsafe_generator
def game_generator():
    # Each yield is an (X, ACTION, REWARD) tuple; fit_generator treats a 3-tuple as
    # (inputs, targets, sample_weights), so the recorded rewards weight the per-frame
    # binary cross-entropy on the chosen action.
    while True:
        obs = run_episodes()
        yield obs
In [8]:
import datetime
In [9]:
start = datetime.datetime.now()
kmodel.fit_generator(generator=game_generator(),
                     steps_per_epoch=10,
                     epochs=1,
                     workers=1,          # a single worker generates episodes
                     verbose=2,
                     pickle_safe=False)  # thread-based generation
print(datetime.datetime.now() - start)
#kmodel.save("model/model.h5")
In [10]:
start = datetime.datetime.now()
kmodel.fit_generator(generator=game_generator(),
                     steps_per_epoch=10,
                     epochs=1,
                     workers=5,          # five workers generate episodes in parallel
                     verbose=2,
                     pickle_safe=True)   # process-based generation
print(datetime.datetime.now() - start)
#kmodel.save("model/model.h5")
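# A hedged sketch of watching the current policy play one full game, acting greedily
# rather than sampling (not part of the original run; rendering needs a display and the
# model is still nearly untrained at this point):
env = gym.make(game)
observation = env.reset()
prev_x, done, score = None, False, 0
while not done:
    if render:
        env.render()
    cur_x = prepro(observation).ravel()
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x
    up_prob = kmodel.predict(x.reshape((1, -1)))[0, 0]
    action = 2 if up_prob > 0.5 else 3  # 2 = up, 3 = down; greedy instead of sampling
    observation, reward, done, info = env.step(action)
    score += reward
print(score)  # final score differential, between -21 and 21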