In [ ]:
import numpy as np
import pandas as pd
from logging import getLogger, StreamHandler, DEBUG, INFO
import time
import os
import warnings
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, InputLayer
from keras.optimizers import Adam
from keras.initializers import TruncatedNormal

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

from debug_tools import DebugTools
from hist_data import HistData
from episode_logger import EpisodeLogger
from model_saver import ModelSaver
from my_tensor_board import MyTensorBoard
from fx_trade import FXTrade

In [ ]:
class DeepFX:
    def __init__(self, env, steps=50000,
              log_directory='./logs', model_directory='./models',
              model_filename='Keras-RL_DQN_FX_model_meanq{mean_q:e}_episode{episode:05d}',
              prepared_model_filename=None,
              weights_filename='Keras-RL_DQN_FX_weights.h5',
              logger=None):

        self._log_directory = log_directory
        self._model_directory = model_directory
        self._model_filename = model_filename
        self._prepared_model_filename = prepared_model_filename
        self._weights_filename = weights_filename
        self._load_model_path = self._relative_path(model_directory, prepared_model_filename) 
        self._save_model_path = self._relative_path(model_directory, model_filename)
        self._env = env
        self.steps = steps
        self._logger = logger
        

    def setup(self):
        self._agent, self._model, self._memory, self._policy = self._initialize_agent()
        self._agent.compile('adam')
        # model.summary() prints to stdout and returns None, so route it through the logger instead.
        self._model.summary(print_fn=self._logger.info)

    def train(self, is_for_time_measurement=False, wipe_instance_variables_after=True):
        self.setup()
        self._callbacks = self._get_callbacks()
        self._fit(self._agent, is_for_time_measurement, self._env, self._callbacks)
        if wipe_instance_variables_after:
            self._wipe_instance_variables()

    def test(self, callbacks=[], wipe_instance_variables_after=True):
        self.setup()
        self._agent.test(self._env, visualize=False, callbacks=callbacks)

        # Optional reward plot (requires matplotlib and an EpisodeLogger-style callback):
        #%matplotlib inline
        #import matplotlib.pyplot as plt
        #for obs in callbacks[0].rewards.values():
        #    plt.plot([o for o in obs])
        #plt.xlabel("step")
        #plt.ylabel("reward")
        if wipe_instance_variables_after:
            self._wipe_instance_variables()
        
    def _wipe_instance_variables(self):
        self._callbacks, self._agent, self._model, \
            self._memory, self._policy, self._env = [None] * 6
        
    def _relative_path(self, directory, filename):
        if directory is None or filename is None:
            return None
        return os.path.join(directory, filename)

    def _get_model(self, load_model_path, observation_space_shape, nb_actions):
        if load_model_path is None:
            # Define the DQN network
            # ref: https://github.com/googledatalab/notebooks/blob/master/samples/TensorFlow/Machine%20Learning%20with%20Financial%20Data.ipynb
            model = Sequential()
            model.add(Flatten(input_shape=(1,) + observation_space_shape))
            #model.add(InputLayer(input_shape=(1,) + observation_space_shape))
            model.add(Dense(50, activation='relu', kernel_initializer=TruncatedNormal(stddev=0.0001), bias_initializer='ones'))
            model.add(Dense(25, activation='relu', kernel_initializer=TruncatedNormal(stddev=0.0001), bias_initializer='ones'))
            model.add(Dense(nb_actions, activation='linear'))
        else:
            model = keras.models.load_model(load_model_path)
        return model

    def _initialize_agent(self):
        nb_actions = self._env.action_space.n
        observation_space_shape = self._env.observation_space.shape
        model = self._get_model(self._load_model_path, observation_space_shape, nb_actions)
        
        # Memory for experience replay
        memory = SequentialMemory(limit=500000, window_length=1)
        # The behavior policy is a standard epsilon-greedy. Alternatively, BoltzmannQPolicy,
        # which picks actions with probabilities derived from each action's Q value, is available.
        policy = EpsGreedyQPolicy(eps=0.1)
        dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
                       policy=policy)
                       #target_model_update=1e-2, policy=policy)
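        # Note: in keras-rl, a target_model_update value >= 1 performs a hard copy of the
        # target network every that many steps, while a value < 1 (as in the commented-out
        # line above) applies soft updates with that factor.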
        #dqn.compile(Adam(lr=1e-3))
        return (dqn, model, memory, policy)
        
    def _get_callbacks(self):
        tensor_board_callback = MyTensorBoard(log_dir=self._log_directory, histogram_freq=1, embeddings_layer_names=True, write_graph=True)
        model_saver_callback = ModelSaver(self._save_model_path, monitor='mean_q', mode='max', logger=self._logger)
        episode_logger_callback = EpisodeLogger(logger=self._logger)
        callbacks = [tensor_board_callback, model_saver_callback, episode_logger_callback]
        return callbacks

    def _fit(self, agent, is_for_time_measurement, env, callbacks=[]):
        if is_for_time_measurement:
            start = time.time()
            self._logger.info(DebugTools.now_str())
            history = agent.fit(env, nb_steps=self.steps, visualize=False, verbose=2, nb_max_episode_steps=None, \
                             callbacks=callbacks)
            elapsed_time = time.time() - start
            self._logger.warning('elapsed_time: {0} [sec]'.format(elapsed_time))
            self._logger.info(DebugTools.now_str())
        else:
            history = agent.fit(env, nb_steps=self.steps, visualize=True, verbose=2, nb_max_episode_steps=None,
                                callbacks=callbacks)
        # To watch training progress, implement _render() on the Env and set visualize=True.
        
    def _render(self, mode='human', close=False):
        # Debugging stub: drops into pdb whenever rendering is requested.
        import pdb; pdb.set_trace()
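
A minimal usage sketch follows: build an environment, train, then evaluate a saved model with an EpisodeLogger callback. The HistData/FXTrade constructor arguments, the logger setup, and the saved model filename are assumptions, not taken from this notebook.

In [ ]:
# Minimal usage sketch. The HistData/FXTrade constructor arguments and the saved model
# filename below are hypothetical, not defined in this notebook.
logger = getLogger(__name__)
logger.setLevel(DEBUG)
logger.addHandler(StreamHandler())

hist_data = HistData('USDJPY')           # hypothetical arguments
env = FXTrade(hist_data, logger=logger)  # hypothetical arguments

# Train a fresh agent; ModelSaver writes the best models to model_directory.
deep_fx = DeepFX(env, steps=50000, logger=logger)
deep_fx.train(is_for_time_measurement=True)

# To evaluate a model saved by ModelSaver, point prepared_model_filename at it
# (the actual filename depends on the save pattern used above).
deep_fx_eval = DeepFX(env, prepared_model_filename='<saved-model-file>',  # hypothetical
                      logger=logger)
deep_fx_eval.test(callbacks=[EpisodeLogger(logger=logger)])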