Power Generation Forecasting with a GRU

What was done

  • Forecast power generation with a GRU (Gated Recurrent Unit)

In [ ]:
%matplotlib inline

In [ ]:
import os
from os import path, pardir
import sys
import re

In [ ]:
PROJECT_ROOT_DIRPATH = path.join(os.getcwd(), pardir)

In [ ]:
sys.path.append(PROJECT_ROOT_DIRPATH)

In [ ]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [ ]:
KWARGS_READ_CSV = {
    "sep": "\t",
    "header": 0,
    "parse_dates": [0],
    "index_col": 0
}
KWARGS_RESAMPLING = {
    "rule": "30T",
    "axis": 0,
    "closed": "right",
    "label": "right"
}
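
Both kwargs dicts are reused throughout the notebook: KWARGS_READ_CSV for loading the TSV datasets, and KWARGS_RESAMPLING for aggregating the 10-minute test predictions into 30-minute totals before saving.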

In [ ]:
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from keras.layers import (
    BatchNormalization,
    Dense,
    Dropout,
    Input,
    GRU,
    Masking,
    TimeDistributed
)
from keras.optimizers import Adam
from keras.regularizers import l1_l2

In [ ]:
epochs = 10000
validation_split = 0.2

gru_params = {
    "units": 48,
    "batch_size": 256,
    "activation": 'sigmoid',
    "recurrent_activation": 'sigmoid',
    # "kernel_regularizer": {"l1": 0.01, "l2": 0.005},
    "dropout": 0.1,
    "recurrent_dropout": 0.1,
}

In [ ]:
freq_minute = 10
validation_year = 2015

In [ ]:
LOCATIONS = (
    "ukishima",
    "ougishima",
    "yonekurayama"
)

In [ ]:
TRAIN_TEST_FILEPATH_PREFIX = path.join(PROJECT_ROOT_DIRPATH, "data", "processed", "dataset")
TRAIN_TEST_FILEPATH_EXTENSION = "tsv"

interim_serialize_filename_prefix = path.join(
    PROJECT_ROOT_DIRPATH, "models", "gru.interim", "fit_model.layer0"
)
model_serialize_filename_prefix = path.join(
    PROJECT_ROOT_DIRPATH, "models", "gru", "fit_model.layer0"
)
predict_serialize_filename_prefix = path.join(
    PROJECT_ROOT_DIRPATH, "models", "gru", "predict.layer0"
)
os.makedirs(path.dirname(interim_serialize_filename_prefix), exist_ok=True)
os.makedirs(path.dirname(model_serialize_filename_prefix), exist_ok=True)

In [ ]:
def get_dataset(target,
                location,
                prefix=TRAIN_TEST_FILEPATH_PREFIX,
                suffix=TRAIN_TEST_FILEPATH_EXTENSION):
    if target == "train":
        filepath = '.'.join([prefix, "train_X_y.every_10", location, suffix])
        df = pd.read_csv(filepath, **KWARGS_READ_CSV)

        # The last column is the target; the rest are features.
        return df.iloc[:, :-1], df.iloc[:, -1]
    elif target == "test":
        filepath = '.'.join([prefix, "test_X.every_10", location, suffix])

        return pd.read_csv(filepath, **KWARGS_READ_CSV)

In [ ]:
def gen_model_name_from_param_dict(param_dict):
    model_name = str()
    for k, v in sorted(param_dict.items()):
        if isinstance(v, dict):
            for k_inner, v_inner in v.items():
                model_name += "{ko}_{ki}_{v}.".format(ko=k, ki=k_inner, v=v_inner)
        else:
            model_name += "{k}_{v}.".format(k=k, v=v)

    return model_name[:-1]
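
For reference, the helper can be checked against the gru_params defined above; with keys sorted alphabetically, the expected name is shown in the comment below (this name is embedded in the checkpoint, prediction, and model file names). Illustrative only:

In [ ]:
# Expected for the current gru_params (illustrative):
# activation_sigmoid.batch_size_256.dropout_0.1.recurrent_activation_sigmoid.recurrent_dropout_0.1.units_48
print(gen_model_name_from_param_dict(gru_params))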

In [ ]:
def gen_modified_param_dict(gru_params):
    modified_param_dict = dict()
    pt_regularizer = re.compile("regularizer")
    for k, v in gru_params.items():
        if pt_regularizer.search(k):
            modified_param_dict[k] = l1_l2(l1=v['l1'], l2=v['l2'])
        else:
            modified_param_dict[k] = v

    return modified_param_dict
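
If the kernel_regularizer entry commented out in gru_params were enabled, this helper would replace its dict with a Keras regularizer object before the parameters reach the GRU layer. A minimal sketch, assuming the dict form shown above (params_with_reg and modified are illustrative names):

In [ ]:
# Sketch: enable the regularizer entry and let the helper convert it.
params_with_reg = dict(gru_params, kernel_regularizer={"l1": 0.01, "l2": 0.005})
modified = gen_modified_param_dict(params_with_reg)
print(modified["kernel_regularizer"])  # a keras.regularizers.L1L2 instance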

In [ ]:
def gen_sequential(input_shape, gru_params):
    inputs = Input(shape=input_shape)
    # Timesteps whose features are all 1.0 (the fill value used below for
    # missing rows) are skipped downstream.
    x = Masking(mask_value=1.)(inputs)
    x = BatchNormalization()(x)
    x = GRU(return_sequences=True, **gru_params)(x)
    # One linear output per timestep.
    predictions = TimeDistributed(Dense(1, activation="linear"))(x)
    model = Model(inputs=inputs, outputs=predictions)

    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss="mean_squared_error", optimizer=optimizer, metrics=["mae", ])

    return model

In [ ]:
def gen_callbacks(target, location):
    serialize_path = '.'.join([interim_serialize_filename_prefix,
                               "ep_{epoch:04d}.mae_{mean_absolute_error:.2f}.val_mae_{val_mean_absolute_error:.2f}",
                               "{t}.{l}.hdf5".format(t=target, l=location)])

    cp_best = ModelCheckpoint(filepath=serialize_path,
                              monitor='val_mean_absolute_error',
                              save_best_only=True,
                              mode='auto')
    cp_period = ModelCheckpoint(filepath=serialize_path, period=100)
    es = EarlyStopping(monitor='val_mean_absolute_error', min_delta=0, patience=100, mode='auto')

    return [cp_best, cp_period, es]
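
Three callbacks are used for each fit: a checkpoint that keeps only the best model by validation MAE, a periodic snapshot every 100 epochs, and early stopping once the validation MAE has not improved for 100 epochs.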

In [ ]:
def split_from_index(ndarray, begin_val_index, end_val_index):
    if ndarray.ndim == 2:
        return ndarray[:begin_val_index, :], ndarray[begin_val_index:end_val_index, :]
    elif ndarray.ndim == 3:
        return ndarray[:begin_val_index, :, :], ndarray[begin_val_index:end_val_index, :, :]

In [ ]:
class DummyScaler(BaseEstimator, TransformerMixin):
    def __init__(self, copy=True):
        self.copy = copy

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    def inverse_transform(self, X):
        return X
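
DummyScaler is a pass-through that mimics the MinMaxScaler interface (fit / transform / inverse_transform), so the target pipeline can call inverse_transform unconditionally and a real scaler could later be swapped in for y without changing the downstream code.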

In [ ]:


Ukishima

Setting constants


In [ ]:
location = "ukishima"

Load training data


In [ ]:
df_x, df_y = get_dataset("train", location)

Define constants


In [ ]:
len_series = 144
num_features = df_x.shape[1]
datetime_index = df_y.index.copy(deep=True)
is_null_y = df_y.isnull().copy(deep=True)

model_name = gen_model_name_from_param_dict(gru_params)
modified_param_dict = gen_modified_param_dict(gru_params)
modified_param_dict["name"] = model_name
target_val = "foldout_{y}".format(y=validation_year)

Min-max scaling over the whole series


In [ ]:
ndarray_x = df_x.as_matrix()
ndarray_y = df_y.as_matrix().reshape(-1, 1)
is_nan_y_train = np.isnan(ndarray_y.reshape(-1))

ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.

x_scaler = MinMaxScaler()
ndarray_x = x_scaler.fit_transform(ndarray_x)

y_scaler = DummyScaler()
ndarray_y = y_scaler.fit_transform(ndarray_y)

ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.
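
Rows whose target is missing are filled with 1.0 both before and after scaling so that the Masking layer in gen_sequential can skip them; the features are min-max scaled while the target is passed through unchanged via DummyScaler.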

Split the dataset


In [ ]:
ndarray_x = ndarray_x.reshape(-1, len_series, num_features).astype('float32')
ndarray_y = ndarray_y.reshape(-1, len_series, 1)

begin_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year))
end_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year+1))
begin_val_index = int(datetime_index.get_loc(begin_val_datetime) / len_series)
val_end_index = int(datetime_index.get_loc(end_val_datetime) / len_series)

x_train, x_val = split_from_index(ndarray_x, begin_val_index, val_end_index)
y_train, y_val = split_from_index(ndarray_y, begin_val_index, val_end_index)
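
The flat arrays are reshaped into samples of one day each (len_series = 144 ten-minute steps), and the validation year (2015) is held out by locating its boundary timestamps in the index and converting them to sample indices.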

Remove missing samples


In [ ]:
num_sample_train = x_train.shape[0]
num_sample_val = x_val.shape[0]

remove_sample_list = [
    i for i in range(0, int(df_y.size/len_series))
    if is_null_y.iloc[np.arange(len_series*i, len_series*(i+1))].sum() == len_series
]
remove_samples_train = [
    elem
    for elem in remove_sample_list
    if elem < num_sample_train
]
remove_samples_val = [
    elem - num_sample_train
    for elem in remove_sample_list
    if num_sample_train <= elem < num_sample_train + num_sample_val
]

x_train = np.delete(x_train, remove_samples_train, 0)
y_train = np.delete(y_train, remove_samples_train, 0)
x_val = np.delete(x_val, remove_samples_val, 0)
y_val = np.delete(y_val, remove_samples_val, 0)
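
A day is dropped from training or validation only when all 144 of its target values are missing; partially missing days are kept, with their missing timesteps handled by the Masking layer.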

Declare the model


In [ ]:
model = gen_sequential((len_series, num_features), modified_param_dict)
model.summary()

Modeling for validation data


In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_split=validation_split,
                    callbacks=gen_callbacks(target_val, location))

Output and save results for the validation data


In [ ]:
y_pred = y_scaler.inverse_transform(model.predict(x_val).reshape(-1, 1))
y_true = y_scaler.inverse_transform(y_val.reshape(-1, 1))

index = pd.date_range(begin_val_datetime,
                      end_val_datetime + pd.Timedelta(-freq_minute, unit='m'),
                      freq=pd.offsets.Minute(freq_minute))
val_index = index.copy(deep=True)
for i in remove_samples_val:
    val_index = val_index.drop(index[len_series*i:len_series*(i+1)])

pd.DataFrame(
    y_pred, index=val_index, columns=[model_name,]
).to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              target_val, 
              location,
              "hdf5"])
)

print("MAE convert into every 30 in {y}: ".format(y=validation_year),
      3 * mean_absolute_error(y_true, y_pred))

Modeling for test data


In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=gen_callbacks("test", location))

Save results for the test data


In [ ]:
df_test = get_dataset("test", location)
x_test = x_scaler.transform(df_test.as_matrix())

y_pred = y_scaler.inverse_transform(
    model.predict(
        x_test.reshape(-1, len_series, num_features).astype('float32')
    ).reshape(-1, 1)
)
df_y_pred = pd.DataFrame(y_pred, index=df_test.index, columns=[model_name,])

df_y_pred.resample(
    **KWARGS_RESAMPLING
).sum().to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "hdf5"]),
)

In [ ]:


Ougishima

Setting constants


In [ ]:
location = "ougishima"

Load training data


In [ ]:
df_x, df_y = get_dataset("train", location)

Define constants


In [ ]:
len_series = 144
num_features = df_x.shape[1]
datetime_index = df_y.index.copy(deep=True)
is_null_y = df_y.isnull().copy(deep=True)

model_name = gen_model_name_from_param_dict(gru_params)
modified_param_dict = gen_modified_param_dict(gru_params)
modified_param_dict["name"] = model_name
target_val = "foldout_{y}".format(y=validation_year)

Min-max scaling over the whole series


In [ ]:
ndarray_x = df_x.as_matrix()
ndarray_y = df_y.as_matrix().reshape(-1, 1)
is_nan_y_train = np.isnan(ndarray_y.reshape(-1))

ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.

x_scaler = MinMaxScaler()
ndarray_x = x_scaler.fit_transform(ndarray_x)

y_scaler = DummyScaler()
ndarray_y = y_scaler.fit_transform(ndarray_y)

ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.

Split the dataset


In [ ]:
ndarray_x = ndarray_x.reshape(-1, len_series, num_features).astype('float32')
ndarray_y = ndarray_y.reshape(-1, len_series, 1)

begin_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year))
end_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year+1))
begin_val_index = int(datetime_index.get_loc(begin_val_datetime) / len_series)
val_end_index = int(datetime_index.get_loc(end_val_datetime) / len_series)

x_train, x_val = split_from_index(ndarray_x, begin_val_index, val_end_index)
y_train, y_val = split_from_index(ndarray_y, begin_val_index, val_end_index)

Remove missing samples


In [ ]:
num_sample_train = x_train.shape[0]
num_sample_val = x_val.shape[0]

remove_sample_list = [
    i for i in range(0, int(df_y.size/len_series))
    if is_null_y.iloc[np.arange(len_series*i, len_series*(i+1))].sum() == len_series
]
remove_samples_train = [
    elem
    for elem in remove_sample_list
    if elem < num_sample_train
]
remove_samples_val = [
    elem - num_sample_train
    for elem in remove_sample_list
    if num_sample_train <= elem < num_sample_train + num_sample_val
]

x_train = np.delete(x_train, remove_samples_train, 0)
y_train = np.delete(y_train, remove_samples_train, 0)
x_val = np.delete(x_val, remove_samples_val, 0)
y_val = np.delete(y_val, remove_samples_val, 0)

Declare the model


In [ ]:
model = gen_sequential((len_series, num_features), modified_param_dict)
model.summary()

Modeling for validation data


In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_split=validation_split,
                    callbacks=gen_callbacks(target_val, location))

Output and save results for the validation data


In [ ]:
y_pred = y_scaler.inverse_transform(model.predict(x_val).reshape(-1, 1))
y_true = y_scaler.inverse_transform(y_val.reshape(-1, 1))

index = pd.date_range(begin_val_datetime,
                      end_val_datetime + pd.Timedelta(-freq_minute, unit='m'),
                      freq=pd.offsets.Minute(freq_minute))
val_index = index.copy(deep=True)
for i in remove_samples_val:
    val_index = val_index.drop(index[len_series*i:len_series*(i+1)])

pd.DataFrame(
    y_pred, index=val_index, columns=[model_name,]
).to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              target_val, 
              location,
              "hdf5"])
)

print("MAE convert into every 30 in {y}: ".format(y=validation_year),
      3 * mean_absolute_error(y_true, y_pred))

Modeling for test data


In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=gen_callbacks("test", location))

Save results for the test data


In [ ]:
df_test = get_dataset("test", location)
x_test = x_scaler.transform(df_test.as_matrix())

y_pred = y_scaler.inverse_transform(
    model.predict(
        x_test.reshape(-1, len_series, num_features).astype('float32')
    ).reshape(-1, 1)
)
df_y_pred = pd.DataFrame(y_pred, index=df_test.index, columns=[model_name,])

df_y_pred.resample(
    **KWARGS_RESAMPLING
).sum().to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "hdf5"]),
)

In [ ]:


Yonekurayama

Setting constants


In [ ]:
location = "yonekurayama"

Load training data


In [ ]:
df_x, df_y = get_dataset("train", location)

Define constants


In [ ]:
len_series = 144
num_features = df_x.shape[1]
datetime_index = df_y.index.copy(deep=True)
is_null_y = df_y.isnull().copy(deep=True)

model_name = gen_model_name_from_param_dict(gru_params)
modified_param_dict = gen_modified_param_dict(gru_params)
modified_param_dict["name"] = model_name
target_val = "foldout_{y}".format(y=validation_year)

Min-max scaling over the whole series


In [ ]:
ndarray_x = df_x.as_matrix()
ndarray_y = df_y.as_matrix().reshape(-1, 1)
is_nan_y_train = np.isnan(ndarray_y.reshape(-1))

ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.

x_scaler = MinMaxScaler()
ndarray_x = x_scaler.fit_transform(ndarray_x)

y_scaler = DummyScaler()
ndarray_y = y_scaler.fit_transform(ndarray_y)

ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.

Split the dataset


In [ ]:
ndarray_x = ndarray_x.reshape(-1, len_series, num_features).astype('float32')
ndarray_y = ndarray_y.reshape(-1, len_series, 1)

begin_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year))
end_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year+1))
begin_val_index = int(datetime_index.get_loc(begin_val_datetime) / len_series)
val_end_index = int(datetime_index.get_loc(end_val_datetime) / len_series)

x_train, x_val = split_from_index(ndarray_x, begin_val_index, val_end_index)
y_train, y_val = split_from_index(ndarray_y, begin_val_index, val_end_index)

Remove missing samples


In [ ]:
num_sample_train = x_train.shape[0]
num_sample_val = x_val.shape[0]

remove_sample_list = [
    i for i in range(0, int(df_y.size/len_series))
    if is_null_y.iloc[np.arange(len_series*i, len_series*(i+1))].sum() == len_series
]
remove_samples_train = [
    elem
    for elem in remove_sample_list
    if elem < num_sample_train
]
remove_samples_val = [
    elem - num_sample_train
    for elem in remove_sample_list
    if num_sample_train <= elem < num_sample_train + num_sample_val
]

x_train = np.delete(x_train, remove_samples_train, 0)
y_train = np.delete(y_train, remove_samples_train, 0)
x_val = np.delete(x_val, remove_samples_val, 0)
y_val = np.delete(y_val, remove_samples_val, 0)

Declare the model


In [ ]:
model = gen_sequential((len_series, num_features), modified_param_dict)
model.summary()

Modeling for validation data


In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_split=validation_split,
                    callbacks=gen_callbacks(target_val, location))

Output and save results for the validation data


In [ ]:
y_pred = y_scaler.inverse_transform(model.predict(x_val).reshape(-1, 1))
y_true = y_scaler.inverse_transform(y_val.reshape(-1, 1))

index = pd.date_range(begin_val_datetime,
                      end_val_datetime + pd.Timedelta(-freq_minute, unit='m'),
                      freq=pd.offsets.Minute(freq_minute))
val_index = index.copy(deep=True)
for i in remove_samples_val:
    val_index = val_index.drop(index[len_series*i:len_series*(i+1)])

pd.DataFrame(
    y_pred, index=val_index, columns=[model_name,]
).to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              target_val, 
              location,
              "hdf5"])
)

print("MAE convert into every 30 in {y}: ".format(y=validation_year),
      3 * mean_absolute_error(y_true, y_pred))

Modeling for test data


In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=gen_callbacks("test", location))

Save results for the test data


In [ ]:
df_test = get_dataset("test", location)
x_test = x_scaler.transform(df_test.as_matrix())

y_pred = y_scaler.inverse_transform(
    model.predict(
        x_test.reshape(-1, len_series, num_features).astype('float32')
    ).reshape(-1, 1)
)
df_y_pred = pd.DataFrame(y_pred, index=df_test.index, columns=[model_name,])

df_y_pred.resample(
    **KWARGS_RESAMPLING
).sum().to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "hdf5"]),
)

In [ ]: