In [ ]:
%matplotlib inline
In [ ]:
import os
from os import path, pardir
import sys
import re
In [ ]:
PROJECT_ROOT_DIRPATH = path.join(os.getcwd(), pardir)
In [ ]:
sys.path.append(PROJECT_ROOT_DIRPATH)
In [ ]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
In [ ]:
KWARGS_READ_CSV = {
    "sep": "\t",
    "header": 0,
    "parse_dates": [0],
    "index_col": 0
}
KWARGS_RESAMPLING = {
    "rule": "30T",
    "axis": 0,
    "closed": "right",
    "label": "right"
}
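In [ ]:
# Illustrative only (synthetic values, not project data): KWARGS_RESAMPLING
# aggregates a 10-minute series into right-closed, right-labelled 30-minute bins,
# the same convention used when exporting the test predictions below.
_demo = pd.Series(
    [1., 2., 3., 4., 5., 6.],
    index=pd.date_range("2015-01-01 00:10:00", periods=6, freq="10T"),
)
_demo.resample(**KWARGS_RESAMPLING).sum()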
In [ ]:
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from keras.layers import (
    BatchNormalization,
    Dense,
    Dropout,
    Input,
    GRU,
    Masking,
    TimeDistributed
)
from keras.optimizers import Adam
from keras.regularizers import l1_l2
In [ ]:
epochs = 10000
validation_split = 0.2
gru_params = {
    "units": 48,
    "batch_size": 256,
    "activation": 'sigmoid',
    "recurrent_activation": 'sigmoid',
    # "kernel_regularizer": {"l1": 0.01, "l2": 0.005},
    "dropout": 0.1,
    "recurrent_dropout": 0.1,
}
In [ ]:
freq_minute = 10
validation_year = 2015
In [ ]:
LOCATIONS = (
    "ukishima",
    "ougishima",
    "yonekurayama"
)
In [ ]:
TRAIN_TEST_FILEPATH_PREFIX = path.join(PROJECT_ROOT_DIRPATH, "data", "processed", "dataset")
TRAIN_TEST_FILEPATH_EXTENTION = "tsv"
interim_serialize_filename_prefix = path.join(
    PROJECT_ROOT_DIRPATH, "models", "gru.interim", "fit_model.layer0"
)
model_serialize_filename_prefix = path.join(
    PROJECT_ROOT_DIRPATH, "models", "gru", "fit_model.layer0"
)
predict_serialize_filename_prefix = path.join(
    PROJECT_ROOT_DIRPATH, "models", "gru", "predict.layer0"
)
os.makedirs(path.dirname(interim_serialize_filename_prefix), exist_ok=True)
os.makedirs(path.dirname(model_serialize_filename_prefix), exist_ok=True)
In [ ]:
def get_dataset(target,
                location,
                prefix=TRAIN_TEST_FILEPATH_PREFIX,
                suffix=TRAIN_TEST_FILEPATH_EXTENTION):
    # Training files contain the features plus the target in the last column;
    # test files contain features only.
    if target == "train":
        filepath = '.'.join([prefix, "train_X_y.every_10", location, suffix])
        df = pd.read_csv(filepath, **KWARGS_READ_CSV)
        return df.iloc[:, :-1], df.iloc[:, -1]
    elif target == "test":
        filepath = '.'.join([prefix, "test_X.every_10", location, suffix])
        return pd.read_csv(filepath, **KWARGS_READ_CSV)
In [ ]:
def gen_model_name_from_param_dict(param_dict):
    # Build a deterministic model name from the (sorted) hyperparameters,
    # flattening nested dicts such as regularizer settings.
    model_name = str()
    for k, v in sorted(param_dict.items()):
        if isinstance(v, dict):
            for k_inner, v_inner in v.items():
                model_name += "{ko}_{ki}_{v}.".format(ko=k, ki=k_inner, v=v_inner)
        else:
            model_name += "{k}_{v}.".format(k=k, v=v)
    return model_name[:-1]
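In [ ]:
# Quick check (illustrative): with the gru_params above, the sorted key/value
# pairs yield a name along the lines of
# "activation_sigmoid.batch_size_256.dropout_0.1. ... .units_48".
print(gen_model_name_from_param_dict(gru_params))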
In [ ]:
def gen_modified_param_dict(gru_params):
    # Replace any "*regularizer" entry (given as {"l1": ..., "l2": ...})
    # with an actual keras regularizer object; copy everything else as-is.
    modified_param_dict = dict()
    pt_regularizer = re.compile("regularizer")
    for k, v in gru_params.items():
        if pt_regularizer.search(k):
            modified_param_dict[k] = l1_l2(l1=v['l1'], l2=v['l2'])
        else:
            modified_param_dict[k] = v
    return modified_param_dict
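In [ ]:
# Illustrative only: keys matching "regularizer" are converted from plain dicts to
# keras regularizer objects; everything else is passed through unchanged.
# (The parameter dict below is a made-up example, not the gru_params used for training.)
gen_modified_param_dict({"units": 8, "kernel_regularizer": {"l1": 0.01, "l2": 0.005}})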
In [ ]:
def gen_sequential(input_shape, gru_params):
    # Time steps whose target was missing are filled with 1.0 upstream
    # and skipped here via Masking.
    inputs = Input(shape=input_shape)
    x = Masking(mask_value=1.)(inputs)
    x = BatchNormalization()(x)
    x = GRU(return_sequences=True, **gru_params)(x)
    # One linear output per time step.
    predictions = TimeDistributed(Dense(1, activation="linear"))(x)
    model = Model(inputs=inputs, outputs=predictions)
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss="mean_squared_error", optimizer=optimizer, metrics=["mae", ])
    return model
In [ ]:
def gen_callbacks(target, location):
    # Checkpoint filenames embed the epoch and (validation) MAE via Keras' formatting.
    serialize_path = '.'.join([interim_serialize_filename_prefix,
                               "ep_{epoch:04d}.mae_{mean_absolute_error:.2f}.val_mae_{val_mean_absolute_error:.2f}",
                               "{t}.{l}.hdf5".format(t=target, l=location)])
    # Keep the best model by validation MAE, snapshot every 100 epochs,
    # and stop early after 100 epochs without improvement.
    cp_best = ModelCheckpoint(filepath=serialize_path,
                              monitor='val_mean_absolute_error',
                              save_best_only=True,
                              mode='auto')
    cp_period = ModelCheckpoint(filepath=serialize_path, period=100)
    es = EarlyStopping(monitor='val_mean_absolute_error', min_delta=0, patience=100, mode='auto')
    return [cp_best, cp_period, es]
In [ ]:
def split_from_index(ndarray, begin_val_index, end_val_index):
    # Split along the sample axis: [:begin] -> training, [begin:end] -> validation.
    if ndarray.ndim == 2:
        return ndarray[:begin_val_index, :], ndarray[begin_val_index:end_val_index, :]
    elif ndarray.ndim == 3:
        return ndarray[:begin_val_index, :, :], ndarray[begin_val_index:end_val_index, :, :]
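In [ ]:
# Toy check of the split semantics (made-up array, not project data):
# samples [0, begin) go to training and [begin, end) to validation.
_toy = np.arange(20).reshape(10, 2)
_toy_train, _toy_val = split_from_index(_toy, 6, 8)
print(_toy_train.shape, _toy_val.shape)  # expected: (6, 2) (2, 2)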
In [ ]:
class DummyScaler(BaseEstimator, TransformerMixin):
    """Identity scaler exposing the same interface as the sklearn scalers."""

    def __init__(self, copy=True):
        self.copy = copy

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    def inverse_transform(self, X):
        return X
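In [ ]:
# DummyScaler leaves the target unscaled while keeping the fit_transform /
# inverse_transform interface of MinMaxScaler, so a real scaler could be swapped
# in for y later without touching the rest of the pipeline. Quick identity check:
_check = np.array([[0.], [2.5], [5.]])
assert np.array_equal(DummyScaler().fit_transform(_check), _check)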
In [ ]:
In [ ]:
location = "ukishima"
In [ ]:
df_x, df_y = get_dataset("train", location)
In [ ]:
len_series = 144  # one day of 10-minute steps per sample
num_features = df_x.shape[1]
datetime_index = df_y.index.copy(deep=True)
is_null_y = df_y.isnull().copy(deep=True)
model_name = gen_model_name_from_param_dict(gru_params)
modified_param_dict = gen_modified_param_dict(gru_params)
modified_param_dict["name"] = model_name
target_val = "foldout_{y}".format(y=validation_year)
In [ ]:
# Time steps with a missing target are filled with 1.0 (matching the
# Masking(mask_value=1.) layer in gen_sequential); features are min-max
# scaled, the target is left unscaled via DummyScaler.
ndarray_x = df_x.values
ndarray_y = df_y.values.reshape(-1, 1)
is_nan_y_train = np.isnan(ndarray_y.reshape(-1))
ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.
x_scaler = MinMaxScaler()
ndarray_x = x_scaler.fit_transform(ndarray_x)
y_scaler = DummyScaler()
ndarray_y = y_scaler.fit_transform(ndarray_y)
ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.
In [ ]:
ndarray_x = ndarray_x.reshape(-1, len_series, num_features).astype('float32')
ndarray_y = ndarray_y.reshape(-1, len_series, 1)
begin_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year))
end_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year+1))
begin_val_index = int(datetime_index.get_loc(begin_val_datetime) / len_series)
val_end_index = int(datetime_index.get_loc(end_val_datetime) / len_series)
x_train, x_val = split_from_index(ndarray_x, begin_val_index, val_end_index)
y_train, y_val = split_from_index(ndarray_y, begin_val_index, val_end_index)
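In [ ]:
# Sanity check (shapes depend on the dataset): each sample is one day of
# len_series = 144 ten-minute steps, and the 2015 days form the hold-out set.
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)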
In [ ]:
num_sample_train = x_train.shape[0]
num_sample_val = x_val.shape[0]
# Drop days whose target is entirely missing (they carry no training signal).
remove_sample_list = [
    i for i in range(0, int(df_y.size/len_series))
    if is_null_y.iloc[np.arange(len_series*i, len_series*(i+1))].sum() == len_series
]
remove_samples_train = [
    elem
    for elem in remove_sample_list
    if elem < num_sample_train
]
remove_samples_val = [
    elem - num_sample_train
    for elem in remove_sample_list
    if num_sample_train <= elem < num_sample_train + num_sample_val
]
x_train = np.delete(x_train, remove_samples_train, 0)
y_train = np.delete(y_train, remove_samples_train, 0)
x_val = np.delete(x_val, remove_samples_val, 0)
y_val = np.delete(y_val, remove_samples_val, 0)
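In [ ]:
# Diagnostic only: how many all-NaN days were dropped from each split.
print("removed from train:", len(remove_samples_train))
print("removed from validation:", len(remove_samples_val))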
In [ ]:
model = gen_sequential((len_series, num_features), modified_param_dict)
model.summary()
In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_split=validation_split,
                    callbacks=gen_callbacks(target_val, location))
In [ ]:
# Predict the held-out year, rebuild its 10-minute index (dropping the all-NaN
# days removed above), then save the predictions and the fitted model.
y_pred = y_scaler.inverse_transform(model.predict(x_val).reshape(-1, 1))
y_true = y_scaler.inverse_transform(y_val.reshape(-1, 1))
index = pd.date_range(begin_val_datetime,
                      end_val_datetime + pd.Timedelta(-freq_minute, unit='m'),
                      freq=pd.offsets.Minute(freq_minute))
val_index = index.copy(deep=True)
for i in remove_samples_val:
    val_index = val_index.drop(index[len_series*i:len_series*(i+1)])
pd.DataFrame(
    y_pred, index=val_index, columns=[model_name,]
).to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "hdf5"])
)
# Scale the 10-minute MAE by 3 as a rough proxy for the 30-minute (summed) error.
print("MAE converted to 30-minute totals in {y}: ".format(y=validation_year),
      3 * mean_absolute_error(y_true, y_pred))
In [ ]:
# Continue training the same model, now validating on the full hold-out year,
# before producing the test-period predictions below.
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=gen_callbacks("test", location))
In [ ]:
# Predict the test period, resample the 10-minute predictions to 30-minute sums,
# and save both the predictions and the model.
df_test = get_dataset("test", location)
x_test = x_scaler.transform(df_test.values)
y_pred = y_scaler.inverse_transform(
    model.predict(
        x_test.reshape(-1, len_series, num_features).astype('float32')
    ).reshape(-1, 1)
)
df_y_pred = pd.DataFrame(y_pred, index=df_test.index, columns=[model_name,])
df_y_pred.resample(
    **KWARGS_RESAMPLING
).sum().to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "hdf5"]),
)
In [ ]:
In [ ]:
location = "ougishima"
In [ ]:
df_x, df_y = get_dataset("train", location)
In [ ]:
len_series = 144
num_features = df_x.shape[1]
datetime_index = df_y.index.copy(deep=True)
is_null_y = df_y.isnull().copy(deep=True)
model_name = gen_model_name_from_param_dict(gru_params)
modified_param_dict = gen_modified_param_dict(gru_params)
modified_param_dict["name"] = model_name
target_val = "foldout_{y}".format(y=validation_year)
In [ ]:
ndarray_x = df_x.values
ndarray_y = df_y.values.reshape(-1, 1)
is_nan_y_train = np.isnan(ndarray_y.reshape(-1))
ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.
x_scaler = MinMaxScaler()
ndarray_x = x_scaler.fit_transform(ndarray_x)
y_scaler = DummyScaler()
ndarray_y = y_scaler.fit_transform(ndarray_y)
ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.
In [ ]:
ndarray_x = ndarray_x.reshape(-1, len_series, num_features).astype('float32')
ndarray_y = ndarray_y.reshape(-1, len_series, 1)
begin_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year))
end_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year+1))
begin_val_index = int(datetime_index.get_loc(begin_val_datetime) / len_series)
val_end_index = int(datetime_index.get_loc(end_val_datetime) / len_series)
x_train, x_val = split_from_index(ndarray_x, begin_val_index, val_end_index)
y_train, y_val = split_from_index(ndarray_y, begin_val_index, val_end_index)
In [ ]:
num_sample_train = x_train.shape[0]
num_sample_val = x_val.shape[0]
remove_sample_list = [
    i for i in range(0, int(df_y.size/len_series))
    if is_null_y.iloc[np.arange(len_series*i, len_series*(i+1))].sum() == len_series
]
remove_samples_train = [
    elem
    for elem in remove_sample_list
    if elem < num_sample_train
]
remove_samples_val = [
    elem - num_sample_train
    for elem in remove_sample_list
    if num_sample_train <= elem < num_sample_train + num_sample_val
]
x_train = np.delete(x_train, remove_samples_train, 0)
y_train = np.delete(y_train, remove_samples_train, 0)
x_val = np.delete(x_val, remove_samples_val, 0)
y_val = np.delete(y_val, remove_samples_val, 0)
In [ ]:
model = gen_sequential((len_series, num_features), modified_param_dict)
model.summary()
In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_split=validation_split,
                    callbacks=gen_callbacks(target_val, location))
In [ ]:
y_pred = y_scaler.inverse_transform(model.predict(x_val).reshape(-1, 1))
y_true = y_scaler.inverse_transform(y_val.reshape(-1, 1))
index = pd.date_range(begin_val_datetime,
                      end_val_datetime + pd.Timedelta(-freq_minute, unit='m'),
                      freq=pd.offsets.Minute(freq_minute))
val_index = index.copy(deep=True)
for i in remove_samples_val:
    val_index = val_index.drop(index[len_series*i:len_series*(i+1)])
pd.DataFrame(
    y_pred, index=val_index, columns=[model_name,]
).to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "hdf5"])
)
print("MAE converted to 30-minute totals in {y}: ".format(y=validation_year),
      3 * mean_absolute_error(y_true, y_pred))
In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=gen_callbacks("test", location))
In [ ]:
df_test = get_dataset("test", location)
x_test = x_scaler.transform(df_test.values)
y_pred = y_scaler.inverse_transform(
    model.predict(
        x_test.reshape(-1, len_series, num_features).astype('float32')
    ).reshape(-1, 1)
)
df_y_pred = pd.DataFrame(y_pred, index=df_test.index, columns=[model_name,])
df_y_pred.resample(
    **KWARGS_RESAMPLING
).sum().to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "hdf5"]),
)
In [ ]:
In [ ]:
location = "yonekurayama"
In [ ]:
df_x, df_y = get_dataset("train", location)
In [ ]:
len_series = 144
num_features = df_x.shape[1]
datetime_index = df_y.index.copy(deep=True)
is_null_y = df_y.isnull().copy(deep=True)
model_name = gen_model_name_from_param_dict(gru_params)
modified_param_dict = gen_modified_param_dict(gru_params)
modified_param_dict["name"] = model_name
target_val = "foldout_{y}".format(y=validation_year)
In [ ]:
ndarray_x = df_x.values
ndarray_y = df_y.values.reshape(-1, 1)
is_nan_y_train = np.isnan(ndarray_y.reshape(-1))
ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.
x_scaler = MinMaxScaler()
ndarray_x = x_scaler.fit_transform(ndarray_x)
y_scaler = DummyScaler()
ndarray_y = y_scaler.fit_transform(ndarray_y)
ndarray_x[is_nan_y_train, :] = 1.
ndarray_y[is_nan_y_train, :] = 1.
In [ ]:
ndarray_x = ndarray_x.reshape(-1, len_series, num_features).astype('float32')
ndarray_y = ndarray_y.reshape(-1, len_series, 1)
begin_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year))
end_val_datetime = pd.to_datetime("{y}-01-01 00:10:00".format(y=validation_year+1))
begin_val_index = int(datetime_index.get_loc(begin_val_datetime) / len_series)
val_end_index = int(datetime_index.get_loc(end_val_datetime) / len_series)
x_train, x_val = split_from_index(ndarray_x, begin_val_index, val_end_index)
y_train, y_val = split_from_index(ndarray_y, begin_val_index, val_end_index)
In [ ]:
num_sample_train = x_train.shape[0]
num_sample_val = x_val.shape[0]
remove_sample_list = [
    i for i in range(0, int(df_y.size/len_series))
    if is_null_y.iloc[np.arange(len_series*i, len_series*(i+1))].sum() == len_series
]
remove_samples_train = [
    elem
    for elem in remove_sample_list
    if elem < num_sample_train
]
remove_samples_val = [
    elem - num_sample_train
    for elem in remove_sample_list
    if num_sample_train <= elem < num_sample_train + num_sample_val
]
x_train = np.delete(x_train, remove_samples_train, 0)
y_train = np.delete(y_train, remove_samples_train, 0)
x_val = np.delete(x_val, remove_samples_val, 0)
y_val = np.delete(y_val, remove_samples_val, 0)
In [ ]:
model = gen_sequential((len_series, num_features), modified_param_dict)
model.summary()
In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_split=validation_split,
                    callbacks=gen_callbacks(target_val, location))
In [ ]:
y_pred = y_scaler.inverse_transform(model.predict(x_val).reshape(-1, 1))
y_true = y_scaler.inverse_transform(y_val.reshape(-1, 1))
index = pd.date_range(begin_val_datetime,
                      end_val_datetime + pd.Timedelta(-freq_minute, unit='m'),
                      freq=pd.offsets.Minute(freq_minute))
val_index = index.copy(deep=True)
for i in remove_samples_val:
    val_index = val_index.drop(index[len_series*i:len_series*(i+1)])
pd.DataFrame(
    y_pred, index=val_index, columns=[model_name,]
).to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              target_val,
              location,
              "hdf5"])
)
print("MAE converted to 30-minute totals in {y}: ".format(y=validation_year),
      3 * mean_absolute_error(y_true, y_pred))
In [ ]:
history = model.fit(x_train,
                    y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=gen_callbacks("test", location))
In [ ]:
df_test = get_dataset("test", location)
x_test = x_scaler.transform(df_test.values)
y_pred = y_scaler.inverse_transform(
    model.predict(
        x_test.reshape(-1, len_series, num_features).astype('float32')
    ).reshape(-1, 1)
)
df_y_pred = pd.DataFrame(y_pred, index=df_test.index, columns=[model_name,])
df_y_pred.resample(
    **KWARGS_RESAMPLING
).sum().to_csv(
    '.'.join([predict_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "tsv"]),
    sep="\t"
)
model.save(
    '.'.join([model_serialize_filename_prefix,
              model_name,
              "test",
              location,
              "hdf5"]),
)
In [ ]: