In [ ]:
import os
from os import path, pardir
import sys
In [ ]:
# Parent of the current working directory, i.e. the directory that is later
# appended to ``sys.path``.
PROJECT_ROOT_DIRPATH = os.path.join(os.getcwd(), os.pardir)
In [ ]:
# Make the project root importable so the ``src`` package resolves.  The
# membership check keeps ``sys.path`` free of duplicate entries when this
# notebook cell is executed more than once.
if PROJECT_ROOT_DIRPATH not in sys.path:
    sys.path.append(PROJECT_ROOT_DIRPATH)
In [ ]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
In [ ]:
from src.data.forecast import ForecastHandler
from src.features.dataset import DatasetHandler
from src.features.dummy import DummyFeatureHandler
from src.features.time_series import TimeSeriesReshaper
In [ ]:
# Keyword arguments for ``pd.read_csv``: tab-separated input, first row as
# header, first column parsed as dates and used as the index.
KWARGS_READ_CSV = dict(
    sep="\t",
    header=0,
    parse_dates=[0],
    index_col=0,
)
In [ ]:
# Column name given to the objective (target) variable in the merged dataset.
# NOTE(review): the identifier looks like a typo for "OBJECTIVE_NAME"; it is
# kept as-is because other cells reference it.
OBJECTIME_NAME = "kwh"

# Categorical forecast columns that are re-encoded via
# ``categ.convert_linear_to_circular`` and then dropped.
CIRCULAR_CATEGORICAL_VARIABLES = ("wv",)

# Columns dropped from the feature table in addition to
# ``reshaper.DROP_LABEL_NAMES``.
additional_removal_columns = ["we"]

# Regex that splits the stringified index of the objectives file into named
# year/month/day/hour/minute groups.  Written as a raw string so that ``\d``
# is not treated as an (invalid) string escape sequence.
datetime_fmt = (
    r"(?P<year>\d{4})(?P<month>\d{1,2})(?P<day>\d{1,2})"
    r"(?P<hour>\d{2})(?P<minute>\d{2})"
)

# Locations for which datasets are generated.
LOCATIONS = (
    "ukishima",
    "ougishima",
    "yonekurayama",
)

# Location name -> column of the objectives file holding that location's values.
loc_field_dict = {
    "ukishima": "SOLA01",
    "ougishima": "SOLA02",
    "yonekurayama": "SOLA03",
}
In [ ]:
# Shared helper instances used by ``get_train_test_X_y`` below.
# Reads the forecast files and expands their columns.
forecast = ForecastHandler()
# Provides data base paths, merge kwargs and the train/test and X/y splits;
# the objective column is registered as the y column.
maker = DatasetHandler(columns_y=[OBJECTIME_NAME, ])
# Extracts month/hour features and converts categorical forecast attributes.
categ = DummyFeatureHandler()
# Only its ``DROP_LABEL_NAMES`` attribute is used in this notebook section.
reshaper = TimeSeriesReshaper()
In [ ]:
# Location of the objectives TSV inside the raw-data directory.
objectives_filepath = os.path.join(maker.RAW_DATA_BASEPATH, "train_kwh.tsv")
In [ ]:
def get_train_test_X_y(location):
    """Build the feature/target table for one location and save it as TSV.

    The forecast table of ``location`` is expanded onto the datetime ticks
    produced by ``forecast.add_datetime_ticks``, enriched with dummy and
    circular features, merged with the objective column read from
    ``objectives_filepath`` and written to ``maker.PROCESSED_DATA_BASEPATH``:

    * ``dataset.train_X_y.every_10.<location>.tsv`` -- the merged table
    * ``dataset.test_X.every_10.<location>.tsv`` -- X part of the test split

    The function relies on the module-level helpers ``forecast``, ``maker``,
    ``categ`` and ``reshaper`` and on the module-level constants.

    Args:
        location: Location name; must be a key of ``loc_field_dict`` and be
            accepted by ``forecast.gen_filepath``.

    Returns:
        None. The results are only written to disk.
    """
    df_forecast = forecast.read_tsv(forecast.gen_filepath(location))
    df_forecast_expanded = forecast.add_datetime_ticks(df_forecast)

    # Spread the whole-day columns over the expanded index.
    whole_day_data_name_list = forecast.get_whole_day_data_columns(df_forecast.columns)
    for whole_day_data_name in whole_day_data_name_list:
        sr_expand_whole_day_data = forecast.expand_whole_day_data(
            df_forecast[whole_day_data_name]
        )
        df_forecast_expanded.loc[
            sr_expand_whole_day_data.index, whole_day_data_name
        ] = sr_expand_whole_day_data

    # Spread the time-ranged columns; the values land in a column named after
    # the attribute extracted from the original column name.
    time_ranged_data_name_list = forecast.get_time_ranged_data_columns(df_forecast.columns)
    for time_ranged_data_name in time_ranged_data_name_list:
        sr_expand_time_ranged_data = forecast.expand_time_ranged_data(
            df_forecast[time_ranged_data_name]
        )
        attribute_name = forecast.extract_attribute_from_time_ranged_column_name(
            time_ranged_data_name
        )
        df_forecast_expanded.loc[
            sr_expand_time_ranged_data.index, attribute_name
        ] = sr_expand_time_ranged_data
    # The original time-ranged columns are no longer needed once expanded.
    df_forecast_expanded.drop(time_ranged_data_name_list, axis=1, inplace=True)

    # Dummy-encode the "we" column. This must happen before the conversion
    # of the FORECAST_ATTRIBUTES columns below, as in the original cell order.
    df_weather = pd.get_dummies(df_forecast_expanded["we"], prefix="we")
    df_forecast_expanded = df_forecast_expanded.merge(df_weather, **maker.KWARGS_OUTER_MERGE)

    # Dummy-encode the month derived from the index.
    df_month = categ.extract_month(df_forecast_expanded.index)
    df_month = pd.get_dummies(df_month, prefix="month")
    df_forecast_expanded = df_forecast_expanded.merge(df_month, **maker.KWARGS_OUTER_MERGE)

    # Dummy-encode the hour derived from the index.
    df_hour = categ.extract_hour(df_forecast_expanded.index)
    df_hour = pd.get_dummies(df_hour, prefix="hour")
    df_forecast_expanded = df_forecast_expanded.merge(df_hour, **maker.KWARGS_OUTER_MERGE)

    # Map the categorical forecast attributes through their lookup dicts.
    for col_name, correspond_dict in categ.FORECAST_ATTRIBUTES.items():
        df_forecast_expanded[col_name] = categ.convert_series_along_dict(
            df_forecast_expanded[col_name], correspond_dict
        )

    # Replace each circular categorical column with its circular encoding.
    for col_name in CIRCULAR_CATEGORICAL_VARIABLES:
        df_temp_cos_sin = categ.convert_linear_to_circular(
            df_forecast_expanded[col_name], len(categ.FORECAST_ATTRIBUTES[col_name])
        )
        df_forecast_expanded = df_forecast_expanded.merge(
            df_temp_cos_sin, **maker.KWARGS_OUTER_MERGE
        )
        df_forecast_expanded.drop(col_name, axis=1, inplace=True)

    # Running row counter 0..n-1 in the table's current row order.
    df_forecast_expanded["past_time"] = np.arange(df_forecast_expanded.shape[0])

    drop_col_name_list = reshaper.DROP_LABEL_NAMES + additional_removal_columns
    df_forecast_expanded.drop(drop_col_name_list, axis=1, inplace=True)

    # Load the objectives and rebuild a datetime index from the stringified
    # index values using the named groups of ``datetime_fmt``.
    df_kwh = pd.read_csv(objectives_filepath, sep="\t", index_col=[0])
    df_kwh.index = pd.to_datetime(
        pd.Series(df_kwh.index).apply(str).str.extract(datetime_fmt, expand=False)
    )
    df_y = df_kwh[loc_field_dict[location]].to_frame(name=OBJECTIME_NAME)

    df_X_y = df_forecast_expanded.merge(df_y, **maker.KWARGS_OUTER_MERGE)

    # Only the test split is used below; the train split is discarded.
    _, df_test = maker.separate_train_test(df_X_y)
    df_test, _ = maker.separate_X_y(df_test)

    # NOTE(review): the "train" file receives the full merged table, not the
    # train split returned by ``separate_train_test`` -- confirm that this is
    # intended before changing it, since downstream code may rely on it.
    df_X_y.to_csv(
        path.join(
            maker.PROCESSED_DATA_BASEPATH,
            "dataset.train_X_y.every_10.{l}.tsv".format(l=location),
        ),
        sep="\t"
    )
    df_test.to_csv(
        path.join(
            maker.PROCESSED_DATA_BASEPATH,
            "dataset.test_X.every_10.{l}.tsv".format(l=location),
        ),
        sep="\t"
    )
In [ ]:
# Generate and save the dataset files for every configured location.
for location in LOCATIONS:
    get_train_test_X_y(location)
In [ ]: