In [ ]:
import os
from os import path, pardir
import sys

In [ ]:
# Parent of the current working directory (left un-normalised, i.e. it ends in
# the ".." component); it is appended to sys.path in the next cell.
PROJECT_ROOT_DIRPATH = os.path.join(os.getcwd(), os.pardir)

In [ ]:
# Make the project root importable so the `src.*` imports further down resolve.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if PROJECT_ROOT_DIRPATH not in sys.path:
    sys.path.append(PROJECT_ROOT_DIRPATH)

In [ ]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [ ]:
from src.data.forecast import ForecastHandler
from src.features.dataset import DatasetHandler
from src.features.dummy import DummyFeatureHandler
from src.features.time_series import TimeSeriesReshaper

In [ ]:
# Keyword arguments presumably meant for `pd.read_csv` (tab separator, header
# on row 0, column 0 parsed as dates and used as the index).
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KWARGS_READ_CSV = dict(
    sep="\t",
    header=0,
    parse_dates=[0],
    index_col=0,
)

In [ ]:
# Column name given to the objective (target) variable in the merged dataset.
OBJECTIME_NAME = "kwh"

# Columns that get_train_test_X_y replaces by the output of
# `categ.convert_linear_to_circular`.
CIRCULAR_CATEGORICAL_VARIABLES = ("wv",)

# Extra columns dropped from the feature frame, in addition to
# `reshaper.DROP_LABEL_NAMES`.
additional_removal_columns = ["we",]

# Regex that splits the stringified index of train_kwh.tsv into named
# year/month/day/hour/minute groups. Written as a raw string so that `\d` is
# not treated as an (invalid) string escape sequence; the pattern itself is
# unchanged.
# NOTE(review): month/day accept 1-2 digits while hour/minute require exactly
# 2; a 12-digit stamp parses as YYYYMMDDHHMM - confirm the file never contains
# shorter stamps.
datetime_fmt = r"(?P<year>\d{4})(?P<month>\d{1,2})(?P<day>\d{1,2})(?P<hour>\d{2})(?P<minute>\d{2})"

# Locations for which a dataset is built.
LOCATIONS = (
    "ukishima",
    "ougishima",
    "yonekurayama",
)

# Location name -> column of train_kwh.tsv holding that location's objective.
loc_field_dict = {
    "ukishima": "SOLA01",
    "ougishima": "SOLA02",
    "yonekurayama": "SOLA03"
}

In [ ]:
# Module-level helper instances shared by get_train_test_X_y below.
# `forecast` reads and expands the forecast TSV.
forecast = ForecastHandler()
# `maker` provides the data base paths, merge kwargs and the train/test and
# X/y splitters; the objective column is registered as its y column.
maker = DatasetHandler(columns_y=[OBJECTIME_NAME, ])
# `categ` supplies the month/hour extraction and categorical conversions.
categ = DummyFeatureHandler()
# `reshaper` is only used here for its DROP_LABEL_NAMES attribute.
reshaper = TimeSeriesReshaper()

In [ ]:
# Path of the objective-variable file (read as tab-separated in
# get_train_test_X_y), located under the raw-data directory known to `maker`.
objectives_filepath = path.join(maker.RAW_DATA_BASEPATH, "train_kwh.tsv")

In [ ]:
def _expand_forecast(location):
    """Read the forecast file for ``location`` and expand it onto datetime ticks.

    Whole-day and time-ranged columns (as identified by the ``forecast``
    handler) are expanded by the handler and written into the ticked frame at
    the index of the expanded series. The original time-ranged columns are
    dropped afterwards.
    """
    df_forecast = forecast.read_tsv(forecast.gen_filepath(location))
    df_expanded = forecast.add_datetime_ticks(df_forecast)

    for column in forecast.get_whole_day_data_columns(df_forecast.columns):
        sr_expanded = forecast.expand_whole_day_data(df_forecast[column])
        df_expanded.loc[sr_expanded.index, column] = sr_expanded

    time_ranged_columns = forecast.get_time_ranged_data_columns(df_forecast.columns)
    for column in time_ranged_columns:
        sr_expanded = forecast.expand_time_ranged_data(df_forecast[column])
        # The expanded values are stored under the attribute name that the
        # handler extracts from the time-ranged column name.
        attribute = forecast.extract_attribute_from_time_ranged_column_name(column)
        df_expanded.loc[sr_expanded.index, attribute] = sr_expanded

    return df_expanded.drop(time_ranged_columns, axis=1)


def _add_categorical_features(df_features):
    """Return ``df_features`` extended with dummy and converted categorical columns.

    Adds ``we_*``, ``month_*`` and ``hour_*`` dummy columns, converts every
    column listed in ``categ.FORECAST_ATTRIBUTES`` through its correspondence
    dict, and replaces each column in ``CIRCULAR_CATEGORICAL_VARIABLES`` by the
    frame returned from ``categ.convert_linear_to_circular``.
    """
    df_weather = pd.get_dummies(df_features["we"], prefix="we")
    df_features = df_features.merge(df_weather, **maker.KWARGS_OUTER_MERGE)

    df_month = pd.get_dummies(categ.extract_month(df_features.index), prefix="month")
    df_features = df_features.merge(df_month, **maker.KWARGS_OUTER_MERGE)

    df_hour = pd.get_dummies(categ.extract_hour(df_features.index), prefix="hour")
    df_features = df_features.merge(df_hour, **maker.KWARGS_OUTER_MERGE)

    for col_name, correspond_dict in categ.FORECAST_ATTRIBUTES.items():
        df_features[col_name] = categ.convert_series_along_dict(
            df_features[col_name], correspond_dict
        )

    for col_name in CIRCULAR_CATEGORICAL_VARIABLES:
        df_cos_sin = categ.convert_linear_to_circular(
            df_features[col_name], len(categ.FORECAST_ATTRIBUTES[col_name])
        )
        df_features = df_features.merge(df_cos_sin, **maker.KWARGS_OUTER_MERGE)
        df_features = df_features.drop(col_name, axis=1)

    return df_features


def _load_objective(location):
    """Load the objective column for ``location`` as a one-column frame.

    Reads ``objectives_filepath`` (tab-separated), rebuilds the index as
    datetimes from the ``datetime_fmt`` groups of the stringified index, and
    returns the location's column renamed to ``OBJECTIME_NAME``.
    """
    df_kwh = pd.read_csv(objectives_filepath, sep="\t", index_col=[0])
    df_kwh.index = pd.to_datetime(
        pd.Series(df_kwh.index).apply(str).str.extract(datetime_fmt, expand=False)
    )
    return df_kwh[loc_field_dict[location]].to_frame(name=OBJECTIME_NAME)


def get_train_test_X_y(location):
    """Build the feature/objective dataset for ``location`` and write it to disk.

    Despite its name this function returns ``None``; its results are two
    tab-separated files under ``maker.PROCESSED_DATA_BASEPATH``:

    * ``dataset.train_X_y.every_10.<location>.tsv`` - features merged with the
      objective column.
    * ``dataset.test_X.every_10.<location>.tsv`` - the X part of the test split.

    Parameters
    ----------
    location : str
        A key of ``loc_field_dict`` (one of ``LOCATIONS``).
    """
    df_features = _add_categorical_features(_expand_forecast(location))

    # Running row counter over the expanded forecast index.
    df_features["past_time"] = np.arange(df_features.shape[0])

    df_features = df_features.drop(
        reshaper.DROP_LABEL_NAMES + additional_removal_columns, axis=1
    )

    df_X_y = df_features.merge(_load_objective(location), **maker.KWARGS_OUTER_MERGE)

    # Only the test split is needed below; the train split is discarded.
    _, df_test = maker.separate_train_test(df_X_y)
    df_test_X, _ = maker.separate_X_y(df_test)

    # NOTE(review): the "train_X_y" file is written from the full merged frame
    # (train and test rows), not from the train split. Kept as-is because
    # downstream consumers may rely on it - confirm whether this is intended.
    df_X_y.to_csv(
        path.join(maker.PROCESSED_DATA_BASEPATH, "dataset.train_X_y.every_10.{l}.tsv".format(l=location)),
        sep="\t"
    )
    df_test_X.to_csv(
        path.join(maker.PROCESSED_DATA_BASEPATH, "dataset.test_X.every_10.{l}.tsv".format(l=location)),
        sep="\t"
    )

## 実働部分 (Execution: build and save the datasets for every location)


In [ ]:
# Build and write the dataset TSV files for every configured location.
# get_train_test_X_y returns nothing; its output is the files it writes.
for location in LOCATIONS:
    get_train_test_X_y(location)

In [ ]: