In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from scipy.stats.mstats import zscore
import time
from datetime import date

In [2]:
with open('holidays.pickle', 'rb') as f:
    nerc6 = pickle.load(f)

def isHoliday(holiday, df):
    # New years, memorial, independence, labor day, Thanksgiving, Christmas
    m1 = None
    if holiday == "New Year's Day":
        m1 = (df["dates"].dt.month == 1) & (df["dates"].dt.day == 1)
    if holiday == "Independence Day":
        m1 = (df["dates"].dt.month == 7) & (df["dates"].dt.day == 4)
    if holiday == "Christmas Day":
        m1 = (df["dates"].dt.month == 12) & (df["dates"].dt.day == 25)
    m1 = df["dates"].dt.date.isin(nerc6[holiday]) if m1 is None else m1
    m2 = df["dates"].dt.date.isin(nerc6.get(holiday + " (Observed)", []))
    return m1 | m2

def makeUsefulDf(df):
    """
    Turn a dataframe of datetime and load data into a dataframe useful for
    machine learning. Normalize values and turn 
    Features are placed into r_df (return dataframe), creates the following columns

        YEARS SINCE 2000

        LOAD AT THIS TIME DAY BEFORE

        HOUR OF DAY
        - is12AM (0, 1)
        - is1AM (0, 1)
        ...
        - is11PM (0, 1)

        DAYS OF THE WEEK
        - isSunday (0, 1)
        - isMonday (0, 1)
        ...
        - isSaturday (0, 1)

        MONTHS OF THE YEAR
        - isJanuary (0, 1)
        - isFebruary (0, 1)
        ...
        - isDecember (0, 1)

        TEMPERATURE
        - Celcius (normalized from -1 to 1)

        PREVIOUS DAY'S LOAD 
        - 12AM of day previous (normalized from -1 to 1)
        - 1AM of day previous (normalized from -1 to 1)
        ...
        - 11PM of day previous (normalized from -1 to 1)

        HOLIDAYS (the nerc6 holidays)
        - isNewYears (0, 1)
        - isMemorialDay (0, 1)
        ...
        - is Christmas (0, 1)

    """

    def _normalizeCol(l):
        #s = l.max() - l.min()
        #return l if s == 0 else (l - l.mean()) / l.std()
        return zscore(l)
    def _chunks(l, n):
        return [l[i : i + n] for i in range(0, len(l), n)]

    r_df = pd.DataFrame()
    r_df["load_n"] = _normalizeCol(df["load"])
    r_df["years_n"] = _normalizeCol(df["dates"].dt.year - 2000)

    # fix outliers
    m = df["tempc"].replace([-9999], np.nan)
    m.ffill(inplace=True)
    # 2.5 degrees average std error for the national weather service
    temp_noise = m
    r_df["temp_n"] = _normalizeCol(temp_noise)
    r_df['temp_n^2'] = r_df["temp_n"]**2

    # add the value of the load 24hrs before
    r_df["load_prev_n"] = r_df["load_n"].shift(24)
    r_df["load_prev_n"].bfill(inplace=True)

    # create day of week vector
    r_df["day"] = df["dates"].dt.dayofweek  # 0 is Monday.
    w = ["S", "M", "T", "W", "R", "F", "A"]
    for i, d in enumerate(w):
        r_df[d] = (r_df["day"] == i).astype(int)

        # create hour of day vector
    r_df["hour"] = df["dates"].dt.hour
    d = [("h" + str(i)) for i in range(24)]
    for i, h in enumerate(d):
        r_df[h] = (r_df["hour"] == i).astype(int)

        # create month vector
    r_df["month"] = df["dates"].dt.month
    y = [("m" + str(i)) for i in range(12)]
    for i, m in enumerate(y):
        r_df[m] = (r_df["month"] == i).astype(int)

        # create 'load day before' vector
    n = np.array([val for val in _chunks(list(r_df["load_n"]), 24) for _ in range(24)])
    l = ["l" + str(i) for i in range(24)]
    for i, s in enumerate(l):
        r_df[s] = n[:, i]

        # create holiday booleans
    r_df["isNewYears"] = isHoliday("New Year's Day", df)
    r_df["isMemorialDay"] = isHoliday("Memorial Day", df)
    r_df["isIndependenceDay"] = isHoliday("Independence Day", df)
    r_df["isLaborDay"] = isHoliday("Labor Day", df)
    r_df["isThanksgiving"] = isHoliday("Thanksgiving", df)
    r_df["isChristmas"] = isHoliday("Christmas Day", df)

    m = r_df.drop(["month", "hour", "day", "load_n"], axis=1)
    return m

def neural_net_predictions(all_X, all_y):
    import tensorflow as tf
    from tensorflow.keras import layers
    tic = time.time()
    X_train, y_train = all_X[:-8760], all_y[:-8760]

    model = tf.keras.Sequential([
        layers.Dense(all_X.shape[1], activation=tf.nn.relu, input_shape=[len(X_train.keys())]),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(1)
      ])

    optimizer = tf.keras.optimizers.RMSprop(0.001)

    model.compile(
        loss="mean_squared_error",
        optimizer=optimizer,
        metrics=["mean_absolute_error", "mean_squared_error"],
    )

    EPOCHS = 100

    early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)

    history = model.fit(
        X_train,
        y_train,
        epochs=EPOCHS,
        validation_split=0.2,
        verbose=0,
        callbacks=[early_stop],
    )
    
    def MAPE(predictions, answers):
        # THIS IS PROBLEMATIC IF Y is EVER 0
        assert len(predictions) == len(answers)
        return sum([abs(x-y)/(y+1e-5) for x, y in zip(predictions, answers)])/len(answers)*100   
    
    predictions = [float(f) for f in model.predict(all_X[-8760:])]
    train = [float(f) for f in model.predict(all_X[:-8760])]
    accuracy = {
        'test': MAPE(predictions, all_y[-8760:]),
        'train': MAPE(train, all_y[:-8760])
    }
    
    return predictions, accuracy, time.time() - tic

In [ ]:
for f in os.listdir('data/test'):
    if f.endswith('.csv'):
        df = pd.read_csv('data/test/' + f, parse_dates=['dates'])
        all_X = makeUsefulDf(df)
        all_y = df['load']
        predictions, accuracy, t = neural_net_predictions(all_X, all_y)
        print(f, accuracy, t)


SCENT.csv {'test': 1.681302700166577, 'train': 1.7087428784470091} 269.84409403800964
NCENT.csv {'test': 0.8791314198772925, 'train': 0.7945583441610236} 297.8987672328949
COAST.csv {'test': 1.5552924629527987, 'train': 1.7896520071636384} 224.67304968833923
FWEST.csv {'test': 2.514862726510828, 'train': 1.2584149693165272} 253.5422010421753
EAST.csv {'test': 0.9309198841164392, 'train': 0.8214532386860942} 374.4927089214325
SOUTH.csv {'test': 2.245935020968539, 'train': 2.075756950161136} 193.11839532852173
NORTH.csv {'test': 0.8792736540018945, 'train': 0.8475251789404536} 426.1031939983368

In [ ]: