notebook.community

Edit and run



In [50]:

    
import pandas as pd
import numpy as np
import os
import pickle
from scipy.stats.mstats import zscore
import time
from datetime import date



In [2]:

    
"""
What do I need to test:
train accuracy
test accuracy

What am I looking to change
- leaky_relu, relu, tanh, etc...
- number of layers
- number of nodes

Secondary:
- What about the SPP? Ignore for now
- How does model quality change when given fewer years?


----------------
First decide best structure:
layers [1, 5, 10] and nodes [36, 50, 72, 100]
2D matrix with train/test harmonic mean
"""









    Out[2]:





'\nWhat do I need to test:\ntrain accuracy\ntest accuracy\n\nWhat am I looking to change\n- leaky_relu, relu, tanh, etc...\n- number of layers\n- range or std\n- number of nodes\n- \n\nSecondary:\n- What about the SPP? Ignore for now\n- How does model quality change when given fewer years?\n\n\n----------------\nFirst decide best structure:\nlayers [1, 5, 10] and nodes [36, 50, 72, 100]\n2D matrix with train/test harmonic mean\n'



In [73]:

    
with open('holidays.pickle', 'rb') as f:
    nerc6 = pickle.load(f)

def isHoliday(holiday, df):
    # New years, memorial, independence, labor day, Thanksgiving, Christmas
    m1 = None
    if holiday == "New Year's Day":
        m1 = (df["dates"].dt.month == 1) & (df["dates"].dt.day == 1)
    if holiday == "Independence Day":
        m1 = (df["dates"].dt.month == 7) & (df["dates"].dt.day == 4)
    if holiday == "Christmas Day":
        m1 = (df["dates"].dt.month == 12) & (df["dates"].dt.day == 25)
    m1 = df["dates"].dt.date.isin(nerc6[holiday]) if m1 is None else m1
    m2 = df["dates"].dt.date.isin(nerc6.get(holiday + " (Observed)", []))
    return m1 | m2

def makeUsefulDf(df):
    """
    Turn a dataframe of datetime and load data into a dataframe useful for
    machine learning. Normalize values and turn 
    Features are placed into r_df (return dataframe), creates the following columns

        YEARS SINCE 2000

        LOAD AT THIS TIME DAY BEFORE

        HOUR OF DAY
        - is12AM (0, 1)
        - is1AM (0, 1)
        ...
        - is11PM (0, 1)

        DAYS OF THE WEEK
        - isSunday (0, 1)
        - isMonday (0, 1)
        ...
        - isSaturday (0, 1)

        MONTHS OF THE YEAR
        - isJanuary (0, 1)
        - isFebruary (0, 1)
        ...
        - isDecember (0, 1)

        TEMPERATURE
        - Celcius (normalized from -1 to 1)

        PREVIOUS DAY'S LOAD 
        - 12AM of day previous (normalized from -1 to 1)
        - 1AM of day previous (normalized from -1 to 1)
        ...
        - 11PM of day previous (normalized from -1 to 1)

        HOLIDAYS (the nerc6 holidays)
        - isNewYears (0, 1)
        - isMemorialDay (0, 1)
        ...
        - is Christmas (0, 1)

    """

    def _normalizeCol(l):
        #s = l.max() - l.min()
        #return l if s == 0 else (l - l.mean()) / l.std()
        return zscore(l)
    def _chunks(l, n):
        return [l[i : i + n] for i in range(0, len(l), n)]

    r_df = pd.DataFrame()
    r_df["load_n"] = _normalizeCol(df["load"])
    r_df["years_n"] = _normalizeCol(df["dates"].dt.year - 2000)

    # fix outliers
    m = df["tempc"].replace([-9999], np.nan)
    m.ffill(inplace=True)
    # 2.5 degrees average std error for the national weather service
    temp_noise = m
    r_df["temp_n"] = _normalizeCol(temp_noise)
    r_df['temp_n^2'] = r_df["temp_n"]**2

    # add the value of the load 24hrs before
    r_df["load_prev_n"] = r_df["load_n"].shift(24)
    r_df["load_prev_n"].bfill(inplace=True)

    # create day of week vector
    r_df["day"] = df["dates"].dt.dayofweek  # 0 is Monday.
    w = ["S", "M", "T", "W", "R", "F", "A"]
    for i, d in enumerate(w):
        r_df[d] = (r_df["day"] == i).astype(int)

        # create hour of day vector
    r_df["hour"] = df["dates"].dt.hour
    d = [("h" + str(i)) for i in range(24)]
    for i, h in enumerate(d):
        r_df[h] = (r_df["hour"] == i).astype(int)

        # create month vector
    r_df["month"] = df["dates"].dt.month
    y = [("m" + str(i)) for i in range(12)]
    for i, m in enumerate(y):
        r_df[m] = (r_df["month"] == i).astype(int)

        # create 'load day before' vector
    n = np.array([val for val in _chunks(list(r_df["load_n"]), 24) for _ in range(24)])
    l = ["l" + str(i) for i in range(24)]
    for i, s in enumerate(l):
        r_df[s] = n[:, i]

        # create holiday booleans
    r_df["isNewYears"] = isHoliday("New Year's Day", df)
    r_df["isMemorialDay"] = isHoliday("Memorial Day", df)
    r_df["isIndependenceDay"] = isHoliday("Independence Day", df)
    r_df["isLaborDay"] = isHoliday("Labor Day", df)
    r_df["isThanksgiving"] = isHoliday("Thanksgiving", df)
    r_df["isChristmas"] = isHoliday("Christmas Day", df)

    m = r_df.drop(["month", "hour", "day", "load_n"], axis=1)
    return m

def neural_net_predictions(all_X, all_y):
    import tensorflow as tf
    from tensorflow.keras import layers
    tic = time.time()
    X_train, y_train = all_X[:-8760], all_y[:-8760]

    model = tf.keras.Sequential([
        layers.Dense(all_X.shape[1], activation=tf.nn.relu, input_shape=[len(X_train.keys())]),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(1)
      ])

    optimizer = tf.keras.optimizers.RMSprop(0.001)

    model.compile(
        loss="mean_squared_error",
        optimizer=optimizer,
        metrics=["mean_absolute_error", "mean_squared_error"],
    )

    EPOCHS = 100

    early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)

    history = model.fit(
        X_train,
        y_train,
        epochs=EPOCHS,
        validation_split=0.2,
        verbose=0,
        callbacks=[early_stop],
    )
    
    def MAPE(predictions, answers):
        # THIS IS PROBLEMATIC IF Y is EVER 0
        assert len(predictions) == len(answers)
        return sum([abs(x-y)/(y+1e-5) for x, y in zip(predictions, answers)])/len(answers)*100   
    
    predictions = [float(f) for f in model.predict(all_X[-8760:])]
    train = [float(f) for f in model.predict(all_X[:-8760])]
    accuracy = {
        'test': MAPE(predictions, all_y[-8760:]),
        'train': MAPE(train, all_y[:-8760])
    }
    
    return predictions, accuracy, time.time() - tic



In [74]:

    
# results = {}
# df_dict = {}
# df_dict['test3'] = {}

# for file in os.listdir('data/test'):
#     if not file.startswith('.'):
#         region = file[:-4]
#         df = pd.read_csv('data/test/' + file, parse_dates=['dates'])
#         all_X = makeUsefulDf(df)
#         all_y = df['load']
#         predictions, accuracy = neural_net_predictions(all_X, all_y)
#         results[region] = accuracy
#         df_dict['test3'][region] = (2/((1/accuracy['test']) + (1/accuracy['train'])))
#         print(accuracy, df_dict)



In [75]:

    
df = pd.read_csv('data/test/NCENT.csv', parse_dates=['dates'])
all_X = makeUsefulDf(df)
all_y = df['load']
predictions, accuracy, t = neural_net_predictions(all_X, all_y)
print(accuracy, t)









    



{'test': 0.854672380569408, 'train': 1.273000491187482} 562.1585657596588



In [76]:

    
df = pd.read_csv('data/test/NCENT.csv', parse_dates=['dates'])
all_X = makeUsefulDf(df)
all_y = df['load']
predictions, accuracy, t = neural_net_predictions(all_X, all_y)
print(accuracy, t)









    



{'test': 0.7357434840690646, 'train': 0.6403265775499171} 1073.6106476783752



In [ ]:

    
# YAYYYYYYYYYYY