It would be very convenient if a model trained on one region's load data could be deployed directly on other regions. Below, a neural net is trained on the NCENT data and its weights are then applied, unchanged, to every other test region to see how well they generalize.
In [42]:
import pandas as pd
import numpy as np
import os
import pickle
from scipy.stats.mstats import zscore
import time
from datetime import date
import matplotlib.pyplot as plt
import mpld3
mpld3.enable_notebook()
%matplotlib inline
In [43]:
with open('holidays.pickle', 'rb') as f:
    nerc6 = pickle.load(f)

def MAPE(predictions, answers):
    # Note: the 1e-5 term only prevents division by zero; if an answer is
    # ever exactly 0 the percentage error still blows up.
    assert len(predictions) == len(answers)
    return sum([abs(x - y) / (y + 1e-5) for x, y in zip(predictions, answers)]) / len(answers) * 100
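# Quick sanity check of MAPE (illustrative numbers, not from the dataset):
# MAPE([105.0, 95.0], [100.0, 100.0]) returns roughly 5.0, i.e. a 5% average error.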
def isHoliday(holiday, df):
    # New Year's, Memorial, Independence, Labor Day, Thanksgiving, Christmas
    m1 = None
    if holiday == "New Year's Day":
        m1 = (df["dates"].dt.month == 1) & (df["dates"].dt.day == 1)
    if holiday == "Independence Day":
        m1 = (df["dates"].dt.month == 7) & (df["dates"].dt.day == 4)
    if holiday == "Christmas Day":
        m1 = (df["dates"].dt.month == 12) & (df["dates"].dt.day == 25)
    m1 = df["dates"].dt.date.isin(nerc6[holiday]) if m1 is None else m1
    m2 = df["dates"].dt.date.isin(nerc6.get(holiday + " (Observed)", []))
    return m1 | m2
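# Example usage (a sketch; assumes df has a datetime64 'dates' column, as the test CSVs below do):
# christmas = isHoliday("Christmas Day", df)  # boolean Series, True on 12/25 and on any observed date in the nerc6 pickle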
def makeUsefulDf(df):
    """
    Turn a dataframe of datetime and load data into a dataframe useful for
    machine learning: normalize the continuous values and one-hot encode the
    categorical features.

    Features are placed into r_df (return dataframe), which has the following columns:

    YEARS SINCE 2000
    LOAD AT THIS TIME THE DAY BEFORE
    HOUR OF DAY
        - is12AM (0, 1)
        - is1AM (0, 1)
        ...
        - is11PM (0, 1)
    DAY OF THE WEEK
        - isSunday (0, 1)
        - isMonday (0, 1)
        ...
        - isSaturday (0, 1)
    MONTH OF THE YEAR
        - isJanuary (0, 1)
        - isFebruary (0, 1)
        ...
        - isDecember (0, 1)
    TEMPERATURE
        - Celsius (z-score normalized)
    PREVIOUS DAY'S LOAD
        - 12AM of the previous day (z-score normalized)
        - 1AM of the previous day (z-score normalized)
        ...
        - 11PM of the previous day (z-score normalized)
    HOLIDAYS (the nerc6 holidays)
        - isNewYears (0, 1)
        - isMemorialDay (0, 1)
        ...
        - isChristmas (0, 1)
    """
    def _normalizeCol(l):
        # z-score normalization (mean 0, std 1)
        # s = l.max() - l.min()
        # return l if s == 0 else (l - l.mean()) / l.std()
        return zscore(l)

    def _chunks(l, n):
        # split a list into consecutive chunks of length n
        return [l[i : i + n] for i in range(0, len(l), n)]
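    # Illustration: _chunks([1, 2, 3, 4, 5, 6], 3) -> [[1, 2, 3], [4, 5, 6]];
    # with n=24 it groups the hourly load series into whole days.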
    r_df = pd.DataFrame()
    r_df["load_n"] = _normalizeCol(df["load"])
    r_df["years_n"] = _normalizeCol(df["dates"].dt.year - 2000)

    # fix outliers: -9999 is the missing-value sentinel in the temperature data
    m = df["tempc"].replace([-9999], np.nan).ffill()
    # The National Weather Service reports roughly a 2.5 degree average standard
    # error; noise injection is disabled here, so the cleaned series is used as-is.
    temp_noise = m
    r_df["temp_n"] = _normalizeCol(temp_noise)
    r_df['temp_n^2'] = r_df["temp_n"] ** 2

    # add the value of the load 24 hours before (back-fill the first day)
    r_df["load_prev_n"] = r_df["load_n"].shift(24).bfill()
    # create day-of-week vector; dt.dayofweek: 0 = Monday ... 6 = Sunday
    r_df["day"] = df["dates"].dt.dayofweek
    w = ["M", "T", "W", "R", "F", "A", "S"]  # ordered to match dt.dayofweek (A = Saturday, S = Sunday)
    for i, d in enumerate(w):
        r_df[d] = (r_df["day"] == i).astype(int)

    # create hour-of-day vector
    r_df["hour"] = df["dates"].dt.hour
    d = [("h" + str(i)) for i in range(24)]
    for i, h in enumerate(d):
        r_df[h] = (r_df["hour"] == i).astype(int)

    # create month vector; dt.month runs 1-12, so offset by one (m0 = January ... m11 = December)
    r_df["month"] = df["dates"].dt.month
    y = [("m" + str(i)) for i in range(12)]
    for i, m in enumerate(y):
        r_df[m] = (r_df["month"] == i + 1).astype(int)
    # create 'load day before' vector: broadcast the previous day's 24 hourly
    # loads onto every hour of the current day (using load_prev_n rather than
    # load_n keeps the current hour's load out of its own feature row)
    n = np.array([val for val in _chunks(list(r_df["load_prev_n"]), 24) for _ in range(24)])
    l = ["l" + str(i) for i in range(24)]
    for i, s in enumerate(l):
        r_df[s] = n[:, i]

    # create holiday booleans
    r_df["isNewYears"] = isHoliday("New Year's Day", df)
    r_df["isMemorialDay"] = isHoliday("Memorial Day", df)
    r_df["isIndependenceDay"] = isHoliday("Independence Day", df)
    r_df["isLaborDay"] = isHoliday("Labor Day", df)
    r_df["isThanksgiving"] = isHoliday("Thanksgiving", df)
    r_df["isChristmas"] = isHoliday("Christmas Day", df)

    # drop the raw categorical columns and the target itself
    return r_df.drop(["month", "hour", "day", "load_n"], axis=1)
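# Rough shape check (a sketch; the count follows from the construction above:
# 4 continuous + 7 day + 24 hour + 12 month + 24 previous-day-load + 6 holiday = 77 columns):
# assert makeUsefulDf(df).shape[1] == 77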
def neural_net_predictions(all_X, all_y):
    import tensorflow as tf
    from tensorflow.keras import layers

    tic = time.time()
    # hold out the final 8760 hours (one year) as the test set
    X_train, y_train = all_X[:-8760], all_y[:-8760]

    model = tf.keras.Sequential([
        layers.Dense(all_X.shape[1], activation=tf.nn.relu, input_shape=[len(X_train.keys())]),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(all_X.shape[1], activation=tf.nn.relu),
        layers.Dense(1)
    ])

    optimizer = tf.keras.optimizers.RMSprop(0.001)
    model.compile(
        loss="mean_squared_error",
        optimizer=optimizer,
        metrics=["mean_absolute_error", "mean_squared_error"],
    )

    EPOCHS = 100
    early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
    history = model.fit(
        X_train,
        y_train,
        epochs=EPOCHS,
        validation_split=0.2,
        verbose=0,
        callbacks=[early_stop],
    )

    # score with the MAPE defined at the top of this cell
    predictions = [float(f) for f in model.predict(all_X[-8760:])]
    train = [float(f) for f in model.predict(all_X[:-8760])]
    accuracy = {
        'test': MAPE(predictions, all_y[-8760:]),
        'train': MAPE(train, all_y[:-8760])
    }
    return predictions, accuracy, time.time() - tic, model
In [44]:
df = pd.read_csv('data/test/NCENT.csv', parse_dates=['dates'])
all_X = makeUsefulDf(df)
all_y = df['load']
predictions, accuracy, t, model = neural_net_predictions(all_X, all_y)
print(accuracy, t)
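Since the next cell reuses the trained `model` object directly, it can also be handy to persist the NCENT weights. A minimal sketch (the filename 'ncent_model.h5' is just an illustration):

import tensorflow as tf
model.save('ncent_model.h5')                            # write architecture + weights to disk
# model = tf.keras.models.load_model('ncent_model.h5')  # reload later to reuse the NCENT weights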
In [46]:
df_results = pd.DataFrame()
for f in os.listdir('data/test'):
    if f.endswith('.csv'):
        # apply the NCENT-trained model, unchanged, to this region's last year of data
        df_t = pd.read_csv('data/test/' + f, parse_dates=['dates'])
        all_X = makeUsefulDf(df_t)
        all_y = df_t['load']
        predictions = [float(p) for p in model.predict(all_X[-8760:])]
        answers = all_y[-8760:]
        df_results['{}-pred'.format(f[:-4])] = predictions
        df_results['{}-ans'.format(f[:-4])] = [float(a) for a in answers]
        print(f, MAPE(predictions, answers))
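To compare the regions at a glance, the per-region errors can also be collected into one summary frame (a sketch; it assumes the '<region>-pred' / '<region>-ans' column pairs built above):

regions = sorted({c.rsplit('-', 1)[0] for c in df_results.columns})
mape_summary = pd.DataFrame({
    'region': regions,
    'MAPE': [MAPE(list(df_results[r + '-pred']), list(df_results[r + '-ans'])) for r in regions]
})
mape_summary.sort_values('MAPE')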
In [47]:
df_results.head()
Out[47]:
(first five rows of df_results: one '<region>-pred' and one '<region>-ans' column per test-region CSV)
In [55]:
N = 8  # number of test-region CSVs in data/test; must match the loop below
fig, axes = plt.subplots(nrows=N, ncols=1, figsize=(10, 50))
i = 0
fig.suptitle('Deploying prediction with NCENT weights')
for f in os.listdir('data/test'):
    if f.endswith('.csv'):
        reg = f[:-4]
        df_results['{}-pred'.format(reg)].plot(ax=axes[i])
        df_results['{}-ans'.format(reg)].plot(ax=axes[i])
        axes[i].set_title(reg)
        i += 1
plt.savefig('generalized.png')