24-Hour Load Forecasting


In [6]:
import pandas as pd
import numpy as np
from datetime import datetime as dt, timedelta
import mpld3
mpld3.enable_notebook()
%matplotlib inline

load = pd.read_csv('data/load/NCENT.csv')
weather = pd.read_csv('data/weather/NCENT.csv')
load.head()


First, get everything into one large dataframe with the following columns:

load | datetime, day of week, hour of day, years since 2000, temperature (°C)


In [ ]:
large_df = pd.DataFrame()
l, t = [], []
# load and weather share the same column layout; collect each column from both
for column in load.columns:
    l.append(list(load[column]))
    t.append(list(weather[column]))
# flatten the per-column lists into one long hourly series each
lf = [i for s in l for i in s]
tf = [i for s in t for i in s]

large_df['load'] = lf
large_df['tempc'] = tf

# fix outliers: -9999 is a missing-value sentinel; replace with NaN, then forward-fill
large_df['tempc'].replace([-9999], np.nan, inplace=True)
large_df['tempc'].ffill(inplace=True)
large_df['load'].ffill(inplace=True)

# previous day's load at the same hour (back-fill the first day, which has no prior day)
large_df['load_prev'] = large_df['load'].shift(24)
large_df['load_prev'].bfill(inplace=True)

# hourly timestamps, exactly 8760 per year for 2002-2018 (leap years stop a day early)
d = []
for i in range(2002, 2019):
    d.append([dt(i, 1, 1) + timedelta(hours=1) * x for x in range(8760)])
large_df['dates'] = [i for s in d for i in s]

large_df['day'] = large_df['dates'].dt.dayofweek # 0 is MONDAY!!!
large_df['hour'] = large_df['dates'].dt.hour
large_df['month'] = large_df['dates'].dt.month
large_df['year'] = large_df['dates'].dt.year
large_df.head()
#large_df.to_csv('flatData.csv', index=False)

Normalize the data to the range [-1, 1]
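
Concretely, each feature in the cell below is mean-centered and divided by its range, so every normalized value lands within [-1, 1]:

$$x_{\text{norm}} = \frac{x - \bar{x}}{x_{\max} - x_{\min}}$$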


In [ ]:
norm_df = large_df.drop(['dates'], axis=1)

norm_df['yearSince2000'] = norm_df['year'] - 2000
norm_df['years_n'] = (norm_df['yearSince2000'] - norm_df['yearSince2000'].mean()) / (norm_df['yearSince2000'].max() - norm_df['yearSince2000'].min())

norm_df['load_prev_n'] = (norm_df['load_prev'] - norm_df['load_prev'].mean()) / (norm_df['load_prev'].max() - norm_df['load_prev'].min())
norm_df['temp_n'] = (norm_df['tempc'] - norm_df['tempc'].mean()) / (norm_df['tempc'].max() - norm_df['tempc'].min())

norm_df.drop(['load', 'tempc', 'year'], axis=1, inplace=True)

norm_df['day'] += 1 # shift dayofweek (Monday=0) so Monday=1 ... Sunday=7
norm_df.loc[norm_df['day'] == 7, 'day'] = 0 # then make Sunday 0

norm_df.head()

Let's now turn this into a dataframe of binary (one-hot) features
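
As an aside, pd.get_dummies can build the same kind of one-hot columns in a single call; a minimal sketch for comparison (not used below):


In [ ]:
# hypothetical alternative: one-hot encode the hour of day with a pandas built-in
hour_dummies = pd.get_dummies(norm_df['hour'], prefix='h')
hour_dummies.head()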


In [ ]:
bin_df = pd.DataFrame()
def chunks(l, n):
    """Return successive n-sized chunks from l."""
    return [l[i:i + n] for i in range(0, len(l), n)]

# group the hourly load series into per-day chunks of 24 values, then repeat each
# day's chunk 24 times so every hourly row carries its day's full 24-hour profile
l = list(large_df['load'])
m = chunks(l, 24)
s = [day for day in m for _ in range(24)]
n = np.array(s)

# one column per hour of the day, normalized with the overall load mean and range
for i in range(24):
    s = 'l' + str(i)
    bin_df[s] = n[:, i]
    bin_df[s] = (bin_df[s] - large_df['load'].mean()) / (large_df['load'].max() - large_df['load'].min())

bin_df.head()

# create month vector (month runs 1-12, so offset the 0-based column index)
y = [('m' + str(i)) for i in range(12)]
for i, m in enumerate(y):
    bin_df[m] = (norm_df['month'] == i + 1).astype(int)

# create day of week vector
w = ['S', 'M', 'T', 'W', 'R', 'F', 'A']
for i, d in enumerate(w):
    bin_df[d] = (norm_df['day'] == i).astype(int)

# create hour of day vector
d = [('h' + str(i)) for i in range(24)]
for i, h in enumerate(d):
    bin_df[h] = (norm_df['hour'] == i).astype(int)

bin_df['years_n'] = norm_df['years_n']
bin_df['temp_n'] = norm_df['temp_n']
bin_df['load_prev_n'] = norm_df['load_prev_n']

# for_csv = bin_df.copy()
# for_csv['load'] = large_df['load']
# for_csv.to_csv('data/ncent_ml_data.csv', index=False)

print(len(bin_df))
print(bin_df.columns)
bin_df.head()
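
As a sanity check, every row should have exactly one active flag in each one-hot group; a minimal sketch using the y (months), w (days), and d (hours) label lists from the cell above:


In [ ]:
# hypothetical sanity check: each one-hot group sums to exactly 1 per row
print(bin_df[y].sum(axis=1).eq(1).all())
print(bin_df[w].sum(axis=1).eq(1).all())
print(bin_df[d].sum(axis=1).eq(1).all())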

In [ ]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

all_X = bin_df#.drop('load_prev_n', axis=1) # all rows, features only, no labels
all_y = large_df['load'] #all rows, label only, no features

X_train, y_train = all_X[:-8760], all_y[:-8760]

In [ ]:
clf = linear_model.SGDRegressor(max_iter=10000, tol=1e-4)
clf.fit(X_train, y_train)

In [ ]:
X_test, y_test = all_X[-8760:], all_y[-8760:]
y_prediction = clf.predict(X_test)

graph_df = pd.DataFrame()
graph_df['test'] = y_test
graph_df['predict'] = y_prediction
graph_df.plot(figsize=(12, 3))

print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
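
Beyond the R-squared scores above, absolute errors are often easier to interpret for load data; a minimal sketch using the y_test and y_prediction arrays from the cell above:


In [ ]:
# hypothetical extra diagnostics: mean absolute error and mean absolute percentage error
mae = np.mean(np.abs(y_test.values - y_prediction))
mape = np.mean(np.abs((y_test.values - y_prediction) / y_test.values)) * 100
print('MAE: {:.1f}'.format(mae))
print('MAPE: {:.2f}%'.format(mape))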

In [ ]:
large_df.groupby('month')['load'].quantile(.99)

In [ ]:
large_df.columns

In [ ]:
# for each calendar month, find the day of year of its highest recorded load
m = []
large_df['dayOfYear'] = large_df['dates'].dt.dayofyear
for month in range(1, 13):
    test = large_df[large_df['month'] == month]
    m.append(test.loc[test['load'].idxmax()]['dayOfYear'])

In [ ]:
print(m)

Holiday data for the six NERC holidays


In [ ]:
import datetime
nerc6 = {
	'Memorial Day': [
		datetime.date(1990, 5, 28),
		datetime.date(1991, 5, 27),
		datetime.date(1992, 5, 25),
		datetime.date(1993, 5, 31),
		datetime.date(1994, 5, 30),
		datetime.date(1995, 5, 29),
		datetime.date(1996, 5, 27),
		datetime.date(1997, 5, 26),
		datetime.date(1998, 5, 25),
		datetime.date(1999, 5, 31),
		datetime.date(2000, 5, 29),
		datetime.date(2001, 5, 28),
		datetime.date(2002, 5, 27),
		datetime.date(2003, 5, 26),
		datetime.date(2004, 5, 31),
		datetime.date(2005, 5, 30),
		datetime.date(2006, 5, 29),
		datetime.date(2007, 5, 28),
		datetime.date(2008, 5, 26),
		datetime.date(2009, 5, 25),
		datetime.date(2010, 5, 31),
		datetime.date(2011, 5, 30),
		datetime.date(2012, 5, 28),
		datetime.date(2013, 5, 27),
		datetime.date(2014, 5, 26),
		datetime.date(2015, 5, 25),
		datetime.date(2016, 5, 30),
		datetime.date(2017, 5, 29),
		datetime.date(2018, 5, 28),
		datetime.date(2019, 5, 27),
		datetime.date(2020, 5, 25),
		datetime.date(2021, 5, 31),
		datetime.date(2022, 5, 30),
		datetime.date(2023, 5, 29),
		datetime.date(2024, 5, 27),
	],
 	'Labor Day': [
	 	datetime.date(1990, 9, 3),
		datetime.date(1991, 9, 2),
		datetime.date(1992, 9, 7),
		datetime.date(1993, 9, 6),
		datetime.date(1994, 9, 5),
		datetime.date(1995, 9, 4),
		datetime.date(1996, 9, 2),
		datetime.date(1997, 9, 1),
		datetime.date(1998, 9, 7),
		datetime.date(1999, 9, 6),
		datetime.date(2000, 9, 4),
		datetime.date(2001, 9, 3),
		datetime.date(2002, 9, 2),
		datetime.date(2003, 9, 1),
		datetime.date(2004, 9, 6),
		datetime.date(2005, 9, 5),
		datetime.date(2006, 9, 4),
		datetime.date(2007, 9, 3),
		datetime.date(2008, 9, 1),
		datetime.date(2009, 9, 7),
		datetime.date(2010, 9, 6),
		datetime.date(2011, 9, 5),
		datetime.date(2012, 9, 3),
		datetime.date(2013, 9, 2),
		datetime.date(2014, 9, 1),
		datetime.date(2015, 9, 7),
		datetime.date(2016, 9, 5),
		datetime.date(2017, 9, 4),
		datetime.date(2018, 9, 3),
		datetime.date(2019, 9, 2),
		datetime.date(2020, 9, 7),
		datetime.date(2021, 9, 6),
		datetime.date(2022, 9, 5),
		datetime.date(2023, 9, 4),
		datetime.date(2024, 9, 2),
	],
	'Thanksgiving': [
		datetime.date(1990, 11, 22),
		datetime.date(1991, 11, 28),
		datetime.date(1992, 11, 26),
		datetime.date(1993, 11, 25),
		datetime.date(1994, 11, 24),
		datetime.date(1995, 11, 23),
		datetime.date(1996, 11, 28),
		datetime.date(1997, 11, 27),
		datetime.date(1998, 11, 26),
		datetime.date(1999, 11, 25),
		datetime.date(2000, 11, 23),
		datetime.date(2001, 11, 22),
		datetime.date(2002, 11, 28),
		datetime.date(2003, 11, 27),
		datetime.date(2004, 11, 25),
		datetime.date(2005, 11, 24),
		datetime.date(2006, 11, 23),
		datetime.date(2007, 11, 22),
		datetime.date(2008, 11, 27),
		datetime.date(2009, 11, 26),
		datetime.date(2010, 11, 25),
		datetime.date(2011, 11, 24),
		datetime.date(2012, 11, 22),
		datetime.date(2013, 11, 28),
		datetime.date(2014, 11, 27),
		datetime.date(2015, 11, 26),
		datetime.date(2016, 11, 24),
		datetime.date(2017, 11, 23),
		datetime.date(2018, 11, 22),
		datetime.date(2019, 11, 28),
		datetime.date(2020, 11, 26),
		datetime.date(2021, 11, 25),
		datetime.date(2022, 11, 24),
		datetime.date(2023, 11, 23),
		datetime.date(2024, 11, 28),
	],
 	'Independence Day (Observed)': [
	 	datetime.date(1992, 7, 3),
		datetime.date(1993, 7, 5),
		datetime.date(1998, 7, 3),
		datetime.date(1999, 7, 5),
		datetime.date(2004, 7, 5),
		datetime.date(2009, 7, 3),
		datetime.date(2010, 7, 5),
		datetime.date(2015, 7, 3),
		datetime.date(2020, 7, 3),
		datetime.date(2021, 7, 5),
	],
	"New Year's Day (Observed)": [
		datetime.date(1993, 12, 31),
		datetime.date(1995, 1, 2),
		datetime.date(1999, 12, 31),
		datetime.date(2004, 12, 31),
		datetime.date(2006, 1, 2),
		datetime.date(2010, 12, 31),
		datetime.date(2012, 1, 2),
		datetime.date(2017, 1, 2),
		datetime.date(2021, 12, 31),
		datetime.date(2023, 1, 2),
	],
	'Christmas Day (Observed)': [
		datetime.date(1993, 12, 24),
		datetime.date(1994, 12, 26),
		datetime.date(1999, 12, 24),
		datetime.date(2004, 12, 24),
		datetime.date(2005, 12, 26),
		datetime.date(2010, 12, 24),
		datetime.date(2011, 12, 26),
		datetime.date(2016, 12, 26),
		datetime.date(2021, 12, 24),
		datetime.date(2022, 12, 26),
	]
}
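
The nerc6 dates above aren't wired into the features yet; a minimal sketch of one way to use them, assuming a single binary is_holiday column keyed off the dates in large_df (the column name is hypothetical):


In [ ]:
# hypothetical usage: flag rows whose calendar date is one of the nerc6 holidays
holiday_dates = {day for days in nerc6.values() for day in days}
large_df['is_holiday'] = large_df['dates'].dt.date.isin(holiday_dates).astype(int)
large_df['is_holiday'].sum()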
