In [6]:
# Imports and raw data load for the NCENT load-forecasting notebook.
import pandas as pd
import numpy as np
from datetime import datetime as dt, timedelta
import mpld3
# mpld3 renders matplotlib figures as interactive D3 in the notebook;
# NOTE(review): combined with %matplotlib inline below — confirm which
# renderer actually wins in this environment.
mpld3.enable_notebook()
%matplotlib inline
# Hourly electricity load and temperature for the NCENT region.
# Both CSVs share the same column layout — presumably one column per
# year, 2002-2018 (see the dates cell below) — TODO confirm against the files.
load = pd.read_csv('data/load/NCENT.csv')
weather = pd.read_csv('data/weather/NCENT.csv')
load.head()
In [ ]:
# Flatten the per-column series of `load` and `weather` into one long
# hourly DataFrame with calendar features.
large_df = pd.DataFrame()
l, t = [], []
for column in load.columns:
    l.append(list(load[column]))
    t.append(list(weather[column]))
# flatten the column lists into single hourly series
lf = [i for s in l for i in s]
tf = [i for s in t for i in s]
large_df['load'] = lf
large_df['tempc'] = tf
# -9999 is the temperature feed's missing-value sentinel; forward-fill gaps.
# Fix: assign the result back instead of `inplace=True` on a column
# selection — chained inplace mutation on `df[col]` is unreliable and is
# rejected/deprecated under modern pandas copy-on-write semantics.
large_df['tempc'] = large_df['tempc'].replace([-9999], np.nan).ffill()
large_df['load'] = large_df['load'].ffill()
# same-hour load 24 hours earlier; backfill so the first day has values
large_df['load_prev'] = large_df['load'].shift(24).bfill()
# Hourly timestamps for 2002-2018, 8760 hours per year.
# NOTE(review): leap years (2004, 2008, ...) actually have 8784 hours, so
# timestamps drift after Feb 29 in those years — confirm the source data
# also drops leap days.
d = []
for i in range(2002, 2019):
    d.append([dt(i, 1, 1) + timedelta(hours=1) * x for x in range(8760)])
large_df['dates'] = [i for s in d for i in s]
large_df['day'] = large_df['dates'].dt.dayofweek  # 0 is MONDAY!!!
large_df['hour'] = large_df['dates'].dt.hour
large_df['month'] = large_df['dates'].dt.month
large_df['year'] = large_df['dates'].dt.year
large_df.head()
#large_df.to_csv('flatData.csv', index=False)
In [ ]:
# Build the normalized feature frame: mean-centered, range-scaled
# continuous features, plus day-of-week remapped so Sunday == 0.
norm_df = large_df.drop(['dates'], axis=1)
norm_df['yearSince2000'] = norm_df['year'] - 2000
# (value - mean) / (max - min) for each continuous feature
for src, dst in [('yearSince2000', 'years_n'),
                 ('load_prev', 'load_prev_n'),
                 ('tempc', 'temp_n')]:
    col = norm_df[src]
    norm_df[dst] = (col - col.mean()) / (col.max() - col.min())
norm_df.drop(['load', 'tempc', 'year'], axis=1, inplace=True)
# shift Monday=0..Sunday=6 to Sunday=0..Saturday=6
norm_df['day'] = (norm_df['day'] + 1) % 7
norm_df.head()
In [ ]:
bin_df = pd.DataFrame()  # one-hot + normalized feature matrix for the model
def chunks(l, n):
    """Split sequence l into successive n-sized chunks.

    The final chunk is shorter when len(l) is not a multiple of n.
    (Fix: the original built an accumulator list `s` and then discarded
    it, returning an identical fresh comprehension — dead code removed.)
    """
    return [l[i:i + n] for i in range(0, len(l), n)]
# Day-profile load features: every hourly row carries the full 24-hour
# load vector of the day it belongs to, normalized by overall mean/range.
# NOTE(review): this places the row's own target load among the features
# l0..l23 — confirm this leakage is intended before trusting the scores.
hourly_loads = list(large_df['load'])
day_profiles = chunks(hourly_loads, 24)          # one 24-value list per day
repeated = [day for day in day_profiles for _ in range(24)]
n = np.array(repeated)
load_mean = large_df['load'].mean()
load_range = large_df['load'].max() - large_df['load'].min()
for i in range(24):
    col = 'l' + str(i)
    bin_df[col] = (n[:, i] - load_mean) / load_range
# One-hot month indicator. Bug fix: months run 1..12, but the original
# compared against 0..11 — 'm0' was always zero and December had no
# indicator at all. 'm0' now means January, ..., 'm11' means December.
month_cols = [('m' + str(i)) for i in range(12)]
for i, mcol in enumerate(month_cols):
    bin_df[mcol] = (norm_df['month'] == i + 1).astype(int)
# One-hot day of week ('day' was remapped so Sunday == 0)
week_cols = ['S', 'M', 'T', 'W', 'R', 'F', 'A']
for i, wcol in enumerate(week_cols):
    bin_df[wcol] = (norm_df['day'] == i).astype(int)
# One-hot hour of day (0..23)
hour_cols = [('h' + str(i)) for i in range(24)]
for i, hcol in enumerate(hour_cols):
    bin_df[hcol] = (norm_df['hour'] == i).astype(int)
# continuous features carried over from norm_df
bin_df['years_n'] = norm_df['years_n']
bin_df['temp_n'] = norm_df['temp_n']
bin_df['load_prev_n'] = norm_df['load_prev_n']
# for_csv = bin_df.copy()
# for_csv['load'] = large_df['load']
# for_csv.to_csv('data/ncent_ml_data.csv', index=False)
print(len(bin_df))
print(bin_df.columns)
bin_df.head()
In [ ]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split  # NOTE(review): imported but never used in this notebook
# NOTE(review): bin_df includes l0..l23, which contain the target-day
# loads (see the feature-building cell) — check for leakage.
all_X = bin_df#.drop('load_prev_n', axis=1) # all rows, features only, no labels
all_y = large_df['load'] #all rows, label only, no features
# chronological split: everything except the final 8760 hours (one year)
X_train, y_train = all_X[:-8760], all_y[:-8760]
In [ ]:
# Linear model trained by stochastic gradient descent; generous iteration
# cap with a convergence tolerance so it stops early when the loss plateaus.
sgd_params = {'max_iter': 10000, 'tol': 1e-4}
clf = linear_model.SGDRegressor(**sgd_params)
clf.fit(X_train, y_train)
In [ ]:
# Score on the held-out final year and plot predicted vs. actual load.
X_test, y_test = all_X[-8760:], all_y[-8760:]
y_prediction = clf.predict(X_test)
graph_df = pd.DataFrame({'test': y_test})
graph_df['predict'] = y_prediction
graph_df.plot(figsize=(12, 3))
# R^2 on the training data, then on the held-out year
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(features, labels))
In [ ]:
# 99th-percentile hourly load per calendar month — an outlier-robust
# proxy for monthly peak demand.
large_df.groupby('month')['load'].quantile(.99)
In [ ]:
large_df.columns  # quick sanity check of the columns built above
In [ ]:
# For each calendar month, record the day-of-year on which the single
# highest hourly load (across all years) occurred.
large_df['dayOfYear'] = large_df['dates'].dt.dayofyear
m = []
for month in range(1, 13):
    month_rows = large_df[large_df['month'] == month]
    peak_idx = month_rows['load'].idxmax()
    m.append(month_rows.loc[peak_idx, 'dayOfYear'])
In [ ]:
print(m)  # day-of-year of each month's all-time peak load
In [ ]:
import datetime
# Observed U.S. holiday dates, 1990-2024 — presumably the "NERC six"
# holidays commonly excluded from utility load forecasts (TODO confirm
# naming). Floating holidays are listed for every year; the fixed-date
# holidays below appear only in years where the observed date differs
# from the actual date (holiday falls on a weekend).
nerc6 = {
    # last Monday of May
    'Memorial Day': [
        datetime.date(1990, 5, 28),
        datetime.date(1991, 5, 27),
        datetime.date(1992, 5, 25),
        datetime.date(1993, 5, 31),
        datetime.date(1994, 5, 30),
        datetime.date(1995, 5, 29),
        datetime.date(1996, 5, 27),
        datetime.date(1997, 5, 26),
        datetime.date(1998, 5, 25),
        datetime.date(1999, 5, 31),
        datetime.date(2000, 5, 29),
        datetime.date(2001, 5, 28),
        datetime.date(2002, 5, 27),
        datetime.date(2003, 5, 26),
        datetime.date(2004, 5, 31),
        datetime.date(2005, 5, 30),
        datetime.date(2006, 5, 29),
        datetime.date(2007, 5, 28),
        datetime.date(2008, 5, 26),
        datetime.date(2009, 5, 25),
        datetime.date(2010, 5, 31),
        datetime.date(2011, 5, 30),
        datetime.date(2012, 5, 28),
        datetime.date(2013, 5, 27),
        datetime.date(2014, 5, 26),
        datetime.date(2015, 5, 25),
        datetime.date(2016, 5, 30),
        datetime.date(2017, 5, 29),
        datetime.date(2018, 5, 28),
        datetime.date(2019, 5, 27),
        datetime.date(2020, 5, 25),
        datetime.date(2021, 5, 31),
        datetime.date(2022, 5, 30),
        datetime.date(2023, 5, 29),
        datetime.date(2024, 5, 27),
    ],
    # first Monday of September
    'Labor Day': [
        datetime.date(1990, 9, 3),
        datetime.date(1991, 9, 2),
        datetime.date(1992, 9, 7),
        datetime.date(1993, 9, 6),
        datetime.date(1994, 9, 5),
        datetime.date(1995, 9, 4),
        datetime.date(1996, 9, 2),
        datetime.date(1997, 9, 1),
        datetime.date(1998, 9, 7),
        datetime.date(1999, 9, 6),
        datetime.date(2000, 9, 4),
        datetime.date(2001, 9, 3),
        datetime.date(2002, 9, 2),
        datetime.date(2003, 9, 1),
        datetime.date(2004, 9, 6),
        datetime.date(2005, 9, 5),
        datetime.date(2006, 9, 4),
        datetime.date(2007, 9, 3),
        datetime.date(2008, 9, 1),
        datetime.date(2009, 9, 7),
        datetime.date(2010, 9, 6),
        datetime.date(2011, 9, 5),
        datetime.date(2012, 9, 3),
        datetime.date(2013, 9, 2),
        datetime.date(2014, 9, 1),
        datetime.date(2015, 9, 7),
        datetime.date(2016, 9, 5),
        datetime.date(2017, 9, 4),
        datetime.date(2018, 9, 3),
        datetime.date(2019, 9, 2),
        datetime.date(2020, 9, 7),
        datetime.date(2021, 9, 6),
        datetime.date(2022, 9, 5),
        datetime.date(2023, 9, 4),
        datetime.date(2024, 9, 2),
    ],
    # fourth Thursday of November
    'Thanksgiving': [
        datetime.date(1990, 11, 22),
        datetime.date(1991, 11, 28),
        datetime.date(1992, 11, 26),
        datetime.date(1993, 11, 25),
        datetime.date(1994, 11, 24),
        datetime.date(1995, 11, 23),
        datetime.date(1996, 11, 28),
        datetime.date(1997, 11, 27),
        datetime.date(1998, 11, 26),
        datetime.date(1999, 11, 25),
        datetime.date(2000, 11, 23),
        datetime.date(2001, 11, 22),
        datetime.date(2002, 11, 28),
        datetime.date(2003, 11, 27),
        datetime.date(2004, 11, 25),
        datetime.date(2005, 11, 24),
        datetime.date(2006, 11, 23),
        datetime.date(2007, 11, 22),
        datetime.date(2008, 11, 27),
        datetime.date(2009, 11, 26),
        datetime.date(2010, 11, 25),
        datetime.date(2011, 11, 24),
        datetime.date(2012, 11, 22),
        datetime.date(2013, 11, 28),
        datetime.date(2014, 11, 27),
        datetime.date(2015, 11, 26),
        datetime.date(2016, 11, 24),
        datetime.date(2017, 11, 23),
        datetime.date(2018, 11, 22),
        datetime.date(2019, 11, 28),
        datetime.date(2020, 11, 26),
        datetime.date(2021, 11, 25),
        datetime.date(2022, 11, 24),
        datetime.date(2023, 11, 23),
        datetime.date(2024, 11, 28),
    ],
    # July 4 shifted to Friday (when on Saturday) or Monday (when on Sunday)
    'Independence Day (Observed)': [
        datetime.date(1992, 7, 3),
        datetime.date(1993, 7, 5),
        datetime.date(1998, 7, 3),
        datetime.date(1999, 7, 5),
        datetime.date(2004, 7, 5),
        datetime.date(2009, 7, 3),
        datetime.date(2010, 7, 5),
        datetime.date(2015, 7, 3),
        datetime.date(2020, 7, 3),
        datetime.date(2021, 7, 5),
    ],
    # Jan 1 shifted; a Dec 31 entry observes the FOLLOWING year's New Year
    "New Year's Day (Observed)": [
        datetime.date(1993, 12, 31),
        datetime.date(1995, 1, 2),
        datetime.date(1999, 12, 31),
        datetime.date(2004, 12, 31),
        datetime.date(2006, 1, 2),
        datetime.date(2010, 12, 31),
        datetime.date(2012, 1, 2),
        datetime.date(2017, 1, 2),
        datetime.date(2021, 12, 31),
        datetime.date(2023, 1, 2),
    ],
    # Dec 25 shifted to the nearest weekday
    'Christmas Day (Observed)': [
        datetime.date(1993, 12, 24),
        datetime.date(1994, 12, 26),
        datetime.date(1999, 12, 24),
        datetime.date(2004, 12, 24),
        datetime.date(2005, 12, 26),
        datetime.date(2010, 12, 24),
        datetime.date(2011, 12, 26),
        datetime.date(2016, 12, 26),
        datetime.date(2021, 12, 24),
        datetime.date(2022, 12, 26),
    ]
}
In [ ]:
# Removed stray scratch expression `df.h`: `df` is never defined in this
# notebook, so the cell raised NameError under Restart & Run All.