In [26]:
import pandas as pd
from scipy.stats import zscore
import numpy as np
from datetime import datetime as date

In [39]:
nerc6 = {
    "Memorial Day": [
        date(1990, 5, 28),
        date(1991, 5, 27),
        date(1992, 5, 25),
        date(1993, 5, 31),
        date(1994, 5, 30),
        date(1995, 5, 29),
        date(1996, 5, 27),
        date(1997, 5, 26),
        date(1998, 5, 25),
        date(1999, 5, 31),
        date(2000, 5, 29),
        date(2001, 5, 28),
        date(2002, 5, 27),
        date(2003, 5, 26),
        date(2004, 5, 31),
        date(2005, 5, 30),
        date(2006, 5, 29),
        date(2007, 5, 28),
        date(2008, 5, 26),
        date(2009, 5, 25),
        date(2010, 5, 31),
        date(2011, 5, 30),
        date(2012, 5, 28),
        date(2013, 5, 27),
        date(2014, 5, 26),
        date(2015, 5, 25),
        date(2016, 5, 30),
        date(2017, 5, 29),
        date(2018, 5, 28),
        date(2019, 5, 27),
        date(2020, 5, 25),
        date(2021, 5, 31),
        date(2022, 5, 30),
        date(2023, 5, 29),
        date(2024, 5, 27),
    ],
    "Labor Day": [
        date(1990, 9, 3),
        date(1991, 9, 2),
        date(1992, 9, 7),
        date(1993, 9, 6),
        date(1994, 9, 5),
        date(1995, 9, 4),
        date(1996, 9, 2),
        date(1997, 9, 1),
        date(1998, 9, 7),
        date(1999, 9, 6),
        date(2000, 9, 4),
        date(2001, 9, 3),
        date(2002, 9, 2),
        date(2003, 9, 1),
        date(2004, 9, 6),
        date(2005, 9, 5),
        date(2006, 9, 4),
        date(2007, 9, 3),
        date(2008, 9, 1),
        date(2009, 9, 7),
        date(2010, 9, 6),
        date(2011, 9, 5),
        date(2012, 9, 3),
        date(2013, 9, 2),
        date(2014, 9, 1),
        date(2015, 9, 7),
        date(2016, 9, 5),
        date(2017, 9, 4),
        date(2018, 9, 3),
        date(2019, 9, 2),
        date(2020, 9, 7),
        date(2021, 9, 6),
        date(2022, 9, 5),
        date(2023, 9, 4),
        date(2024, 9, 2),
    ],
    "Thanksgiving": [
        date(1990, 11, 22),
        date(1991, 11, 28),
        date(1992, 11, 26),
        date(1993, 11, 25),
        date(1994, 11, 24),
        date(1995, 11, 23),
        date(1996, 11, 28),
        date(1997, 11, 27),
        date(1998, 11, 26),
        date(1999, 11, 25),
        date(2000, 11, 23),
        date(2001, 11, 22),
        date(2002, 11, 28),
        date(2003, 11, 27),
        date(2004, 11, 25),
        date(2005, 11, 24),
        date(2006, 11, 23),
        date(2007, 11, 22),
        date(2008, 11, 27),
        date(2009, 11, 26),
        date(2010, 11, 25),
        date(2011, 11, 24),
        date(2012, 11, 22),
        date(2013, 11, 28),
        date(2014, 11, 27),
        date(2015, 11, 26),
        date(2016, 11, 24),
        date(2017, 11, 23),
        date(2018, 11, 22),
        date(2019, 11, 28),
        date(2020, 11, 26),
        date(2021, 11, 25),
        date(2022, 11, 24),
        date(2023, 11, 23),
        date(2024, 11, 28),
    ],
    "Independence Day (Observed)": [
        date(1992, 7, 3),
        date(1993, 7, 5),
        date(1998, 7, 3),
        date(1999, 7, 5),
        date(2004, 7, 5),
        date(2009, 7, 3),
        date(2010, 7, 5),
        date(2015, 7, 3),
        date(2020, 7, 3),
        date(2021, 7, 5),
    ],
    "New Year's Day (Observed)": [
        date(1993, 12, 31),
        date(1995, 1, 2),
        date(1999, 12, 31),
        date(2004, 12, 31),
        date(2006, 1, 2),
        date(2010, 12, 31),
        date(2012, 1, 2),
        date(2017, 1, 2),
        date(2021, 12, 31),
        date(2023, 1, 2),
    ],
    "Christmas Day (Observed)": [
        date(1993, 12, 24),
        date(1994, 12, 26),
        date(1999, 12, 24),
        date(2004, 12, 24),
        date(2005, 12, 26),
        date(2010, 12, 24),
        date(2011, 12, 26),
        date(2016, 12, 26),
        date(2021, 12, 24),
        date(2022, 12, 26),
    ],
}


def isHoliday(holiday, df):
    # New years, memorial, independence, labor day, Thanksgiving, Christmas
    m1 = None
    if holiday == "New Year's Day":
        m1 = (df["dates"].dt.month == 1) & (df["dates"].dt.day == 1)
    if holiday == "Independence Day":
        m1 = (df["dates"].dt.month == 7) & (df["dates"].dt.day == 4)
    if holiday == "Christmas Day":
        m1 = (df["dates"].dt.month == 12) & (df["dates"].dt.day == 25)
    m1 = df["dates"].dt.date.isin(nerc6[holiday]) if m1 is None else m1
    m2 = df["dates"].dt.date.isin(nerc6.get(holiday + " (Observed)", []))
    return m1 | m2

def add_noise(m, std):
        noise = np.random.normal(0, std, m.shape[0])
        return m + noise

def makeUsefulDf(df, noise=2.5, hours_prior=24):
    
    """
    Turn a dataframe of datetime and load data into a dataframe useful for
    machine learning. Normalize values and turn 
    Features are placed into r_df (return dataframe), creates the following columns

        YEARS SINCE 2000

        LOAD AT THIS TIME DAY BEFORE

        HOUR OF DAY
        - is12AM (0, 1)
        - is1AM (0, 1)
        ...
        - is11PM (0, 1)

        DAYS OF THE WEEK
        - isSunday (0, 1)
        - isMonday (0, 1)
        ...
        - isSaturday (0, 1)

        MONTHS OF THE YEAR
        - isJanuary (0, 1)
        - isFebruary (0, 1)
        ...
        - isDecember (0, 1)

        TEMPERATURE
        - Celcius (normalized from -1 to 1)

        PREVIOUS DAY'S LOAD 
        - 12AM of day previous (normalized from -1 to 1)
        - 1AM of day previous (normalized from -1 to 1)
        ...
        - 11PM of day previous (normalized from -1 to 1)

        HOLIDAYS (the nerc6 holidays)
        - isNewYears (0, 1)
        - isMemorialDay (0, 1)
        ...
        - is Christmas (0, 1)

    """

    def _chunks(l, n):
        return [l[i : i + n] for i in range(0, len(l), n)]
    
    if 'dates' not in df.columns:
        df['dates'] = df.apply(
            lambda x: dt(
                int(x['year']), 
                int(x['month']), 
                int(x['day']), 
                int(x['hour'])), 
            axis=1
        )
    
    r_df = pd.DataFrame()
    r_df["load_n"] = zscore(df["load"])
    r_df["years_n"] = zscore(df["dates"].dt.year)

    # fix outliers
    m = df["tempc"].replace([-9999], np.nan)
    m.ffill(inplace=True)
    # day-before predictions
    temp_noise = add_noise(m, noise)
    r_df["temp_n"] = zscore(temp_noise)
    r_df['temp_n^2'] = r_df["temp_n"] ** 2

    # add the value of the load 24hrs before
    r_df["load_prev_n"] = r_df["load_n"].shift(hours_prior)
    r_df["load_prev_n"].bfill(inplace=True)

    # create day of week vector
    r_df["day"] = df["dates"].dt.dayofweek  # 0 is Monday.
    w = ["S", "M", "T", "W", "R", "F", "A"]
    for i, d in enumerate(w):
        r_df[d] = (r_df["day"] == i).astype(int)

    # create hour of day vector
    r_df["hour"] = df["dates"].dt.hour
    d = [("h" + str(i)) for i in range(24)]
    for i, h in enumerate(d):
        r_df[h] = (r_df["hour"] == i).astype(int)

    # create month vector
    r_df["month"] = df["dates"].dt.month
    y = [("m" + str(i)) for i in range(12)]
    for i, m in enumerate(y):
        r_df[m] = (r_df["month"] == i).astype(int)

    # create 'load day before' vector
    n = np.array([val for val in _chunks(list(r_df["load_prev_n"]), 24) for _ in range(24)])
    l = ["l" + str(i) for i in range(24)]
    for i, s in enumerate(l):
        r_df[s] = n[:, i]

    # create holiday booleans
    r_df["isNewYears"] = isHoliday("New Year's Day", df)
    r_df["isMemorialDay"] = isHoliday("Memorial Day", df)
    r_df["isIndependenceDay"] = isHoliday("Independence Day", df)
    r_df["isLaborDay"] = isHoliday("Labor Day", df)
    r_df["isThanksgiving"] = isHoliday("Thanksgiving", df)
    r_df["isChristmas"] = isHoliday("Christmas Day", df)

    m = r_df.drop(["month", "hour", "day", "load_n"], axis=1)
    df = df.drop(['dates'], axis=1)

    return m

In [40]:
df = pd.read_csv('data/test/NCENT.csv', parse_dates=['dates'])

In [48]:
all_X = makeUsefulDf(df, hours_prior=48)
m = ['h' + str(i) for i in range(24)]
print(all_X.columns)
all_X[m]


Index(['years_n', 'temp_n', 'temp_n^2', 'load_prev_n', 'S', 'M', 'T', 'W', 'R',
       'F', 'A', 'h0', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'h9',
       'h10', 'h11', 'h12', 'h13', 'h14', 'h15', 'h16', 'h17', 'h18', 'h19',
       'h20', 'h21', 'h22', 'h23', 'm0', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6',
       'm7', 'm8', 'm9', 'm10', 'm11', 'l0', 'l1', 'l2', 'l3', 'l4', 'l5',
       'l6', 'l7', 'l8', 'l9', 'l10', 'l11', 'l12', 'l13', 'l14', 'l15', 'l16',
       'l17', 'l18', 'l19', 'l20', 'l21', 'l22', 'l23', 'isNewYears',
       'isMemorialDay', 'isIndependenceDay', 'isLaborDay', 'isThanksgiving',
       'isChristmas'],
      dtype='object')
Out[48]:
h0 h1 h2 h3 h4 h5 h6 h7 h8 h9 ... h14 h15 h16 h17 h18 h19 h20 h21 h22 h23
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
18 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
21 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
22 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
24 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
26 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
27 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
28 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
148890 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
148891 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
148892 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
148893 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
148894 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
148895 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
148896 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148897 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148898 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148899 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148900 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148901 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148902 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148903 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
148904 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
148905 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
148906 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148907 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148908 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148909 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
148910 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
148911 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
148912 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
148913 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
148914 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
148915 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
148916 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
148917 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
148918 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
148919 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

148920 rows × 24 columns


In [49]:
l = ['l' + str(i) for i in range(1)]
keep = l + ['load_prev_n']
all_X = all_X[keep]
all_X['now'] = zscore(df['load'])

In [45]:
all_X.tail(72)


Out[45]:
l0 load_prev_n now
148848 -0.777509 -0.777509 0.254446
148849 -0.777509 -0.861909 0.200650
148850 -0.777509 -0.892132 0.189704
148851 -0.777509 -0.889106 0.203461
148852 -0.777509 -0.817810 0.269434
148853 -0.777509 -0.634586 0.390573
148854 -0.777509 -0.376683 0.567531
148855 -0.777509 -0.184649 0.750387
148856 -0.777509 -0.151645 0.924999
148857 -0.777509 -0.147174 1.064748
148858 -0.777509 -0.134362 1.100883
148859 -0.777509 -0.162484 1.056328
148860 -0.777509 -0.212359 0.951244
148861 -0.777509 -0.266579 0.799542
148862 -0.777509 -0.308398 0.657907
148863 -0.777509 -0.344700 0.600430
148864 -0.777509 -0.297550 0.632566
148865 -0.777509 -0.060201 0.836941
148866 -0.777509 0.204664 0.995225
148867 -0.777509 0.260093 0.989627
148868 -0.777509 0.259633 0.960601
148869 -0.777509 0.209122 0.864410
148870 -0.777509 0.108127 0.707867
148871 -0.777509 -0.030003 0.532810
148872 -0.137236 -0.137236 0.391865
148873 -0.137236 -0.165557 0.306366
148874 -0.137236 -0.149422 0.273242
148875 -0.137236 -0.069444 0.277391
148876 -0.137236 0.087113 0.309385
148877 -0.137236 0.375305 0.383767
... ... ... ...
148890 -0.137236 0.735134 0.851213
148891 -0.137236 0.762525 0.818456
148892 -0.137236 0.757572 0.766746
148893 -0.137236 0.684232 0.648482
148894 -0.137236 0.519708 0.446108
148895 -0.137236 0.351489 0.237738
148896 0.254446 0.254446 0.033652
148897 0.254446 0.200650 -0.045656
148898 0.254446 0.189704 -0.067667
148899 0.254446 0.203461 -0.054746
148900 0.254446 0.269434 0.053950
148901 0.254446 0.390573 0.218370
148902 0.254446 0.567531 0.408954
148903 0.254446 0.750387 0.541852
148904 0.254446 0.924999 0.597758
148905 0.254446 1.064748 0.610667
148906 0.254446 1.100883 0.511123
148907 0.254446 1.056328 0.351118
148908 0.254446 0.951244 0.151231
148909 0.254446 0.799542 -0.027274
148910 0.254446 0.657907 -0.182179
148911 0.254446 0.600430 -0.269973
148912 0.254446 0.632566 -0.270544
148913 0.254446 0.836941 -0.056833
148914 0.254446 0.995225 0.165207
148915 0.254446 0.989627 0.113750
148916 0.254446 0.960601 0.053521
148917 0.254446 0.864410 -0.004938
148918 0.254446 0.707867 -0.084454
148919 0.254446 0.532810 -0.139237

72 rows × 3 columns