In [234]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
%matplotlib inline
In [235]:
df = pd.read_csv('data/NCENT.csv', parse_dates=['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df.head()
Out[235]:
In [236]:
# NORMALIZE
d = {}
for y in df.year.unique():
month_max = df[df['year']==y].groupby('month')['max_load'].max()
month_min = df[df['year']==y].groupby('month')['max_load'].min()
for m, (m_min, m_max) in enumerate(zip(month_min, month_max)):
d['{}-{}'.format(y, m+1)] = [m_min, m_max]
def norm_load(row):
year = row['year']
month = row['month']
l = d['{}-{}'.format(int(year), int(month))]
m_min = l[0]
m_max = l[1]
return (row['max_load'] - m_min)/(m_max - m_min)
def norm(c):
return (c - c.min())/(c.max() - c.min())
df_n = pd.DataFrame()
df_n['peak_prob'] = df.apply(norm_load, axis=1)
df_n['load_n'] = norm(df['max_load'])
df_n['year_n'] = norm(df['year'])
df_n['day'] = df['day'] / 31
for i in range(1, 13):
df_n['month-{}'.format(i)] = df['month'] == i
df_n.tail()
Out[236]:
In [237]:
all_X = df_n.drop(['peak_prob'], axis=1)
all_y = df_n['peak_prob']
X = all_X[:-365]
y = all_y[:-365]
X_test = all_X[-365:]
y_test = all_y[-365:]
In [238]:
model = LinearRegression().fit(X, y)
predictions_lr = model.predict(X_test)
print(model.score(X, y), model.score(X_test, y_test))
In [260]:
def MAPE(ans, pre):
return sum([abs(x-y)/(abs(y) + 1e-7) for (x, y) in zip(ans, pre)])/len(pre)*100
model = keras.Sequential([
layers.Dense(X.shape[1], activation=tf.nn.relu, input_shape=[X.shape[1]]),
layers.Dense(1)
])
optimizer = tf.keras.optimizers.RMSprop(0.001)
model.compile(loss='mean_squared_error',
optimizer=optimizer,
metrics=['mean_absolute_error', 'mean_squared_error'])
EPOCHS = 100
model.fit(X, y, epochs=EPOCHS, verbose=0)
predictions_nn = model.predict(X_test).flatten()
print(100 - MAPE(y, model.predict(X).flatten()), 100 - MAPE(y_test, model.predict(X_test).flatten()))
In [319]:
def recall(ans, pre):
true_positive = sum(ans & pre)
false_negative = sum(ans & (~ pre))
return true_positive / (true_positive + false_negative + 1e-7)
def precision(ans, pre):
true_positive = sum(ans & pre)
false_positive = sum((~ ans) & pre)
return (true_positive)/(true_positive + false_positive + 1e-7)
def peaks_missed(ans, pre):
return sum(ans & (~ pre)) / 12
def unnecessary_dispatches(ans, pre):
return sum((~ ans) & pre) / len(pre)
def run_testing_gambit(y_hat, y_test):
results = {'recall': [], 'precision' : [], 'peaks_missed': [], 'unnecessary_dispatches': []}
es = np.linspace(0,1,100)
y_a = y_test == 1
for e in es:
p_a = y_hat > e
results['recall'].append(recall(y_a, p_a))
results['precision'].append(precision(y_a, p_a))
results['peaks_missed'].append(peaks_missed(y_a, p_a))
results['unnecessary_dispatches'].append(unnecessary_dispatches(y_a, p_a))
df_r = pd.DataFrame(results)
df_r['F1'] = df_r['recall']*df_r['precision'] / (df_r['recall'] + df_r['precision'])
return df_r
df_nn = run_testing_gambit(predictions_nn, y_test)
df_lr = run_testing_gambit(predictions_lr, y_test)
In [262]:
df_nn.plot()
Out[262]:
In [263]:
df_lr.plot()
Out[263]:
In [290]:
# what if I say: dispatch if in x percentile of historical for this month is
results = {'recall': [], 'precision' : [], 'peaks_missed': [], 'unnecessary_dispatches': []}
es = np.linspace(0,1,100)
s = df[:-365].groupby('month')['max_load']
load_test = df['max_load'][-365:]
month_test = df['month'][-365:]
y_a = y_test == 1
for e in es:
month_quant = [s.quantile(e)[month] for month in range(1, 13)]
p_a = np.array([load >= month_quant[month-1] for load, month in zip(load_test, month_test)])
results['recall'].append(recall(y_a, p_a))
results['precision'].append(precision(y_a, p_a))
results['peaks_missed'].append(peaks_missed(y_a, p_a))
results['unnecessary_dispatches'].append(unnecessary_dispatches(y_a, p_a))
df_r = pd.DataFrame(results)
df_r['F1'] = df_r['recall']*df_r['precision'] / (df_r['recall'] + df_r['precision'])
df_r.head()
Out[290]:
In [292]:
df_r.plot()
Out[292]:
In [349]:
e = .91
s = df[:-365].groupby('month')['max_load']
load_test = df['max_load'][-365:]
month_test = df['month'][-365:]
month_quant = [s.quantile(e)[month] for month in range(1, 13)]
df_n['monthly_q'] = (df.month.apply(lambda x: month_quant[x-1]))
df_n = pd.DataFrame()
df_n['peak_prob'] = df.apply(norm_load, axis=1)
df_n['load_n'] = norm(df['max_load'])
df_n['year_n'] = norm(df['year'])
df_n['day_n'] = df['day'] / 31
for i in range(1, 13):
df_n['month-{}'.format(i)] = df['month'] == i
all_X = df_n.drop(['peak_prob'], axis=1)
all_y = df_n['peak_prob']
X = all_X[:-365]
y = all_y[:-365]
X_test = all_X[-365:]
y_test = all_y[-365:]
predictions_q = np.array([load >= month_quant[month-1] for load, month in zip(load_test, month_test)])
highest_l = [-42]*12
dispatch = [None]*365
for i, (l, m) in enumerate(zip(X_test['load_n'], df['month'][-365:])):
if highest_l[m-1] == -42:
highest_l[m-1] = l
dispatch[i] = True
elif l < highest_l[m-1]:
dispatch[i] = False
elif l >= highest_l[m-1]:
dispatch[i] = True
highest_l[m-1] = l
dispatch = np.array(dispatch)
dispatch[~predictions_q] = 0
dispatch2 = [None]*365
loads = list(X_test['load_n'])
months_d = list(df['month'][-365:])
for i, (l, m) in enumerate(zip(loads[:-1], months_d[:-1])):
if l > loads[i+1] and months_d[i] == months_d[i+1]:
dispatch2[i] = True
else:
dispatch2[i] = False
dispatch2[-1] = False
dispatch2 = np.array(dispatch2)
predictions_nn[~dispatch] = 0
predictions_nn[~dispatch2] = 0
df_r = run_testing_gambit(predictions_nn, y_test)
df_r.plot()
Out[349]:
In [350]:
df_r.head()
Out[350]:
In [355]:
m = predictions_nn[predictions_nn != 0]
m.shape
Out[355]:
In [354]:
predictions_nn
Out[354]:
In [ ]: