Forecast for Plant LPZ

http://machinelearningmastery.com/time-series-forecasting-long-short-term-memory-network-python/
http://machinelearningmastery.com/multi-step-time-series-forecasting/
http://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
https://github.com/fchollet/keras/blob/master/examples/stateful_lstm.py

  1. Load the dataset from the CSV file.
  2. Transform the dataset to make it suitable for the LSTM model (see the sketch below), including:
    • Transforming the data into a supervised learning problem.
    • Transforming the data to be stationary.
    • Transforming the data to the scale -1 to 1.
  3. Fit a stateful LSTM network model to the training data.
  4. Evaluate the fitted LSTM model on the test data.
  5. Report the performance of the forecasts.
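
A minimal sketch of the three transforms in step 2, assuming a univariate pandas Series named series that holds the load values; the helper name to_supervised and the window sizes are illustrative (they appear to mirror the N_IN=336 / N_OUT=48 configuration used later) and are not the notebook's own code:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# series: univariate pandas Series with the hourly load values (assumed to be loaded already)

def to_supervised(values, n_in, n_out):
    # frame the series as rows of (n_in lagged inputs -> n_out future outputs)
    df = pd.DataFrame(values)
    cols = [df.shift(i) for i in range(n_in, 0, -1)] + [df.shift(-i) for i in range(n_out)]
    return pd.concat(cols, axis=1).dropna().values

# stationarity: first-order differencing removes the trend
diff = series.diff().dropna()

# supervised framing: lag features as inputs, the next n_out steps as targets
supervised = to_supervised(diff.values, n_in=336, n_out=48)

# scaling: map everything to [-1, 1] to match the tanh output range of the LSTM
scaler = MinMaxScaler(feature_range=(-1, 1))
supervised_scaled = scaler.fit_transform(supervised)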

Multi-step Forecasting

There are at least four commonly used strategies for making multi-step forecasts.

They are:

  1. Direct Multi-step Forecast Strategy.
  2. Recursive Multi-step Forecast Strategy.
  3. Direct-Recursive Hybrid Multi-step Forecast Strategies.
  4. Multiple Output Forecast Strategy.
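
To make the contrast concrete, here is a hedged sketch of the recursive strategy against a generic one-step model with a scikit-learn style predict() method; the names one_step_model, models, and history are hypothetical. The notebook's own MLP/RNN models appear to follow the multiple output strategy instead, predicting all N_OUT future steps in a single forward pass.

import numpy as np

def recursive_forecast(model, history, n_out):
    # recursive strategy: predict one step, append the prediction to the
    # input window, and repeat until n_out steps have been produced
    window = list(history)
    preds = []
    for _ in range(n_out):
        x = np.array(window[-len(history):]).reshape(1, -1)
        yhat = float(model.predict(x)[0])
        preds.append(yhat)
        window.append(yhat)
    return np.array(preds)

# the direct strategy would instead train one separate model per horizon h:
# preds = [models[h].predict(x)[0] for h in range(n_out)]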

Variables/Functions


In [1]:
# imports
import sys, os, argparse, logging  # NOQA
from pprint import pprint
from tqdm import tqdm
import importlib

from IPython.core.debugger import Tracer
#Tracer()()

### prevent the Jupyter notebook from dying
stdout = sys.stdout
#sys.stdout = sys.__stdout__  # did not work to restore print -> console
#sys.stdout = open('keras_output.txt', 'a+')
#sys.stdout = stdout

import utils
_ = importlib.reload(utils)
from utils import *

import twBase
_ = importlib.reload(twBase)
from twBase import *

%matplotlib inline
np.random.seed(42)

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

Choose the model to display


In [ ]:
import cfg
_ = importlib.reload(cfg)
from cfg import Config

import PrognoseModel as Model
_ = importlib.reload(Model)

import PrognoseRun as Run
_ = importlib.reload(Run)

config_file = "./data/out/RNN3.1.48.5.01/config.json"
#config_file = "./data/out/MLP2.336.48.5.01/config.json"
args = Run.process_command_line(["--load_config", config_file, "train"])
P = Config()(args)

Data

Load Data and select columns


In [ ]:
fn = os.path.join(P.DATA.BASE_DIR, "regr.336.48.5.h5")
log.info("Opening", fn=fn)
preds_regr = load_h5(fn, 'regr')
preds_regr.shape

In [ ]:
fn = P.DATA.H5_FN
log.info("Opening", fn=fn, model=P.MODEL.NAME)
X = load_h5(fn, 'X_test')
Y = load_h5(fn, 'Y_test')
preds_mlp = load_h5(fn, P.MODEL.NAME)
X.shape, preds_mlp.shape, Y.shape

data = load_h5(fn, 'test_scaled')
data.shape

Graphs


In [ ]:
from sklearn.metrics import mean_squared_error

idx = np.random.randint(0,X.shape[0])
d = data[idx,:]
d = d[P.DATA.LABEL_IDX::P.MODEL.NVARIATE]  # take only every NVARIATE-th value, starting at the label index
x = X[idx,:P.MODEL.N_IN*P.MODEL.NVARIATE]  # the last indices are the features of the future/N_OUT timesteps
x = x[P.DATA.LABEL_IDX::P.MODEL.NVARIATE]  # take only every NVARIATE-th value, starting at the label index
y = Y[idx,:]
p_regr = preds_regr[idx,:]
p_mlp = preds_mlp[idx,:]
legend_regr = "regr [MSE: {:.3f}]".format(mean_squared_error(y, p_regr))
legend_mlp = "mlp [MSE: {:.3f}]".format(mean_squared_error(y, p_mlp))

x.shape, y.shape, p_regr.shape


fig, axs = plt.subplots(1, figsize=(20,10))
_ = fig.suptitle("Energy Consumption Forecast Plant LPZ", fontsize=26)
_ = axs.set_title("Prognosis with Model: {}".format(P.MODEL.NAME), fontsize=10)
#axs.set_title("Title for first plot")
t = np.arange(0, len(d))
_ = axs.plot(t, d)
#t = np.arange(0, len(x))
#_ = axs.plot(t, x)
t = np.arange(len(x), len(x)+len(y))
_ = axs.plot(t, y, color='b')
_ = axs.plot(t, p_regr, color='r')
_ = axs.plot(t, p_mlp, color='g')
_ = axs.legend(["data", "truth", legend_regr, legend_mlp])

if len(x) > len(y):
    xmin, xmax = axs.get_xlim()
    _ = axs.set_xlim([xmax/2,xmax])
_ = plt.axvline(x=len(x))

In [ ]:
from sklearn.metrics import mean_squared_error

idx = np.random.randint(0,X.shape[0])

correct = 0  # offset for shifting to the correct point in time
if P.MODEL.SEQ_LEN is not None:
    d = data[idx:idx+P.MODEL.SEQ_LEN+P.MODEL.N_OUT,P.DATA.LABEL_IDX]
    x = X[idx,:,P.DATA.LABEL_IDX]
    correct = 67  # offset for shifting to the correct point in time
else:
    d = data[idx,:]
    d = d[P.DATA.LABEL_IDX::P.MODEL.NVARIATE]  # take only every NVARIATE-th value, starting at the label index
    x = X[idx,:P.MODEL.N_IN*P.MODEL.NVARIATE]  # the last indices are the features of the future/N_OUT timesteps

y = Y[idx,:]

if correct > idx:
    p_regr = np.zeros(len(y))
else:
    p_regr = preds_regr[idx-correct,:]

p_mlp = preds_mlp[idx,:]
d.shape, x.shape, y.shape, p_regr.shape

legend_regr = "regr [MSE: {:.3f}]".format(mean_squared_error(y, p_regr))
legend_mlp = "mlp [MSE: {:.3f}]".format(mean_squared_error(y, p_mlp))

fig, axs = plt.subplots(1, figsize=(20,10))
_ = fig.suptitle("Energy Consumption Forecast Plant LPZ", fontsize=26)
_ = axs.set_title("Prognosis with Model: {}".format(P.MODEL.NAME), fontsize=10)
#axs.set_title("Title for first plot")
t = np.arange(0, len(d))
_ = axs.plot(t, d)
#t = np.arange(0, len(x))
#_ = axs.plot(t, x)
t = np.arange(len(x), len(x)+len(y))
#_ = axs.plot(t, y, color='b')
_ = axs.plot(t, p_regr, color='r')
_ = axs.plot(t, p_mlp, color='g')
_ = axs.legend(["truth", legend_regr, legend_mlp])

if len(x) > len(y):
    xmin, xmax = axs.get_xlim()
    _ = axs.set_xlim([xmax/2,xmax])
_ = plt.axvline(x=len(x))

Clustering

The model selection does not matter from here on; only the raw data is needed.


In [13]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

import cfg
_ = importlib.reload(cfg)
from cfg import Config

import PrognoseModel as Model
_ = importlib.reload(Model)

import PrognoseRun as Run
_ = importlib.reload(Run)


import DataHarness as DH
_ = importlib.reload(DH)

#config_file = "./data/out/RNN3.1.48.5.01/config.json"
#config_file = "./data/out/MLP2.336.48.5.01/config.json"
#args = Run.process_command_line(["--load_config", config_file, "train"])
args = Run.process_command_line(["train"])
P = Config()(args)

dh = DH.DataHarness(P)


/home/tw/nbs/Prognose/PrognoseRun.py
fn='./data/Lastprognose/2015.csv' event='Loading dataset'
event='Differencing...'
event='Transforming for supervised learning...'
event='Splitting data'
event='Rescaling data...'

In [14]:
P.DATA.COLS


Out[14]:
['Ist [MWh]']

In [23]:
#data = load_h5(fn, 'data')
X = dh.generate_period(dh.data, period=24)
dh.data.shape, X.shape


Out[23]:
((8760, 1), (364, 24, 1))

In [24]:
# flatten the daily features
X = X.reshape(-1, X.shape[1]*X.shape[2])
X.shape


Out[24]:
(364, 24)

In [25]:
X = StandardScaler().fit_transform(X)
X.max(), X.min()


Out[25]:
(1.5928342452793409, -2.5867967523967641)

In [26]:
from sklearn.cluster import KMeans
random_state = 170
y_pred = KMeans(n_clusters=3, random_state=random_state).fit(X)
y_pred.labels_


Out[26]:
array([1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2,
       1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0,
       0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 2, 1, 0,
       0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0,
       0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1,
       0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,
       0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2,
       1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0,
       0, 0, 0, 0, 2, 1, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

Finding MinPts and Epsilon for DBSCAN

https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf

MinPts: As a rule of thumb, a minimum value for minPts can be derived from the number of dimensions D in the data set, as minPts ≥ D + 1. minPts must be at least 3. However, larger values are usually better for data sets with noise and will yield more significant clusters. The larger the data set, the larger minPts should be.

ε: The value for ε can then be chosen using a k-distance graph, plotting each point's distance to its k-th (k = minPts) nearest neighbor, sorted. Good values of ε are where this plot shows a strong bend (knee): if ε is chosen much too small, a large part of the data will not be clustered, whereas if ε is too high, clusters merge and the majority of objects end up in a single cluster. In general, small values of ε are preferable, and as a rule of thumb only a small fraction of points should lie within this distance of each other.
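
The cell below plots exactly this k-distance curve for the daily profiles. As a rough, hedged sketch of reading ε off such a curve automatically, one could take the knee as the point that deviates most from the straight line joining the curve's endpoints; the helper estimate_eps is illustrative and not part of the notebook:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def estimate_eps(X, min_pts):
    # distance of every point to its min_pts-th nearest neighbor
    # (+1 because each query point is returned as its own nearest neighbor)
    nn = NearestNeighbors(n_neighbors=min_pts + 1).fit(X)
    dist, _ = nn.kneighbors(X)
    kdist = np.sort(dist[:, -1])[::-1]  # sorted in descending order

    # knee heuristic: the point deviating most from the chord between the endpoints
    n = len(kdist)
    chord = kdist[0] + (kdist[-1] - kdist[0]) * np.arange(n) / (n - 1)
    return kdist[np.argmax(np.abs(kdist - chord))]

# eps_guess = estimate_eps(X, min_pts=10)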


In [74]:
from sklearn.neighbors import kneighbors_graph
n_neighbors = 4

# get the k-neighbors per point
A = kneighbors_graph(X, n_neighbors, mode='distance')
A = A.toarray()
A.shape

# get only the k distances
#A[0][A[0]>0]

# get only the k-th distance
B = A.max(axis=1)

# sort in reverse order
B[::-1].sort()

fig = plt.figure(figsize=(20,10))
_ = plt.plot(B)


Out[74]:
(364, 364)

In [120]:
db = DBSCAN(eps=1.5, min_samples=10).fit(X)

db.labels_


Out[120]:
array([ 0, -1, -1,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,
        2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,
        1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,
        1,  1, -1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1, -1,
        0,  0,  0,  0,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,
        0,  1,  1,  1,  1,  0,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  0, -1,  0,  0,  1,
        1,  1,  1,  1,  2,  0,  0,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,
        1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,
       -1,  0,  1,  1, -1,  1,  1,  2,  0,  1,  1,  1,  1,  1, -1,  0,  1,  1,  1,  1,  1,  2,  0,
        1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,
        1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,
        1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1, -1,  0,
        0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1, -1,  0,  1,  1,  1,  1,  1, -1,  0,  1,
        1,  1,  1,  1,  0,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  0,
        1,  1,  2,  0,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1, -1,  1, -1,  0,  1,  1,  1,  1,  1,
       -1,  0,  1,  1, -1, -1,  0,  0,  0, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0])

In [121]:
img = db.labels_.reshape(1,-1)
img2 = y_pred.labels_.reshape(1,-1)

# get the shift data
d = dh.load(P.DATA.DATA_FN, 'Shift')
d = d[::24][:-1]
img3 = d.reshape(1,-1)

img.shape, img2.shape, img3.shape

import matplotlib
def get_color(colors):
    # red: noise, grey: workday, yellow: Saturday, blue: non-working day
    levels = [-1, 0, 1, 2]
    #colors = ['red', 'blue', 'grey', 'yellow']
    cmap, norm = matplotlib.colors.from_levels_and_colors(levels=levels, colors=colors, extend='max')
    return cmap, norm

#ax.scatter(x,y,c=timeDiffInt, s=150, marker='<', edgecolor='none', cmap=cmap, norm=norm)

fig, axs = plt.subplots(3, 1, figsize=(20,10))

_ = axs[0].set_title("DBSCAN Clustering")
colors = ['red', 'blue', 'grey', 'yellow']
cmap, norm = get_color(colors)
_ = axs[0].pcolor(img, cmap=cmap, norm=norm)

_ = axs[1].set_title("k-Means Clustering with k=3")
colors = ['red', 'grey', 'blue', 'yellow']
cmap, norm = get_color(colors)
_ = axs[1].pcolor(img2, cmap=cmap, norm=norm)

_ = axs[2].set_title("Original shift plan")
colors = ['red', 'blue', 'yellow', 'grey']
cmap, norm = get_color(colors)
_ = axs[2].pcolor(img3, cmap=cmap, norm=norm)


fn='./data/Lastprognose/2015.csv' event='Loading dataset'
Out[121]:
((1, 364), (1, 364), (1, 364))

In [89]:
d.max()


Out[89]:
2

In [ ]: