http://machinelearningmastery.com/time-series-forecasting-long-short-term-memory-network-python/
http://machinelearningmastery.com/multi-step-time-series-forecasting/
http://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
https://github.com/fchollet/keras/blob/master/examples/stateful_lstm.py
There are at least four commonly used strategies for making multi-step forecasts. They are:

1. Direct Multi-step Forecast Strategy
2. Recursive Multi-step Forecast Strategy
3. Direct-Recursive Hybrid Multi-step Forecast Strategies
4. Multiple Output Forecast Strategy
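To make the first two strategies concrete, here is a minimal sketch (not part of this notebook's pipeline); `one_step_model` and `direct_models` are hypothetical stand-ins for sklearn-style regressors already trained on lagged features:

# Sketch only: recursive vs. direct multi-step forecasting with hypothetical,
# already-trained sklearn-style regressors (one model vs. one model per horizon).
import numpy as np

def recursive_forecast(one_step_model, history, n_out):
    """Predict one step at a time and feed each prediction back in as input."""
    window = list(history)
    preds = []
    for _ in range(n_out):
        x = np.array(window[-len(history):]).reshape(1, -1)
        yhat = one_step_model.predict(x)[0]
        preds.append(yhat)
        window.append(yhat)
    return np.array(preds)

def direct_forecast(direct_models, history, n_out):
    """Use one separately trained model per forecast horizon h = 0..n_out-1."""
    x = np.array(history).reshape(1, -1)
    return np.array([direct_models[h].predict(x)[0] for h in range(n_out)])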
In [1]:
# imports
import sys, os, argparse, logging # NOQA
from pprint import pprint
from tqdm import tqdm
import importlib
from IPython.core.debugger import Tracer
#Tracer()()
### workaround to keep the Jupyter notebook from dying on heavy stdout output
stdout = sys.stdout
#sys.stdout = sys.__stdout__ # did not work to restore print -> console
#sys.stdout = open('keras_output.txt', 'a+')
#sys.stdout = stdout
import utils
_ = importlib.reload(utils)
from utils import *
import twBase
_ = importlib.reload(twBase)
from twBase import *
%matplotlib inline
np.random.seed(42)
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
In [ ]:
import cfg
_ = importlib.reload(cfg)
from cfg import Config
import PrognoseModel as Model
_ = importlib.reload(Model)
import PrognoseRun as Run
_ = importlib.reload(Run)
config_file = "./data/out/RNN3.1.48.5.01/config.json"
#config_file = "./data/out/MLP2.336.48.5.01/config.json"
args = Run.process_command_line(["--load_config", config_file, "train"])
P = Config()(args)
In [ ]:
fn = os.path.join(P.DATA.BASE_DIR, "regr.336.48.5.h5")
log.info("Opening", fn=fn)
preds_regr = load_h5(fn, 'regr')
preds_regr.shape
In [ ]:
fn = P.DATA.H5_FN
log.info("Opening", fn=fn, model=P.MODEL.NAME)
X = load_h5(fn, 'X_test')
Y = load_h5(fn, 'Y_test')
preds_mlp = load_h5(fn, P.MODEL.NAME)
X.shape, preds_mlp.shape, Y.shape
data = load_h5(fn, 'test_scaled')
data.shape
In [ ]:
from sklearn.metrics import mean_squared_error
idx = np.random.randint(0,X.shape[0])
d = data[idx,:]
d = d[P.DATA.LABEL_IDX::P.MODEL.NVARIATE] # take only every NVARIATE-th value, starting at LABEL_IDX
x = X[idx,:P.MODEL.N_IN*P.MODEL.NVARIATE] # the last entries are the features of the future/N_OUT timesteps
x = x[P.DATA.LABEL_IDX::P.MODEL.NVARIATE] # take only every NVARIATE-th value, starting at LABEL_IDX
y = Y[idx,:]
p_regr = preds_regr[idx,:]
p_mlp = preds_mlp[idx,:]
legend_regr = "regr [MSE: {:.3f}]".format(mean_squared_error(y, p_regr))
legend_mlp = "mlp [MSE: {:.3f}]".format(mean_squared_error(y, p_mlp))
x.shape, y.shape, p_regr.shape
fig, axs = plt.subplots(1, figsize=(20,10))
_ = fig.suptitle("Energy Consumption Forecast Plant LPZ", fontsize=26)
_ = axs.set_title("Prognosis with Model: {}".format(P.MODEL.NAME), fontsize=10)
#axs.set_title("Title for first plot")
t = np.arange(0, len(d))
_ = axs.plot(t, d)
#t = np.arange(0, len(x))
#_ = axs.plot(t, x)
t = np.arange(len(x), len(x)+len(y))
_ = axs.plot(t, y, color='b')
_ = axs.plot(t, p_regr, color='r')
_ = axs.plot(t, p_mlp, color='g')
_ = axs.legend(["data", "truth", legend_regr, legend_mlp])
if len(x) > len(y):
    xmin, xmax = axs.get_xlim()
    _ = axs.set_xlim([xmax/2, xmax])
_ = plt.axvline(x=len(x))
In [ ]:
from sklearn.metrics import mean_squared_error
idx = np.random.randint(0,X.shape[0])
correct = 0 # factor for shifting to the correct point in time
if P.MODEL.SEQ_LEN is not None:
    d = data[idx:idx+P.MODEL.SEQ_LEN+P.MODEL.N_OUT, P.DATA.LABEL_IDX]
    x = X[idx, :, P.DATA.LABEL_IDX]
    correct = 67 # factor for shifting to the correct point in time
else:
    d = data[idx, :]
    d = d[P.DATA.LABEL_IDX::P.MODEL.NVARIATE] # take only every NVARIATE-th value, starting at LABEL_IDX
    x = X[idx, :P.MODEL.N_IN*P.MODEL.NVARIATE] # the last entries are the features of the future/N_OUT timesteps
y = Y[idx,:]
if correct > idx:
    p_regr = np.zeros(len(y))
else:
    p_regr = preds_regr[idx-correct, :]
p_mlp = preds_mlp[idx,:]
d.shape, x.shape, y.shape, p_regr.shape
legend_regr = "regr [MSE: {:.3f}]".format(mean_squared_error(y, p_regr))
legend_mlp = "mlp [MSE: {:.3f}]".format(mean_squared_error(y, p_mlp))
fig, axs = plt.subplots(1, figsize=(20,10))
_ = fig.suptitle("Energy Consumption Forecast Plant LPZ", fontsize=26)
_ = axs.set_title("Prognosis with Model: {}".format(P.MODEL.NAME), fontsize=10)
#axs.set_title("Title for first plot")
t = np.arange(0, len(d))
_ = axs.plot(t, d)
#t = np.arange(0, len(x))
#_ = axs.plot(t, x)
t = np.arange(len(x), len(x)+len(y))
#_ = axs.plot(t, y, color='b')
_ = axs.plot(t, p_regr, color='r')
_ = axs.plot(t, p_mlp, color='g')
_ = axs.legend(["truth", legend_regr, legend_mlp])
if len(x) > len(y):
    xmin, xmax = axs.get_xlim()
    _ = axs.set_xlim([xmax/2, xmax])
_ = plt.axvline(x=len(x))
In [13]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import cfg
_ = importlib.reload(cfg)
from cfg import Config
import PrognoseModel as Model
_ = importlib.reload(Model)
import PrognoseRun as Run
_ = importlib.reload(Run)
import DataHarness as DH
_ = importlib.reload(DH)
#config_file = "./data/out/RNN3.1.48.5.01/config.json"
#config_file = "./data/out/MLP2.336.48.5.01/config.json"
#args = Run.process_command_line(["--load_config", config_file, "train"])
args = Run.process_command_line(["train"])
P = Config()(args)
dh = DH.DataHarness(P)
In [14]:
P.DATA.COLS
Out[14]:
In [23]:
#data = load_h5(fn, 'data')
X = dh.generate_period(dh.data, period=24)
dh.data.shape, X.shape
Out[23]:
In [24]:
# flatten the daily features
X = X.reshape(-1, X.shape[1]*X.shape[2])
X.shape
Out[24]:
In [25]:
X = StandardScaler().fit_transform(X)
X.max(), X.min()
Out[25]:
In [26]:
from sklearn.cluster import KMeans
random_state = 170
y_pred = KMeans(n_clusters=3, random_state=random_state).fit(X)
y_pred.labels_
Out[26]:
https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf
MinPts: As a rule of thumb, a minimum minPts can be derived from the number of dimensions D in the data set, as minPts ≥ D + 1. minPts must be at least 3. However, larger values are usually better for data sets with noise and will yield more significant clusters. The larger the data set, the larger minPts should be chosen.
ε: The value for ε can then be chosen by using a k-distance graph, plotting the distance to the k = minPts nearest neighbor. Good values of ε are where this plot shows a strong bend: if ε is chosen much too small, a large part of the data will not be clustered; whereas for too high a value of ε, clusters will merge and the majority of objects will end up in the same cluster. In general, small values of ε are preferable, and as a rule of thumb only a small fraction of points should lie within this distance of each other.
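Translated into code, these rules of thumb might look roughly as follows (a sketch only, assuming the scaled feature matrix X from above; ε is still read off the knee of the sorted k-distance curve by eye, which is exactly what the next cell plots via kneighbors_graph, and the notebook itself ends up using min_samples=10 and eps=1.5):

# Sketch only: DBSCAN parameter heuristics, not the values used below.
import numpy as np
from sklearn.neighbors import NearestNeighbors

min_pts = max(3, X.shape[1] + 1)     # rule of thumb: minPts >= D + 1, never below 3

# distance of every point to its min_pts-th nearest neighbour
# (the query set equals the fit set, so each point's nearest neighbour is itself)
nn = NearestNeighbors(n_neighbors=min_pts).fit(X)
dist, _ = nn.kneighbors(X)
k_dist = np.sort(dist[:, -1])[::-1]  # sorted descending, like the plot in the next cell
# eps is then chosen at the "knee" of k_dist by visual inspection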
In [74]:
from sklearn.neighbors import kneighbors_graph
n_neighbors = 4
# get the k-neighbors per point
A = kneighbors_graph(X, n_neighbors, mode='distance')
A = A.toarray()
A.shape
# get only the k distances
#A[0][A[0]>0]
# get only the k-th distance
B = A.max(axis=1)
# sort in reverse order
B[::-1].sort()
fig = plt.figure(figsize=(20,10))
_ = plt.plot(B)
Out[74]:
In [120]:
db = DBSCAN(eps=1.5, min_samples=10).fit(X)
db.labels_
Out[120]:
In [121]:
img = db.labels_.reshape(1,-1)
img2 = y_pred.labels_.reshape(1,-1)
# get the shift data
d = dh.load(P.DATA.DATA_FN, 'Shift')
d = d[::24][:-1]
img3 = d.reshape(1,-1)
img.shape, img2.shape, img3.shape
import matplotlib
def get_color(colors):
    # red: noise, grey: workday, yellow: Saturday, blue: no work
    levels = [-1, 0, 1, 2]
    #colors = ['red', 'blue', 'grey', 'yellow']
    cmap, norm = matplotlib.colors.from_levels_and_colors(levels=levels, colors=colors, extend='max')
    return cmap, norm
#ax.scatter(x,y,c=timeDiffInt, s=150, marker='<', edgecolor='none', cmap=cmap, norm=norm)
fig, axs = plt.subplots(3, 1, figsize=(20,10))
_ = axs[0].set_title("DBSCAN Clustering")
colors = ['red', 'blue', 'grey', 'yellow']
cmap, norm = get_color(colors)
_ = axs[0].pcolor(img, cmap=cmap, norm=norm)
_ = axs[1].set_title("k-Means Clustering with k=3")
colors = ['red', 'grey', 'blue', 'yellow']
cmap, norm = get_color(colors)
_ = axs[1].pcolor(img2, cmap=cmap, norm=norm)
_ = axs[2].set_title("Original shift plan")
colors = ['red', 'blue', 'yellow', 'grey']
cmap, norm = get_color(colors)
_ = axs[2].pcolor(img3, cmap=cmap, norm=norm)
Out[121]:
In [89]:
d.max()
Out[89]:
In [ ]: