REDD - Data Collection and Preparation (Low Frequency)

This document tabulates the data from the REDD dataset for load classification.

The window considered is 5 minutes (300 seconds).

Experimental procedures based on http://www.sbrt.org.br/sbrt2017/anais/1570359866.pdf
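
A quick arithmetic check of the chunk size these choices imply (the 3 s sampling period is configured further below):

In [ ]:
# 5-minute window at a 3 s sampling period -> 100 samples per chunk
window_seconds = 5 * 60                    # 300 s
sampling_period = 3                        # seconds between samples
print(window_seconds // sampling_period)   # 100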


In [1]:
import warnings
#warnings.filterwarnings("ignore")  # uncomment to silence library warnings
import traceback

import time
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use('ggplot')

from matplotlib import rcParams
rcParams['figure.figsize'] = (13, 10)

import pandas as pd

from tqdm import tqdm, tqdm_notebook
#tf.debugging.set_log_device_placement(True)
#print("GPU Available: ", tf.test.is_gpu_available())


Loading the REDD data via NILMTK


In [ ]:
from nilmtk import DataSet
from nilmtk.utils import print_dict

redd = DataSet('datasets/REDD/low_freq.h5')
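
If 'datasets/REDD/low_freq.h5' has not been generated yet, NILMTK's REDD converter can build it from the raw download; a minimal sketch, assuming the raw files live in datasets/REDD/low_freq:

In [ ]:
# One-time conversion of the raw REDD download to HDF5 (paths are assumptions)
# from nilmtk.dataset_converters import convert_redd
# convert_redd('datasets/REDD/low_freq', 'datasets/REDD/low_freq.h5')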

In [ ]:
# Sampling settings
from datetime import datetime, timedelta

# Settings based on the CS446 project: Electric Load Identification using Machine Learning (REDD)
building_idx = 3
set_sampling_rate = 3  # sampling period, in seconds
start = datetime(2011, 4, 16, 5, 11, 27)
end = datetime(2011, 5, 31, 0, 19, 54)
time_interval_minutes = 5  # chunk (sample split) length


# ... 6 seconds - Imaging Time Series  (UK-DALE)
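
A quick sanity check on how many 5-minute chunks the global interval should yield:

In [ ]:
# timedelta division (Python 3) gives the expected chunk count directly
print(int((end - start) / timedelta(minutes=time_interval_minutes)))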

Data pre-processing


Delimit the data (global time interval and measurement windows) and export it.


In [ ]:
building_idx = 1  # overrides the value set above; building 1 is used from here on

building = redd.buildings[building_idx]

In [ ]:
# Available devices in building
building.elec

In [ ]:
# Defining the fixed time block measurement
redd.set_window(start='2011-04-18', end='2011-04-20')

In [ ]:
# Showing each meter's consumption (inside the time block)
num_apps = 20
fig, axes = plt.subplots((num_apps+1)//2, 2, figsize=(24, num_apps*2))
for i in range(num_apps):
    e = redd.buildings[building_idx].elec[i+1]
    axes.flat[i].plot(e.power_series_all_data(sample_period=set_sampling_rate), alpha=0.6)
    axes.flat[i].set_title(e.label(), fontsize='15')
fig.tight_layout()
fig.subplots_adjust(top=0.95)

Chunking energy consumption into time boxes (1 box = 5 minutes)


In [ ]:
# Data-generation intervals (chunk boundaries)
def datetime_range(start, end, delta):
    '''
    Generating a list of datetime intervals (chunks of energy consumption)
    from `start` to `end` at each `delta` units.
    '''
    current = start
    while current < end:
        yield current
        current += delta

# List of datetime strings marking the chunk boundaries
dts = [dt.strftime('%Y-%m-%d %H:%M:%S') 
           for dt in 
               datetime_range(start, 
                              end, 
                              timedelta(minutes=time_interval_minutes))
      ]

In [ ]:
# Checking the chunk list...
for idx in range(1, len(dts)):
    print('from', dts[idx-1], ' to ', dts[idx])

In [ ]:
power = building.elec[1].power_series_all_data(sample_period=set_sampling_rate)
mains1 = pd.DataFrame(data = {"Power": power.values }, index=power.index)
print('Mains 1 original shape: ', mains1.shape)
power = building.elec[2].power_series_all_data(sample_period=set_sampling_rate)
mains2 = pd.DataFrame(data = {"Power": power.values }, index=power.index)
print('Mains 2 original shape: ', mains2.shape)

# Meter 5 is the target appliance (the refrigerator, per the analysis below)
power = building.elec[5].power_series_all_data(sample_period=set_sampling_rate)
appliance = pd.DataFrame(data = {"Power": power.values }, index=power.index)
print('Appliance shape: ', appliance.shape)

In [ ]:
len(appliance.index)

In [ ]:
# Collection of dataframes (5-minute chunks)
dataframes = []

# Iterating over the 5-minute time blocks
for idx in tqdm_notebook(range(1, len(dts))):

    tStart = dts[idx-1]
    tEnd = dts[idx]

    # Restrict the dataset window to the current chunk
    redd.set_window(start=tStart, end=tEnd)
    try:
        print('- Chunk #', idx, ': from', tStart, 'to', tEnd)

        dfs = {}
        _index = []
        for m in building.elec.all_meters():

            label = str(m.label()).lower().replace(' ','_') + '_' + str(m.instance())
            power = m.power_series_all_data(sample_period=set_sampling_rate)

            dfs[label] = pd.DataFrame(data = {"Power": power.values }, index=power.index)

            # Use the longest non-mains index as the chunk's common index
            if len(_index) < len(power.index) and ('site_meter' not in label):
                _index = power.index

        # Align every meter (mains included) to the common index
        for meter_label in dfs:
            dfs[meter_label] = dfs[meter_label].reindex(index=_index)
            dfs[meter_label] = dfs[meter_label]['Power'].values

        df = pd.DataFrame(dfs, index = _index)
        print('\n---\nDataframe shape: ', df.shape,'\n')

        dataframes.append(df)
    except Exception as e:
        print(' ----- Error:', str(e))  # use traceback.format_exc() for the full trace

In [ ]:
print('Total Chunks:', len(dataframes))

In [ ]:
# Keep only the chunks with the expected length (300 s / 3 s = 100 samples)
valid_chunk_length = (time_interval_minutes*60)/set_sampling_rate
valid_chunks = [d for d in dataframes if d.shape[0] == valid_chunk_length]

print('Valid Chunks:', len(valid_chunks) )

In [ ]:
# Plotting the first chunk (the loop breaks after one iteration)
for df in tqdm(dataframes):
    fig = plt.figure(figsize=(20,10))
    for column in df.columns:
        plt.plot(df[column].values)
    plt.gca().legend(df.columns)
    break

Feature Extraction / Label Building


In [ ]:
# One row per chunk: statistical features per mains + binary label per appliance
rows = []
classes = [c for c in dataframes[0].columns if 'site_meter' not in c]
for df in dataframes:
    attributes = {
        'mean_1': df['site_meter_1'].mean(),
        'std_1': df['site_meter_1'].std(),
        'max_1': df['site_meter_1'].max(),
        'min_1': df['site_meter_1'].min(),
        'sum_1': df['site_meter_1'].sum(),
        'mean_2': df['site_meter_2'].mean(),
        'std_2': df['site_meter_2'].std(),
        'max_2': df['site_meter_2'].max(),
        'min_2': df['site_meter_2'].min(),
        'sum_2': df['site_meter_2'].sum()
    }
    # Binary label per appliance: 1 if it drew any power in the chunk
    labels = {c: (1 if df[c].sum() > 0 else 0) for c in classes}
    rows.append({**attributes, **labels})

# pd.DataFrame(rows) replaces the deprecated per-row DataFrame.append
final_df = pd.DataFrame(rows)
final_df = final_df[ list(attributes.keys()) + list(labels.keys()) ]

In [ ]:
final_df.head(10)

In [ ]:
final_df.describe()

In [ ]:
final_df.to_csv('df_building_1_statistics_features.csv')
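
The exported CSV can be reloaded later with its saved index (a usage note):

In [ ]:
# Reload check: index_col=0 restores the index written by to_csv above
final_df_loaded = pd.read_csv('df_building_1_statistics_features.csv', index_col=0)
final_df_loaded.shape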

In [ ]:
# TODO:
#- Validate the methodology
#- Validate the generated chunks (normalize errors)
#- Label the dataset (binary labels per device)

Modeling


In [ ]:
final_df[final_df.columns[:10]].head()


In [ ]:
from skmultilearn.adapt import MLkNN
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Note: only the first 5 columns (mains-1 statistics) are used as features;
# columns 10 onward are the binary appliance labels
scores = cross_val_score(
    MLkNN(k=3),
    final_df[final_df.columns[:5]].values,
    final_df[final_df.columns[10:]].values,
    scoring = 'f1_micro',
    cv=5,
    n_jobs = 8
)

scores.mean()
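
GridSearchCV is imported above but never used; a minimal sketch of tuning MLkNN's k with it (the grid values are an assumption):

In [ ]:
# Hypothetical k grid; same features/labels as the cross-validation above
parameters = {'k': range(1, 6)}
clf = GridSearchCV(MLkNN(), parameters, scoring='f1_micro', cv=5)
clf.fit(final_df[final_df.columns[:5]].values,
        final_df[final_df.columns[10:]].values)
print(clf.best_params_, clf.best_score_)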

In [ ]:
from sklearn.metrics import make_scorer, hamming_loss
# hamming_loss is a loss (lower is better); for model selection this scorer would
# need greater_is_better=False, but here it only reports the raw value
hamming_score = make_scorer(hamming_loss)

scores = cross_val_score(
    MLkNN(k=3),
    final_df[final_df.columns[:5]].values,
    final_df[final_df.columns[10:]].values,
    scoring = hamming_score,
    cv=5,
    n_jobs = 8
)

scores.mean()
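
To make the metric concrete: the hamming loss is the fraction of individual label assignments that are wrong. A tiny worked example on toy data:

In [ ]:
import numpy as np
y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 1, 1], [0, 1, 0]])
print(hamming_loss(y_true, y_pred))  # 1 wrong label out of 6 -> ~0.167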


In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score

# RandomForestClassifier handles multilabel targets natively
scores = cross_val_score(
    RandomForestClassifier(n_estimators=1000),
    final_df[final_df.columns[:5]].values,
    final_df[final_df.columns[10:]].values,
    scoring = hamming_score,
    cv=5,
    n_jobs = 8
)

scores.mean()
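
Complementing the cross-validation, a minimal single hold-out evaluation using the imports above (split parameters are assumptions):

In [ ]:
X_tr, X_te, y_tr, y_te = train_test_split(
    final_df[final_df.columns[:5]].values,
    final_df[final_df.columns[10:]].values,
    test_size=0.3, random_state=42)
rf = RandomForestClassifier(n_estimators=1000).fit(X_tr, y_tr)
print('Hold-out hamming loss:', hamming_loss(y_te, rf.predict(X_te)))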

Additional analyses

Note: the cells below assume a dataframe `df` with Mains1, Mains2 and Appliance columns (the two mains plus the target appliance loaded above) covering the full measurement period, rather than the 5-minute chunk dataframes.


In [ ]:
dates = [str(time)[:10] for time in df.index.values]
dates = sorted(list(set(dates)))
print('The modeled residence contains measurements from {} day(s) (from {} to {}).'.format(len(dates), dates[0], dates[-1]))

In [ ]:
# Train, validation and test split
df1_train = df.loc[:dates[10]]
df1_val = df.loc[dates[11]:dates[16]]
df1_test = df.loc[dates[17]:]
print('df_train.shape: ', df1_train.shape)
print('df_val.shape: ', df1_val.shape)
print('df_test.shape: ', df1_test.shape)

In [ ]:
# Using mains 1 and 2 (independent variables) to predict the refrigerator (dependent variable)
X_train = df1_train[['Mains1','Mains2']].values
y_train = df1_train['Appliance'].values

X_test = df1_test[['Mains1','Mains2']].values
y_test = df1_test['Appliance'].values

X_val = df1_val[['Mains1','Mains2']].values
y_val = df1_val['Appliance'].values

print(
    'Train: ', X_train.shape, y_train.shape, '\n',
    'Validation: ', X_val.shape, y_val.shape, '\n',
    'Test: ', X_test.shape, y_test.shape
)

MLP


In [ ]:
# Regression evaluation metrics
import numpy as np

def mse_loss(y_predict, y):
    return np.mean(np.square(y_predict - y))

def mae_loss(y_predict, y):
    return np.mean(np.abs(y_predict - y))

In [ ]:
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model

def build_fc_model(layers):
    '''Fully connected regression network: layers = [n_inputs, hidden..., n_outputs].'''
    fc_model = Sequential()
    for i in range(len(layers) - 1):
        if i == 0:
            fc_model.add( Dense(units=layers[i+1], input_shape=(layers[i],)) )
        else:
            fc_model.add( Dense(units=layers[i+1]) )
        # ReLU and Dropout only on hidden layers; the linear output layer is left
        # untouched (the original applied Dropout after the output layer as well)
        if i < (len(layers) - 2):
            fc_model.add( Activation('relu') )
            fc_model.add( Dropout(0.5) )
    fc_model.summary()
    plot_model(fc_model)
    return fc_model

fc_model_1 = build_fc_model([2, 256, 512, 1024, 1])

In [ ]:
adam = Adam(learning_rate = 1e-5)
fc_model_1.compile(loss='mean_squared_error', optimizer=adam)
start = time.time()
model_path = "./resources/mlp_fridge_h1.hdf5"
checkpointer = ModelCheckpoint(filepath=model_path, verbose=0, save_best_only=True)
hist_fc_1 = fc_model_1.fit( X_train, y_train,
                    batch_size=512, verbose=1, epochs=200,
                    validation_split=0.33, callbacks=[checkpointer])
print('Total model training time (s):', round(time.time() - start, 0))

In [ ]:
fc_model = load_model(model_path)
pred_fc = fc_model.predict(X_test).reshape(-1)
mse_loss_fc = mse_loss(pred_fc, y_test)
mae_loss_fc = mae_loss(pred_fc, y_test)
print('Test-set MSE: ', mse_loss_fc)
print('Test-set MAE:', mae_loss_fc)

In [ ]:
train_loss = hist_fc_1.history['loss']
val_loss = hist_fc_1.history['val_loss']

def plot_losses(train_loss, val_loss):
    plt.rcParams["figure.figsize"] = [24, 10]
    plt.title('Train and validation MSE - Residence 1')
    plt.plot( range(len(train_loss)), train_loss, color = 'b', alpha = 0.6, label='loss (train)' )
    plt.plot( range(len( val_loss )), val_loss, color = 'r', alpha = 0.6, label='loss (validation)' )
    plt.xlabel( 'epoch' )
    plt.ylabel( 'loss' )
    plt.legend()

plot_losses(train_loss, val_loss)

In [ ]:
# Plotting the REAL and PREDICTED refrigerator consumption over the 6 test days
def plot_each_app(df, dates, predict, y_test, title, look_back = 0):
    num_date = len(dates)
    fig, axes = plt.subplots(num_date, 1, figsize=(24, num_date*5))
    plt.suptitle(title, fontsize = '25')
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    l = 0
    for i in range(num_date):
        ind = df.loc[dates[i]].index[look_back:]  # .loc replaces the removed .ix
        axes.flat[i].plot(ind, y_test[l:l+len(ind)], color = 'blue', alpha = 0.6, label = 'REAL')
        axes.flat[i].plot(ind, predict[l:l+len(ind)], color = 'red', alpha = 0.6, label = 'PREDICTED')
        axes.flat[i].legend()
        l += len(ind)  # advance the offset cumulatively (the original reset it each day)

plot_each_app(df1_test, dates[17:], pred_fc, y_test,
              'FC Neural Network: real vs. predicted over the 6 test days of Residence 1', look_back = 50)

In [ ]:
# TODO: try a more complex FC network

LSTM


In [ ]:
def build_lstm_model(layers, look_back=50):
    '''Stacked LSTM regression network: layers = [n_features, lstm_units..., n_outputs].
    Note: the original first layer was an Embedding, which expects integer tokens;
    for continuous power readings an LSTM input layer is used instead.'''
    model = Sequential()
    n_lstm = len(layers) - 2
    for i in range(n_lstm):
        kwargs = {'input_shape': (look_back, layers[0])} if i == 0 else {}
        model.add(LSTM(units=layers[i+1],
                       return_sequences=(i < n_lstm - 1),
                       **kwargs))
        model.add(Dropout(0.3))

    model.add(Dense(layers[-1]))

    model.summary()
    plot_model(model)
    return model

model = build_lstm_model([2, 64, 128, 256, 1])

In [ ]:
# Using a sliding window of `look_back` consumption readings to predict each appliance's power
def process_data(df, dates, x_features, y_features, look_back = 50):
    n_feat = len(x_features)
    first = True
    for date in dates:
        data = df.loc[date]
        len_data = data.shape[0]
        x = np.array([data[x_features].values[j:j+look_back]
                      for j in range(len_data - look_back) ]).reshape(-1, look_back, n_feat)
        y = data[y_features].values[look_back:,:]
        if first:
            X, Y = x, y
            first = False
        else:
            X = np.append(X, x, axis=0)
            Y = np.append(Y, y, axis=0)
    return X, Y

start = time.time()
X_train, y_train = process_data(df, dates[:17], ['Mains1','Mains2'], df.columns.values[2:])
X_test, y_test = process_data(df, dates[17:], ['Mains1','Mains2'], df.columns.values[2:])
print('Total execution time (s): ', time.time() - start)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [ ]:
start = time.time()
adam = Adam(learning_rate = 5e-5)
lstm_model_path = "./resources/lstm_model.hdf5"
model.compile(loss='mean_squared_error', optimizer=adam)
checkpointer = ModelCheckpoint(filepath=lstm_model_path, verbose=0, save_best_only=True)
hist_lstm = model.fit(
            X_train,
            y_train[:,2],  # third appliance column as the regression target
            batch_size=512,
            verbose=1,
            epochs=200,
            validation_split=0.3,
            callbacks=[checkpointer])
print('Training time (s): ', time.time() - start)

In [ ]:
y_train
