Análise exploratória da base REDD, aplicando Regressão de Séries Temporais baseada em Aprendizado Supervisionado (desagregação de consumo energético).
A base REDD contempla dados de consumo energético de 6 casas distintas. Primeiro, após a análise exploratória, será treinado e testado um modelo a partir dos dados da residência 1 e avaliado o quão bem esse modelo generaliza para os padrões não-observados da residência 2.
In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import datetime
import time, os
import math
import warnings
warnings.filterwarnings("ignore")
import glob
import nilmtk
In [2]:
PATH_DATASET = './datasets/REDD/'

def read_label(base_path=None, num_houses=6):
    """Read the appliance labels of every house in the REDD dataset.

    Parses ``low_freq/house_<i>/labels.dat`` — lines of ``<channel> <appliance>``
    — and builds a per-house mapping from channel number to a unique column
    name of the form ``<appliance>_<channel>`` (e.g. ``refrigerator_5``).

    Parameters
    ----------
    base_path : str, optional
        Root directory of the REDD dataset. Defaults to the module-level
        ``PATH_DATASET`` (keeps the original call ``read_label()`` working).
    num_houses : int, optional
        Number of houses to read; REDD ships 6.

    Returns
    -------
    dict[int, dict[int, str]]
        ``label[house][channel] == '<appliance>_<channel>'``.
    """
    if base_path is None:
        base_path = PATH_DATASET
    label = {}
    for i in range(1, num_houses + 1):
        # Build each path component explicitly instead of formatting after join.
        labels_file = os.path.join(base_path, 'low_freq', 'house_{}'.format(i), 'labels.dat')
        label[i] = {}
        with open(labels_file) as f:
            for line in f:
                parts = line.split(' ')
                label[i][int(parts[0])] = parts[1].strip() + '_' + parts[0]
    return label
# Load the channel labels once and show those of houses 1 and 2.
labels = read_label()
for house in (1, 2):
    print('Residência {}: '.format(house), labels[house], '\n')
In [3]:
def read_merge_data(house, base_path=None, house_labels=None):
    """Load every channel of one house and inner-join them on the timestamp.

    Each ``channel_<i>.dat`` file holds ``unix_time value`` pairs. All channels
    are merged on ``unix_time`` (inner join, so only timestamps present in
    every channel survive), then the epoch column becomes a DatetimeIndex.

    Parameters
    ----------
    house : int
        House number (1-6).
    base_path : str, optional
        Dataset root; defaults to the module-level ``PATH_DATASET``.
    house_labels : dict[int, str], optional
        ``{channel: column_name}`` for this house; defaults to the
        module-level ``labels[house]``.

    Returns
    -------
    pandas.DataFrame
        One float column per channel, indexed by timestamp.
    """
    if base_path is None:
        base_path = PATH_DATASET
    if house_labels is None:
        house_labels = labels[house]
    path = os.path.join(base_path, 'low_freq', 'house_{}'.format(house))
    num_apps = len(glob.glob(os.path.join(path, 'channel*')))
    df = None
    for i in range(1, num_apps + 1):
        file = os.path.join(path, 'channel_{}.dat'.format(i))
        # pd.read_table is deprecated; read_csv with an explicit separator
        # is the supported equivalent.
        data = pd.read_csv(file, sep=' ', names=['unix_time', house_labels[i]],
                           dtype={'unix_time': 'int64', house_labels[i]: 'float64'})
        df = data if df is None else pd.merge(df, data, how='inner', on='unix_time')
    # astype("datetime64[s]") on int64 is rejected by pandas 2.x;
    # to_datetime with unit='s' is the supported epoch-seconds conversion.
    df['timestamp'] = pd.to_datetime(df['unix_time'], unit='s')
    df = df.set_index(df['timestamp'].values)
    df.drop(['unix_time', 'timestamp'], axis=1, inplace=True)
    return df
# Build one merged DataFrame per house, for houses 1 and 2.
df = {house: read_merge_data(house) for house in range(1, 3)}
In [4]:
# Sanity check: dimensions and the last few rows of each house's data.
for house in (1, 2):
    print('Shape dos dados da Residência {}: '.format(house), df[house].shape)
    display(df[house].tail(3))
In [5]:
# Collect the sorted list of distinct measurement days ('YYYY-MM-DD') per house.
dates = {}
for house in (1, 2):
    day_strings = {str(ts)[:10] for ts in df[house].index.values}
    dates[house] = sorted(day_strings)
    print('Os dados da Residência {0} contém medições de {1} dia(s) (de {2} a {3}).'.format(
        house, len(dates[house]), dates[house][0], dates[house][-1]))
    print(dates[house], '\n')
In [6]:
# Plot the first 2 days of data from houses 1 and 2.
def plot_df(df, title):
    """Plot every column of ``df`` in its own subplot, two subplots per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Time-indexed consumption data; one curve per column.
    title : str
        Figure-level title.
    """
    apps = df.columns.values
    num_apps = len(apps)
    fig, axes = plt.subplots((num_apps + 1) // 2, 2, figsize=(24, num_apps * 2))
    for i, key in enumerate(apps):
        axes.flat[i].plot(df[key], alpha=0.6)
        axes.flat[i].set_title(key, fontsize='15')
    plt.suptitle(title, fontsize='30')
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)

for i in range(1, 3):
    # FIX: .ix was removed in pandas 1.0; .loc is the label-based equivalent
    # (partial-string slicing up to the second day still works).
    plot_df(df[i].loc[:dates[i][1]], 'Registros dos 2 primeiros dias da Residência {}'.format(i))
In [7]:
# Plot the total energy consumption of each appliance in both houses.
fig, axes = plt.subplots(1, 2, figsize=(24, 10))
plt.suptitle('Energia total consumida por cada aparelho', fontsize=30)
for house, ax in zip((1, 2), axes):
    appliance_cols = df[house].columns.values[2:]  # skip the two mains channels
    totals = df[house][appliance_cols].sum().sort_values(ascending=False)
    positions = np.arange(len(totals.index))
    ax.bar(positions, totals.values, alpha=0.6)
    plt.sca(ax)
    plt.xticks(positions, totals.index, rotation=90, fontsize=16)
    plt.title('Residência {}'.format(house))
Out[7]:
In [8]:
# Chronological train / validation / test split, by day.
# FIX: .ix was removed in pandas 1.0; .loc does the same label-based slicing.
df1_train = df[1].loc[:dates[1][10]]             # days 0-10
df1_val = df[1].loc[dates[1][11]:dates[1][16]]   # days 11-16
df1_test = df[1].loc[dates[1][17]:]              # remaining days
print('df_train.shape: ', df1_train.shape)
print('df_val.shape: ', df1_val.shape)
print('df_test.shape: ', df1_test.shape)
In [9]:
# Sample of the working base, with X = ('mains_1', 'mains_2') and Y = ('refrigerator_5').
# The disaggregation target is predicted row by row (vertically).
df_sample = df1_val.loc[:, ['mains_1', 'mains_2', 'refrigerator_5']]
df_sample.head(10)
Out[9]:
In [10]:
print('Dias compreendidos na leitura/desagregação:')
{str(dt).split(' ')[0] for dt in df_sample.index}
Out[10]:
In [11]:
# Mains 1 and 2 are the independent variables; the refrigerator is the target.
feature_cols = ['mains_1', 'mains_2']
target_col = 'refrigerator_5'
X_train1 = df1_train[feature_cols].values
y_train1 = df1_train[target_col].values
X_val1 = df1_val[feature_cols].values
y_val1 = df1_val[target_col].values
X_test1 = df1_test[feature_cols].values
y_test1 = df1_test[target_col].values
print(X_train1.shape, y_train1.shape, X_val1.shape, y_val1.shape, X_test1.shape, y_test1.shape)
In [12]:
# Regression evaluation metrics.
def mse_loss(y_predict, y):
    """Mean squared error between predictions and targets."""
    diff = y_predict - y
    return np.mean(diff * diff)

def mae_loss(y_predict, y):
    """Mean absolute error between predictions and targets."""
    return np.abs(y_predict - y).mean()

# The validation set will be used to tune the min_samples_split parameter.
min_samples_split = np.arange(2, 400, 10)
# Train one regression tree per candidate value of min_samples_split.
from sklearn.tree import DecisionTreeRegressor

def tree_reg(X_train, y_train, X_val, y_val, min_samples_split):
    """Fit one DecisionTreeRegressor per candidate ``min_samples_split``.

    Returns the list of fitted models and, aligned with it, each model's
    MSE on the validation set. Also prints the total fitting time.
    """
    fitted = []
    val_losses = []
    start = time.time()
    for candidate in min_samples_split:
        model = DecisionTreeRegressor(min_samples_split=candidate)
        model.fit(X_train, y_train)
        fitted.append(model)
        val_losses.append(mse_loss(model.predict(X_val), y_val))
    print('Tempo de execução (s): ', round(time.time() - start, 0))
    return fitted, val_losses

tree_clfs_1, tree_losses_1 = tree_reg(X_train1, y_train1, X_val1, y_val1, min_samples_split)
In [13]:
def plot_losses(losses, min_samples_split):
    """Bar plot of the validation loss for each candidate min_samples_split."""
    positions = np.arange(len(min_samples_split))
    width = 0.4
    plt.bar(positions, losses, width, alpha=0.35, color='b')
    plt.xlabel('min_samples_split', fontsize=30)
    plt.ylabel('loss', fontsize=30)
    plt.title('Loss (Validação) x min_samples_split', fontsize='25')
    # Center the tick labels under the bars.
    plt.xticks(positions + width / 2, min_samples_split, fontsize=20)
    plt.yticks(fontsize=20)
    plt.rcParams["figure.figsize"] = [24, 15]
    plt.tight_layout()

plot_losses(tree_losses_1, min_samples_split)
In [14]:
# Pick the model with the lowest validation loss and evaluate the
# refrigerator prediction on the held-out test set.
best_index = int(np.argmin(tree_losses_1))
tree_clf_1 = tree_clfs_1[best_index]
y_test_predict_1 = tree_clf_1.predict(X_test1)
mse_tree_1 = mse_loss(y_test_predict_1, y_test1)
mae_tree_1 = mae_loss(y_test_predict_1, y_test1)
print('MSE no Conjunto de Teste:', mse_tree_1)
print('MAE no Conjunto de Teste:', mae_tree_1)
In [15]:
# Plot the REAL and PREDICTED refrigerator consumption over the test days.
def plot_each_app(df, dates, predict, y_test, title, look_back=0):
    """Plot real vs. predicted consumption, one subplot per day.

    Parameters
    ----------
    df : pandas.DataFrame
        Time-indexed data covering all of ``dates``.
    dates : list[str]
        'YYYY-MM-DD' day strings, in the order the series were built.
    predict, y_test : array-like
        Predicted and real values, concatenated over all days.
    title : str
        Figure title.
    look_back : int, optional
        Number of leading samples of each day to skip (for windowed models).
    """
    num_date = len(dates)
    fig, axes = plt.subplots(num_date, 1, figsize=(24, num_date * 5))
    plt.suptitle(title, fontsize='25')
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    offset = 0
    for i in range(num_date):
        # FIX: .ix was removed in pandas 1.0; .loc is the label-based equivalent.
        ind = df.loc[dates[i]].index[look_back:]
        axes.flat[i].plot(ind, y_test[offset:offset + len(ind)], color='blue', alpha=0.6, label='REAL')
        axes.flat[i].plot(ind, predict[offset:offset + len(ind)], color='red', alpha=0.6, label='PREVISTO')
        axes.flat[i].legend()
        # BUG FIX: the original reset the offset with `l = len(ind)` instead of
        # accumulating it, so from the third day on the wrong slice was plotted.
        offset += len(ind)

plot_each_app(df1_test, dates[1][17:], y_test_predict_1, y_test1,
              'Consumo Real/Previsto do refrigerador nos 6 dias da Residência 1')
In [29]:
# Same independent variables as in training: mains 1 and 2, in the SAME order.
# BUG FIX: the original selected ['mains_2', 'mains_1'], silently swapping the
# two features relative to the training order ['mains_1', 'mains_2'] — tree
# features are positional, so the model would see the columns exchanged.
X_2 = df[2][['mains_1', 'mains_2']].values
y_2 = df[2]['refrigerator_9'].values
print(X_2.shape, y_2.shape)
In [30]:
# Apply the tree trained on house 1 to the entirely unseen data of house 2.
y_predict_2 = tree_clf_1.predict(X_2)
mse_tree_2 = mse_loss(y_predict_2, y_2)
mae_tree_2 = mae_loss(y_predict_2, y_2)
# Message fix: these numbers measure generalization to house 2, not the
# house-1 test set the previous cell reported on.
print('MSE na Residência 2:', mse_tree_2)
print('MAE na Residência 2:', mae_tree_2)
In [31]:
# Visualize how the house-1 model tracks the house-2 refrigerator.
chart_title = 'Modelo de Árvore de Decisão aplicado ao Refrigerador: treinado na Res. 1, prevendo na Res. 2'
plot_each_app(df[2], dates[2], y_predict_2, y_2, chart_title)