In [1]:
    
from __future__ import print_function, division
import time
from matplotlib import rcParams
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from six import iteritems
from nilmtk import DataSet, TimeFrame, MeterGroup, HDFDataStore
from nilmtk.disaggregate import CombinatorialOptimisation, FHMM
import nilmtk.utils
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
    
In [2]:
    
rcParams['figure.figsize'] = (13, 6)
    
In [3]:
    
train = DataSet('../datasets/REDD/low_freq.h5')
test = DataSet('../datasets/REDD/low_freq.h5')
    
Let us use building 1 for demo purposes.
In [4]:
    
building = 1
    
Let's split the data at April 30th: everything before goes to training and everything after to testing.
In [5]:
    
train.set_window(end="2011-04-30")
test.set_window(start="2011-04-30")
train_elec = train.buildings[building].elec
test_elec = test.buildings[building].elec
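
We can sanity-check the window split. A minimal sketch (get_timeframe() is part of the nilmtk meter API):
In [ ]:

# Confirm that the train and test windows cover the expected periods
print("Train timeframe:", train_elec.mains().get_timeframe())
print("Test timeframe: ", test_elec.mains().get_timeframe())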
    
In [18]:
    
train_elec.mains().plot()
plt.title('Train mains');

In [16]:
    
train_elec.submeters().plot()
plt.title('Train appliances');

In [19]:
    
test_elec.mains().plot()
plt.title('Test mains');

In [20]:
    
test_elec.submeters().plot()
plt.title('Test appliances');

The REDD dataset contains appliance-level data sampled every 3 or 4 seconds and mains data sampled every 1 second. Let us verify this.
In [8]:
    
fridge_meter = train_elec['fridge']
    
In [9]:
    
fridge_df = next(fridge_meter.load())
    
In [10]:
    
fridge_df.head()
    
    Out[10]:
In [11]:
    
mains = train_elec.mains()
    
In [12]:
    
mains_df = next(mains.load())
    
    
In [13]:
    
mains_df.head()
    
    Out[13]:
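The timestamp spacing gives a direct check on the sampling rates. A minimal sketch, assuming the fridge_df and mains_df loaded above:
In [ ]:

# The median gap between consecutive timestamps approximates the sample period
print("Fridge sample period:", fridge_df.index.to_series().diff().median())
print("Mains sample period: ", mains_df.index.to_series().diff().median())
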
Since the mains and the appliance meters are sampled at different rates, we will downsample both to a common resolution (a 2-minute sample period, set below). We will also select the top 5 appliances in terms of energy consumption and use them for training our FHMM and CO models.
In [14]:
    
top_5_train_elec = train_elec.submeters().select_top_k(k=5)
    
    
In [15]:
    
top_5_train_elec
    
    Out[15]:
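To sanity-check the selection, we can look at each submeter's share of the total submetered energy (a sketch using nilmtk's MeterGroup.fraction_per_meter()):
In [ ]:

# Energy fraction per submeter; the five largest should match the meters selected above
fraction = train_elec.submeters().fraction_per_meter().dropna()
print(fraction.sort_values(ascending=False).head(5))
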
In [16]:
    
def predict(clf, test_elec, sample_period, timezone):
    """Disaggregate the test mains chunk by chunk and return aligned
    ground-truth and prediction DataFrames."""
    pred = {}
    gt = {}

    # "ac_type" varies according to the dataset used.
    # Make sure to use the correct ac_type before using the default parameters in this code.
    for i, chunk in enumerate(test_elec.mains().load(physical_quantity='power', ac_type='apparent', sample_period=sample_period)):
        chunk_drop_na = chunk.dropna()
        pred[i] = clf.disaggregate_chunk(chunk_drop_na)
        gt[i] = {}
        for meter in test_elec.submeters().meters:
            # Only use the meters that we trained on (this saves time!)
            gt[i][meter] = next(meter.load(physical_quantity='power', ac_type='active', sample_period=sample_period))
        gt[i] = pd.DataFrame({k: v.squeeze() for k, v in iteritems(gt[i]) if len(v)},
                             index=next(iter(gt[i].values())).index).dropna()

    # If everything can fit in memory
    gt_overall = pd.concat(gt)
    gt_overall.index = gt_overall.index.droplevel()
    pred_overall = pd.concat(pred)
    pred_overall.index = pred_overall.index.droplevel()

    # Use the same column order in both DataFrames
    gt_overall = gt_overall[pred_overall.columns]

    # Keep only the timestamps present in both ground truth and predictions
    gt_index_utc = gt_overall.index.tz_convert("UTC")
    pred_index_utc = pred_overall.index.tz_convert("UTC")
    common_index_utc = gt_index_utc.intersection(pred_index_utc)

    common_index_local = common_index_utc.tz_convert(timezone)
    gt_overall = gt_overall.loc[common_index_local]
    pred_overall = pred_overall.loc[common_index_local]
    return gt_overall, pred_overall
    
In [17]:
    
classifiers = {'CO': CombinatorialOptimisation(), 'FHMM': FHMM()}
predictions = {}
sample_period = 120
for clf_name, clf in classifiers.items():
    print("*" * 20)
    print(clf_name)
    print("*" * 20)
    start = time.time()
    # Note that we pass the sample period to downsample the data to 2 minutes (120 s).
    # If, instead of the top 5, we wanted to train on all appliances, we could pass
    # train_elec.submeters() in place of top_5_train_elec.
    clf.train(top_5_train_elec, sample_period=sample_period)
    end = time.time()
    print("Runtime =", end - start, "seconds.")
    gt, predictions[clf_name] = predict(clf, test_elec, sample_period, train.metadata['timezone'])
    
    
Using prettier labels!
In [18]:
    
appliance_labels = [m.label() for m in gt.columns.values]
    
In [19]:
    
gt.columns = appliance_labels
predictions['CO'].columns = appliance_labels
predictions['FHMM'].columns = appliance_labels
    
In [20]:
    
gt.head()
    
    Out[20]:
In [21]:
    
predictions['CO'].head()
    
    Out[21]:
In [22]:
    
predictions['FHMM'].head()
    
    Out[22]:
In [23]:
    
predictions['CO']['Fridge'].head(300).plot(label="Pred")
gt['Fridge'].head(300).plot(label="GT")
plt.legend()
    
    Out[23]:
    
In [24]:
    
predictions['FHMM']['Fridge'].head(300).plot(label="Pred")
gt['Fridge'].head(300).plot(label="GT")
plt.legend()
    
    Out[24]:
    
nilmtk.utils.compute_rmse is an extended version of the following, handling both missing values and labels better:
def compute_rmse(gt, pred):
    from sklearn.metrics import mean_squared_error
    rms_error = {}
    for appliance in gt.columns:
        rms_error[appliance] = np.sqrt(mean_squared_error(gt[appliance], pred[appliance]))
    return pd.Series(rms_error)
In [27]:
    
nilmtk.utils.compute_rmse?
    
In [26]:
    
rmse = {}
for clf_name in classifiers:
    rmse[clf_name] = nilmtk.utils.compute_rmse(gt, predictions[clf_name])
rmse = pd.DataFrame(rmse)
rmse
    
    Out[26]: