Disaggregation


In [1]:
from __future__ import print_function, division
import time

from matplotlib import rcParams
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from six import iteritems

from nilmtk import DataSet, TimeFrame, MeterGroup, HDFDataStore
from nilmtk.legacy.disaggregate import CombinatorialOptimisation, FHMM
import nilmtk.utils

%matplotlib inline

In [2]:
rcParams['figure.figsize'] = (13, 6)

Dividing the data into train and test sets


In [3]:
train = DataSet('/data/redd.h5')
test = DataSet('/data/redd.h5')

Let us use building 1 for demo purposes


In [4]:
building = 1

Let's split the data at April 30th: the training window ends on that date and the test window starts there


In [5]:
train.set_window(end="2011-04-30")
test.set_window(start="2011-04-30")

train_elec = train.buildings[1].elec
test_elec = test.buildings[1].elec
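
As a quick check that the two windows do not overlap, we can print the timeframe covered by each mains meter group (a minimal sketch, assuming get_timeframe() is available on the mains MeterGroup):

print("Train timeframe:", train_elec.mains().get_timeframe())
print("Test timeframe: ", test_elec.mains().get_timeframe())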

Visualizing the data


In [6]:
train_elec.plot()


Loading data for meter ElecMeterID(instance=4, building=1, dataset='REDD')     
Done loading data all meters for this chunk.
Loading data for meter ElecMeterID(instance=20, building=1, dataset='REDD')     
Done loading data all meters for this chunk.
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x21f6e9b99e8>

In [7]:
test_elec.mains().plot()


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x21f704a6e80>

The REDD dataset has appliance-level data sampled every 3 or 4 seconds and mains data sampled every 1 second. Let us verify this.


In [8]:
fridge_meter = train_elec['fridge']

In [9]:
fridge_df = next(fridge_meter.load())

In [10]:
fridge_df.head()


Out[10]:
physical_quantity power
type active
2011-04-18 09:22:13-04:00 6.0
2011-04-18 09:22:16-04:00 6.0
2011-04-18 09:22:20-04:00 6.0
2011-04-18 09:22:23-04:00 6.0
2011-04-18 09:22:26-04:00 6.0

In [11]:
mains = train_elec.mains()

In [12]:
mains_df = next(mains.load())


Loading data for meter ElecMeterID(instance=2, building=1, dataset='REDD')     
Done loading data all meters for this chunk.

In [13]:
mains_df.head()


Out[13]:
physical_quantity power
type apparent
2011-04-18 09:22:09-04:00 342.820007
2011-04-18 09:22:10-04:00 344.559998
2011-04-18 09:22:11-04:00 345.140015
2011-04-18 09:22:12-04:00 341.679993
2011-04-18 09:22:13-04:00 341.029999
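
As a quick sanity check, the median gap between consecutive timestamps gives each meter's effective sample period; this only uses pandas on the dataframes loaded above:

# Median spacing between consecutive samples
print("Fridge sample period:", fridge_df.index.to_series().diff().median())
print("Mains sample period: ", mains_df.index.to_series().diff().median())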

Since these two streams are sampled at different rates, we will downsample both to a common resolution (a 120-second sample period in the code below). We will also select the top-5 appliances in terms of energy consumption and use them for training our FHMM and CO models.
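
The resampling itself is done by nilmtk's load() method via its sample_period keyword (in seconds), the same keyword used in the predict() function further below. A minimal sketch:

# Load mains downsampled to a 120-second sample period
mains_resampled = next(train_elec.mains().load(sample_period=120))
print(mains_resampled.head())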

Selecting top-5 appliances


In [14]:
top_5_train_elec = train_elec.submeters().select_top_k(k=5)


(Progress output while calculating total_energy for each of the 16 submeters, truncated.)

In [15]:
top_5_train_elec


Out[15]:
MeterGroup(meters=
  ElecMeter(instance=8, building=1, dataset='REDD', appliances=[Appliance(type='sockets', instance=2)])
  ElecMeter(instance=5, building=1, dataset='REDD', appliances=[Appliance(type='fridge', instance=1)])
  ElecMeter(instance=9, building=1, dataset='REDD', appliances=[Appliance(type='light', instance=1)])
  ElecMeter(instance=6, building=1, dataset='REDD', appliances=[Appliance(type='dish washer', instance=1)])
  ElecMeter(instance=11, building=1, dataset='REDD', appliances=[Appliance(type='microwave', instance=1)])
)
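
To see how dominant these five meters are, one option is to look at each submeter's share of energy. The sketch below assumes MeterGroup.fraction_per_meter() is available; note that it loads data for every submeter, so it can take a while:

# Fraction of submetered energy contributed by each meter
fraction = train_elec.submeters().fraction_per_meter().dropna()
print(fraction.sort_values(ascending=False).head(5))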

Training and disaggregation

A function to disaggregate the mains data into its constituent appliances and return the predictions:


In [16]:
def predict(clf, test_elec, sample_period, timezone):
    pred = {}
    gt = {}
    
    # "ac_type" varies according to the dataset used. 
    # Make sure to use the correct ac_type before using the default parameters in this code.    
    for i, chunk in enumerate(test_elec.mains().load(physical_quantity = 'power', ac_type = 'apparent', sample_period=sample_period)):
        chunk_drop_na = chunk.dropna()
        pred[i] = clf.disaggregate_chunk(chunk_drop_na)
        gt[i]={}

        for meter in test_elec.submeters().meters:
            # Load the ground-truth power for each submeter at the same sample period
            gt[i][meter] = next(meter.load(physical_quantity = 'power', ac_type = 'active', sample_period=sample_period))
        gt[i] = pd.DataFrame({k:v.squeeze() for k,v in iteritems(gt[i]) if len(v)}, index=next(iter(gt[i].values())).index).dropna()
        
    # If everything can fit in memory
    gt_overall = pd.concat(gt)
    gt_overall.index = gt_overall.index.droplevel()
    pred_overall = pd.concat(pred)
    pred_overall.index = pred_overall.index.droplevel()

    # Having the same order of columns
    gt_overall = gt_overall[pred_overall.columns]
    
    #Intersection of index
    gt_index_utc = gt_overall.index.tz_convert("UTC")
    pred_index_utc = pred_overall.index.tz_convert("UTC")
    common_index_utc = gt_index_utc.intersection(pred_index_utc)
    
    common_index_local = common_index_utc.tz_convert(timezone)
    gt_overall = gt_overall.loc[common_index_local]
    pred_overall = pred_overall.loc[common_index_local]
    appliance_labels = [m for m in gt_overall.columns.values]
    gt_overall.columns = appliance_labels
    pred_overall.columns = appliance_labels
    return gt_overall, pred_overall
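
The predict() helper above keeps all chunks in memory. The legacy disaggregators can alternatively write their output straight to an HDF5 file via their disaggregate() method and the HDFDataStore imported at the top; a minimal sketch, with an illustrative output path:

# Alternative: train a single model and write its estimates to disk
co = CombinatorialOptimisation()
co.train(top_5_train_elec, sample_period=120)
output = HDFDataStore('/data/redd_co_output.h5', 'w')   # illustrative path
co.disaggregate(test_elec.mains(), output, sample_period=120)
output.close()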

Train using two benchmark algorithms: Combinatorial Optimisation (CO) and the Factorial Hidden Markov Model (FHMM)


In [18]:
classifiers = {'CO':CombinatorialOptimisation(), 'FHMM':FHMM()}
predictions = {}
sample_period = 120
for clf_name, clf in classifiers.items():
    print("*"*20)
    print(clf_name)
    print("*" *20)
    start = time.time()
    # Note that we pass sample_period (120 seconds here) to downsample the data before training.
    # If we wanted to train on all appliances instead of the top 5, we would write
    # clf.train(train_elec.submeters(), sample_period=sample_period)
    clf.train(top_5_train_elec, sample_period=sample_period)
    end = time.time()
    print("Runtime =", end-start, "seconds.")
    gt, predictions[clf_name] = predict(clf, test_elec, sample_period, train.metadata['timezone'])


********************
CO
********************
Training model for submeter 'ElecMeter(instance=8, building=1, dataset='REDD', appliances=[Appliance(type='sockets', instance=2)])'
Training model for submeter 'ElecMeter(instance=5, building=1, dataset='REDD', appliances=[Appliance(type='fridge', instance=1)])'
Training model for submeter 'ElecMeter(instance=9, building=1, dataset='REDD', appliances=[Appliance(type='light', instance=1)])'
Training model for submeter 'ElecMeter(instance=6, building=1, dataset='REDD', appliances=[Appliance(type='dish washer', instance=1)])'
Training model for submeter 'ElecMeter(instance=11, building=1, dataset='REDD', appliances=[Appliance(type='microwave', instance=1)])'
Done training!
Runtime = 1.5623564720153809 seconds.
Loading data for meter ElecMeterID(instance=2, building=1, dataset='REDD')     
Done loading data all meters for this chunk.
Estimating power demand for 'ElecMeter(instance=8, building=1, dataset='REDD', appliances=[Appliance(type='sockets', instance=2)])'
Estimating power demand for 'ElecMeter(instance=5, building=1, dataset='REDD', appliances=[Appliance(type='fridge', instance=1)])'
Estimating power demand for 'ElecMeter(instance=9, building=1, dataset='REDD', appliances=[Appliance(type='light', instance=1)])'
Estimating power demand for 'ElecMeter(instance=6, building=1, dataset='REDD', appliances=[Appliance(type='dish washer', instance=1)])'
Estimating power demand for 'ElecMeter(instance=11, building=1, dataset='REDD', appliances=[Appliance(type='microwave', instance=1)])'
Loading data for meter ElecMeterID(instance=4, building=1, dataset='REDD')     
Done loading data all meters for this chunk.
Loading data for meter ElecMeterID(instance=20, building=1, dataset='REDD')     
Done loading data all meters for this chunk.
********************
FHMM
********************
Training model for submeter 'ElecMeter(instance=8, building=1, dataset='REDD', appliances=[Appliance(type='sockets', instance=2)])' with 3 states
Training model for submeter 'ElecMeter(instance=5, building=1, dataset='REDD', appliances=[Appliance(type='fridge', instance=1)])' with 3 states
Training model for submeter 'ElecMeter(instance=9, building=1, dataset='REDD', appliances=[Appliance(type='light', instance=1)])' with 3 states
Training model for submeter 'ElecMeter(instance=6, building=1, dataset='REDD', appliances=[Appliance(type='dish washer', instance=1)])' with 3 states
Training model for submeter 'ElecMeter(instance=11, building=1, dataset='REDD', appliances=[Appliance(type='microwave', instance=1)])' with 3 states
Runtime = 2.2812917232513428 seconds.
Loading data for meter ElecMeterID(instance=2, building=1, dataset='REDD')     
Done loading data all meters for this chunk.
Loading data for meter ElecMeterID(instance=4, building=1, dataset='REDD')     
Done loading data all meters for this chunk.
Loading data for meter ElecMeterID(instance=20, building=1, dataset='REDD')     
Done loading data all meters for this chunk.

Using prettier labels!


In [19]:
appliance_labels = [m.label() for m in gt.columns.values]

In [20]:
gt.columns = appliance_labels
predictions['CO'].columns = appliance_labels
predictions['FHMM'].columns = appliance_labels

Taking a look at the ground-truth power consumption of the top-5 appliances


In [21]:
gt.head()


Out[21]:
Sockets Fridge Light Dish washer Microwave
2011-04-30 00:00:00-04:00 30.566668 6.000000 76.133331 1132.033325 4.0
2011-04-30 00:02:00-04:00 28.551723 6.000000 76.206894 1131.517212 4.0
2011-04-30 00:04:00-04:00 28.233334 6.000000 76.099998 1132.066650 4.0
2011-04-30 00:06:00-04:00 28.500000 6.100000 76.900002 1038.866699 4.0
2011-04-30 00:08:00-04:00 24.193548 6.612903 81.032257 205.387100 4.0

In [22]:
predictions['CO'].head()


Out[22]:
Sockets Fridge Light Dish washer Microwave
2011-04-30 00:00:00-04:00 0.0 0.0 0.0 0.0 1306.0
2011-04-30 00:02:00-04:00 0.0 0.0 0.0 0.0 1306.0
2011-04-30 00:04:00-04:00 0.0 0.0 0.0 0.0 1306.0
2011-04-30 00:06:00-04:00 71.0 65.0 0.0 1069.0 0.0
2011-04-30 00:08:00-04:00 71.0 0.0 81.0 221.0 0.0

In [23]:
predictions['FHMM'].head()


Out[23]:
Sockets Fridge Light Dish washer Microwave
2011-04-30 00:00:00-04:00 22.0 6.0 81.0 0.0 1359.0
2011-04-30 00:02:00-04:00 41.0 6.0 81.0 1068.0 111.0
2011-04-30 00:04:00-04:00 41.0 6.0 81.0 1068.0 111.0
2011-04-30 00:06:00-04:00 41.0 6.0 81.0 1068.0 4.0
2011-04-30 00:08:00-04:00 41.0 192.0 81.0 47.0 4.0

Plotting the predictions against the actual usage


In [24]:
predictions['CO']['Fridge'].head(300).plot(label="Pred")
gt['Fridge'].head(300).plot(label="GT")
plt.legend()


Out[24]:
<matplotlib.legend.Legend at 0x21f7054d278>

In [25]:
predictions['FHMM']['Fridge'].head(300).plot(label="Pred")
gt['Fridge'].head(300).plot(label="GT")
plt.legend()


Out[25]:
<matplotlib.legend.Legend at 0x21f6e7016d8>
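
To compare both models against the ground truth in a single figure, the same pandas plotting calls can be combined:

# Overlay both models' fridge estimates on the ground truth
gt['Fridge'].head(300).plot(label="GT")
predictions['CO']['Fridge'].head(300).plot(label="CO")
predictions['FHMM']['Fridge'].head(300).plot(label="FHMM")
plt.ylabel("Power (W)")
plt.legend()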

Comparing NILM algorithms (CO vs FHMM)

nilmtk.utils.compute_rmse is an extended version of the following function, handling both missing values and labels better:

def compute_rmse(gt, pred):
    from sklearn.metrics import mean_squared_error
    rms_error = {}
    for appliance in gt.columns:
        rms_error[appliance] = np.sqrt(mean_squared_error(gt[appliance], pred[appliance]))
    return pd.Series(rms_error)

In [26]:
? nilmtk.utils.compute_rmse

In [27]:
rmse = {}
for clf_name in classifiers.keys():
    rmse[clf_name] = nilmtk.utils.compute_rmse(gt, predictions[clf_name])

rmse = pd.DataFrame(rmse)
rmse


Out[27]:
CO FHMM
Sockets 28.268673 37.155416
Fridge 107.893730 100.269828
Light 120.060822 73.627119
Dish washer 456.985626 185.935940
Microwave 235.265549 545.966517
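
A bar chart of these numbers makes the comparison easier to read at a glance (plain pandas plotting, not part of the original output):

# RMSE per appliance for both algorithms (lower is better)
rmse.plot(kind='bar')
plt.ylabel("RMSE (W)")
plt.xlabel("Appliance")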