In [1]:

    
from __future__ import print_function 
import seaborn as sns
from IPython.core.display import HTML
from IPython.display import Image
import pandas as pd
import calendar

from nilmtk.disaggregate.co_1d import CO_1d
from nilmtk.cross_validation import train_test_split
import nilmtk.preprocessing.electricity.building as prepb
import nilmtk.stats.electricity.building as bstats
import nilmtk.stats.electricity.single as sstats
from nilmtk.dataset import DataSet
from nilmtk.dataset import REDD
from nilmtk.plots import plot_series
from nilmtk.sensors.electricity import Measurement
from nilmtk.metrics import (mean_normalized_error_power, 
                            fraction_energy_assigned_correctly, 
                            f_score,
                            rms_error_power)

import warnings
warnings.filterwarnings("ignore")

sns.set(font="serif")

def pretty_print_dict(dictionary):
    html = '<ul>'
    for key, value in dictionary.iteritems():
        html += '<li><strong>{}</strong>: '.format(key)
        if isinstance(value, list):
            html += '<ul>'
            for item in value:
                html += '<li>{}</li>'.format(item)
            html += '</ul></li>'
        else:
            html += '{}</li>'.format(value)
    html += '</ul>'
    display(HTML(html))

Demo of NILMTK v0.1 for NILM Workshop 2014

Loading Data

Load raw REDD .dat data



In [2]:

    
from nilmtk.dataset import REDD

redd = REDD()
redd.load('/data/REDD/low_freq/')

# Note increase in system memory usage.
# NILMTK v0.2 does not eagerly load data like v0.1 does.









    



Loading building 1:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 20
Loading building 2:
  chans:  1 2 3 4 5 6 7 8 9 10 11
Loading building 3:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
Loading building 4:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Loading building 5:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
Loading building 6:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17

Examine dataset metadata



In [3]:

    
pretty_print_dict(redd.metadata)









    




name: REDD
full_name: Reference Energy Disaggregation Data Set
citations: J. Zico Kolter and Matthew J. Johnson. REDD: A public data set for energy disaggregation research. In proceedings of the SustKDD workshop on Data Mining Applications in Sustainability, 2011.
urls: http://redd.csail.mit.edu
timezone: US/Eastern
geographic_coordinates: (42.360091, -71.09416)

Appliance names are converted to use the NILMTK controlled vocabulary

Raw REDD appliance names:



In [4]:

    
!cat '/data/REDD/low_freq/house_1/labels.dat'









    



1 mains
2 mains
3 oven
4 oven
5 refrigerator
6 dishwaser
7 kitchen_outlets
8 kitchen_outlets
9 lighting
10 washer_dryer
11 microwave
12 bathroom_gfi
13 electric_heat
14 stove
15 kitchen_outlets
16 kitchen_outlets
17 lighting
18 lighting
19 washer_dryer
20 washer_dryer

NILMTK appliance names



In [5]:

    
electric = redd.buildings[1].utility.electric
electric.appliances.keys()

# Note that the washer dryer has been converted to a single DualSupply appliance









    Out[5]:





[ApplianceName(name='fridge', instance=1),
 ApplianceName(name='washer dryer', instance=1),
 ApplianceName(name='kitchen outlets', instance=2),
 ApplianceName(name='lighting', instance=3),
 ApplianceName(name='kitchen outlets', instance=1),
 ApplianceName(name='space heater', instance=1),
 ApplianceName(name='kitchen outlets', instance=4),
 ApplianceName(name='kitchen outlets', instance=3),
 ApplianceName(name='lighting', instance=1),
 ApplianceName(name='dishwasher', instance=1),
 ApplianceName(name='microwave', instance=1),
 ApplianceName(name='bathroom misc', instance=1),
 ApplianceName(name='lighting', instance=2),
 ApplianceName(name='oven', instance=1),
 ApplianceName(name='hob', instance=1)]

Get data for a particular appliance



In [6]:

    
fridge = electric.appliances['fridge', 1]

# fridge is a 2-column matrix (a Python Pandas DataFrame object)

fridge.head()









    Out[6]:






  
    
      
      (power, active)
    
  
  
    
      2011-04-18 09:22:13-04:00
       6
    
    
      2011-04-18 09:22:16-04:00
       6
    
    
      2011-04-18 09:22:20-04:00
       6
    
    
      2011-04-18 09:22:23-04:00
       6
    
    
      2011-04-18 09:22:26-04:00
       6
    
  

5 rows × 1 columns



In [7]:

    
fridge["2011-04-18":"2011-04-19"].plot()









    Out[7]:





<matplotlib.axes.AxesSubplot at 0x7f2194c37690>

HDF5 file format

Export to HDF5

DATA_DIRECTORY = '/path/to/hdf5/'
redd.export(DATA_DIRECTORY)

Load from HDF5

Note that we use the generic DataSet class to load HDF5...

from nilmtk.dataset import DataSet
dataset = DataSet()
dataset.load_hdf5(DATA_DIRECTORY)

Dataset Diagnostics and Statistics

All datasets have imperfections. We need to understand the imperfections of each dataset before proceeding with data analysis. We'll continue working with REDD but please note that we're not picking on REDD! All datasets have issues!

import nilmtk.stats.electricity.building as bstats

Gaps

There are two reasons why data might not be recorded:

The appliance and meter were unplugged from the mains (hence the appliance is off when the appliance monitor is off).
The appliance monitor is broken (hence we have no reliable information about the state of the appliance).

In the plots below, a dark rectangle shows the presence of a gap. A gap is defined as any pair of consecutive samples further apart than 4 times the sample period for that meter.

Plot gaps using rectangles



In [8]:

    
bstats.plot_missing_samples_using_rectangles(electric)

The advantages of plot_missing_samples_using_rectangles are:

clearly shows large gaps
shows all data so can be zoomed in to your heart's content

The disadvantages are:

The choice of max_sample_period is somewhat subjective
Because it plots lots of rectangles, it can be slow to plot.

To overcome both of these disadvantages, we have a sister function:

Plot gaps using bitmap



In [9]:

    
bstats.plot_missing_samples_using_bitmap(electric)









    



Seconds per pixel = 3917.6






    Out[9]:





<matplotlib.axes.AxesSubplot at 0x7f2191f84550>

Proportion of energy submetered for house 1



In [10]:

    
bstats.proportion_of_energy_submetered(electric)









    



Calculating proportion of energy submetered...
Masking appliances with mains... may take a little while...Mains sample period = 1.0, max_sample_period = 20.0
Getting gap starts and ends...
Found 27 gap starts and 27 gap ends.
...............done
Inserting zeros... may take a little while...done inserting zeros






    Out[10]:





0.76004797563554483

Proportion of energy per appliance



In [11]:

    
bstats.proportion_per_appliance(electric)









    



Masking appliances with mains... may take a little while...Mains sample period = 1.0, max_sample_period = 20.0
Getting gap starts and ends...
Found 27 gap starts and 27 gap ends.
...............done
Inserting zeros... may take a little while...done inserting zeros






    Out[11]:





(fridge, 1)             0.143596
(lighting, 1)           0.112564
(washer dryer, 1)       0.093916
(kitchen outlets, 2)    0.073981
(dishwasher, 1)         0.066409
(microwave, 1)          0.058262
(kitchen outlets, 1)    0.055435
(lighting, 2)           0.046542
(oven, 1)               0.036079
(lighting, 3)           0.034429
(bathroom misc, 1)      0.018303
(kitchen outlets, 3)    0.014565
(kitchen outlets, 4)    0.005331
(space heater, 1)       0.000334
(hob, 1)                0.000302
dtype: float64

Statistics for a single appliance

import nilmtk.stats.electricity.single as sstats



In [12]:

    
APPLIANCE_TYPE = 'oven'
appliance = electric.appliances[(APPLIANCE_TYPE, 1)]

Sample period



In [13]:

    
sample_period = sstats.get_sample_period(appliance)

print('average sample period = {:.1f} seconds'.format(sample_period))









    



average sample period = 3.6 seconds

Dropout rate over entire timeseries



In [14]:

    
dropout_rate = sstats.get_dropout_rate(data=appliance)

print('average dropout rate = {:.1%}'.format(dropout_rate))









    



average dropout rate = 13.5%

Dropout rate per period



In [15]:

    
sstats.dropout_rate_per_period(data=appliance, rule='D').plot()

ylabel('dropout rate')
title('Dropout rate per day for ' + APPLIANCE_TYPE)









    Out[15]:





<matplotlib.text.Text at 0x7f2192029850>

Histogram of power consumption



In [16]:

    
THRESHOLD = 1
appliance_filtered = (appliance[appliance > THRESHOLD]).icol(0).dropna()

xlabel('power (watts)')
ylabel('frequency')
h = hist(appliance_filtered.values, bins=100)

Conclusions from the histogram:

oven spends a lot of its time consuming about 2-50 Watts
appears to be properly 'on' when it's consuming over 1600 watts
So let's use 1000 watts as the on power threshold.



In [17]:

    
ON_POWER_THRESHOLD = 1000

In NILMTK v0.2, on_power_threshold is stored as metadata per appliance.

Number of hours the oven was switched on for



In [18]:

    
hours_on = sstats.hours_on(appliance, on_power_threshold=ON_POWER_THRESHOLD)

print(APPLIANCE_TYPE + ' was on for {:.1f} hours.'.format(hours_on))









    



oven was on for 4.5 hours.

Total energy consumed by the oven



In [19]:

    
kwh = sstats.energy(appliance)

print(APPLIANCE_TYPE + ' consumed {:.1f} kWh.'.format(kwh))









    



oven consumed 3.8 kWh.

Explore usage behaviour of an appliance

Usage (hours on and energy used) per period



In [20]:

    
usage = sstats.usage_per_period(appliance,
                                freq='D', 
                                on_power_threshold=ON_POWER_THRESHOLD)
usage.head(n=7)









    Out[20]:






  
    
      
      hours_on
      kwh
    
  
  
    
      2011-04-18
            NaN
            NaN
    
    
      2011-04-19
       0.000000
       0.000063
    
    
      2011-04-20
       0.000000
       0.000010
    
    
      2011-04-21
       0.000000
       0.000014
    
    
      2011-04-22
       0.000000
       0.000004
    
    
      2011-04-23
       0.000000
       0.007393
    
    
      2011-04-24
       0.308056
       0.510096
    
  

7 rows × 2 columns

Sanity-check by looking at power data



In [21]:

    
plot_series(appliance[:"2011-04-24"], 
            date_format='%Y-%m-%d', 
            tz_localize=False)
title(APPLIANCE_TYPE + ' power demand')









    Out[21]:





<matplotlib.text.Text at 0x7f2191b51390>

Usage of the oven, hour-by-hour, over an average day



In [22]:

    
dist = sstats.activity_distribution(appliance, 
                                    bin_size='H', 
                                    timespan='D', 
                                    on_power_threshold=ON_POWER_THRESHOLD)

# Graph formatting
x = np.arange(dist.size)
ylabel('frequency')
xlabel('hour of day')
title('Usage of the oven, hour-by-hour, over an average day')
xlim([0, 24])
xticks(range(0, 25, 6))
bar(x, dist.values)









    Out[22]:





<Container object of 24 artists>

Usage of the oven, day-by-day, over an average week



In [23]:

    
dist = sstats.activity_distribution(appliance, 
                                    bin_size='D', 
                                    timespan='W', 
                                    on_power_threshold=ON_POWER_THRESHOLD)
x = np.arange(dist.size)
ylabel('frequency')
xlabel('day of week')
title('')
xticks(np.arange(7)+0.5, calendar.day_name[0:7])
bar(x, dist.values)









    Out[23]:





<Container object of 7 artists>

Histogram of on-durations



In [24]:

    
# Get a Series of booleans indicating when the oven is on:
on_series = sstats.on(appliance, on_power_threshold=ON_POWER_THRESHOLD)

# Now get the length of every on-duration
on_durations = sstats.durations(on_series, 
                                on_or_off='on',
                                ignore_n_off_samples=10)
xlabel('minutes on')
ylabel('frequency')
title('Distribution of on-durations for oven')
h = hist(on_durations/60, bins=10)

Descriptive stats of whole dataset

redd.describe()

NUMBER OF BUILDINGS: 6

NUMBER OF APPLIANCES PER BUILDING:
  min  =  9.00
  mean = 16.50
  mode = 15.00
  max  = 23.00
  std  =  4.31
PROPORTION OF ENERGY SUBMETERED PER BUILDLING:
  min  =  0.58
  mean =  0.70
  mode =  0.58
  max  =  0.89
  std  =  0.11
DROPOUT RATE PER CHANNEL, INCLUDING LARGE GAPS:
  min  =  0.13
  mean =  0.58
  mode =  0.92
  max  =  0.92
  std  =  0.24
DROPOUT RATE PER CHANNEL, IGNORING LARGE GAPS:
  min  =  0.00
  mean =  0.10
  mode =  0.09
  max  =  0.17
  std  =  0.04
MAINS UPTIME PER BUILDING (DAYS):
  min  =  3.60
  mean = 15.12
  mode =  3.60
  max  = 19.44
  std  =  5.43
PROPORTION OF TIME SLICES WHERE > 70% ENERGY IS SUBMETERED:
  min  =  0.23
  mean =  0.64
  mode =  0.23
  max  =  0.98
  std  =  0.26

Preprocessing

First, for reference, here's the plot from above showing missing samples:



In [25]:

    
building = redd.buildings[1]
bstats.plot_missing_samples_using_rectangles(building.utility.electric)

import nilmtk.preprocessing.electricity.building as prepb

Select data for a specific time period

building = prepb.filter_datetime(building, '7-13-2013', '8-4-2013')

Resampling



In [26]:

    
building = prepb.downsample(building, rule='1T')
electric = building.utility.electric
bstats.plot_missing_samples_using_rectangles(electric)

Fill large gaps in appliance data with zeros and forward-fill small gaps



In [27]:

    
building = prepb.fill_appliance_gaps(building)

# Now plot missing samples again:
electric = building.utility.electric
bstats.plot_missing_samples_using_rectangles(electric)

Detect gaps in mains and mask these out from appliance data



In [28]:

    
building = prepb.drop_missing_mains(building)
building = prepb.make_common_index(building)
electric = building.utility.electric
bstats.plot_missing_samples_using_rectangles(electric)

Sum the split-phase supply in REDD



In [29]:

    
building.utility.electric = building.utility.electric.sum_split_supplies()

Remove appliances which contribute less than 5% energy



In [30]:

    
# Hack to trick NILMTK into thinking that REDD mains measures
# active power not apparent. This will be handled much better
# in NILMTK v0.2
mains = building.utility.electric.mains[(1,1)]
mains.rename(columns={Measurement('power','apparent'): 
                      Measurement('power','active')}, 
             inplace=True)

building = prepb.filter_contribution_less_than_x(building, x=5)









    



Common Measurement:  Measurement(physical_quantity='power', type='active')



In [31]:

    
electric = building.utility.electric
bstats.plot_missing_samples_using_rectangles(electric)

Disaggregation

Split a building into training and test sets

from nilmtk.cross_validation import train_test_split



In [32]:

    
train, test = train_test_split(building, train_size = 0.5)
# train and test are now both Building objects

Training

from nilmtk.disaggregate.co_1d import CO_1d



In [33]:

    
DISAGG_FEATURE = Measurement('power', 'active')

disaggregator = CO_1d()
disaggregator.train(train, disagg_features=[DISAGG_FEATURE])
# Below is the learnt model









    Out[33]:





{ApplianceName(name='dishwasher', instance=1): [0, 589],
 ApplianceName(name='fridge', instance=1): [0, 185],
 ApplianceName(name='kitchen outlets', instance=1): [0, 21],
 ApplianceName(name='kitchen outlets', instance=2): [0, 28],
 ApplianceName(name='lighting', instance=1): [0, 86],
 ApplianceName(name='microwave', instance=1): [0, 947],
 ApplianceName(name='washer dryer', instance=1): [0, 1957]}

Export appliance models to JSON



In [34]:

    
disaggregator.export_model('model.json')
!cat model.json









    



{"fridge_1": [0, 185], "washer dryer_1": [0, 1957], "kitchen outlets_2": [0, 28], "kitchen outlets_1": [0, 21], "dishwasher_1": [0, 589], "microwave_1": [0, 947], "lighting_1": [0, 86]}

Import appliance models from JSON



In [35]:

    
disaggregator.import_model('model.json')
!rm model.json
disaggregator.model









    Out[35]:





{ApplianceName(name='dishwasher', instance=1): [0, 589],
 ApplianceName(name='fridge', instance=1): [0, 185],
 ApplianceName(name='kitchen outlets', instance=1): [0, 21],
 ApplianceName(name='kitchen outlets', instance=2): [0, 28],
 ApplianceName(name='lighting', instance=1): [0, 86],
 ApplianceName(name='microwave', instance=1): [0, 947],
 ApplianceName(name='washer dryer', instance=1): [0, 1957]}

Disaggregation



In [36]:

    
disaggregator.disaggregate(test)
predicted_power = disaggregator.predictions









    



128
{ApplianceName(name='fridge', instance=1): [0, 185], ApplianceName(name='washer dryer', instance=1): [0, 1957], ApplianceName(name='kitchen outlets', instance=2): [0, 28], ApplianceName(name='kitchen outlets', instance=1): [0, 21], ApplianceName(name='lighting', instance=1): [0, 86], ApplianceName(name='dishwasher', instance=1): [0, 589], ApplianceName(name='microwave', instance=1): [0, 947]}



In [37]:

    
predicted_power.head()









    Out[37]:






  
    
      
      (dishwasher, 1)
      (fridge, 1)
      (kitchen outlets, 1)
      (kitchen outlets, 2)
      (lighting, 1)
      (microwave, 1)
      (washer dryer, 1)
    
  
  
    
      2011-04-27 19:10:00-04:00
       0
       185
       21
       28
       0
       0
       0
    
    
      2011-04-27 19:11:00-04:00
       0
       185
       21
       28
       0
       0
       0
    
    
      2011-04-27 19:12:00-04:00
       0
       185
       21
       28
       0
       0
       0
    
    
      2011-04-27 19:13:00-04:00
       0
       185
       21
       28
       0
       0
       0
    
    
      2011-04-27 19:14:00-04:00
       0
       185
       21
       28
       0
       0
       0
    
  

5 rows × 7 columns



In [38]:

    
ax = test.utility.electric.mains[1,1]["2011-04-27 20:00":"2011-04-27 23:00:00"].plot()
ax.set_ylim([150,450])
title('aggregate power demand')









    Out[38]:





<matplotlib.text.Text at 0x7f2191e39850>



In [39]:

    
ax = predicted_power["2011-04-27 20:00":"2011-04-27 23:00:00"].plot()
ax.set_ylim([0,200])
title('predicted appliance power demand')









    Out[39]:





<matplotlib.text.Text at 0x7f219110e390>



In [40]:

    
app_ground = test.utility.electric.appliances
ground_truth_power = pd.DataFrame(
    {appliance: app_ground[appliance][DISAGG_FEATURE] 
     for appliance in app_ground})

ax = ground_truth_power["2011-04-27 20:00":"2011-04-27 23:00:00"].plot()
ax.set_ylim([0,250])
title('ground truth appliance power demand')









    Out[40]:





<matplotlib.text.Text at 0x7f2191cd37d0>

Metrics

from nilmtk.metrics import (mean_normalized_error_power, 
                            fraction_energy_assigned_correctly, 
                            f_score,
                            rms_error_power)



In [41]:

    
metrics = {
    'mean normalized error power':
          mean_normalized_error_power,
    'fraction energy assigned correctly': 
          fraction_energy_assigned_correctly,
    'f score': f_score,
    'RMS Error Power': rms_error_power
}



In [42]:

    
for metric_name, metric_func in metrics.iteritems():
    result = metric_func(predicted_power, ground_truth_power)
    display(HTML('<h5>' + metric_name + '</h5>'))
    if isinstance(result, dict):
        pretty_print_dict(result)
    else:
        print(result)
    print()









    




RMS Error Power






    




ApplianceName(name='fridge', instance=1): 110.208348899
ApplianceName(name='washer dryer', instance=1): 216.400642527
ApplianceName(name='kitchen outlets', instance=2): 18.8756186696
ApplianceName(name='kitchen outlets', instance=1): 10.5264942354
ApplianceName(name='lighting', instance=1): 69.2372594931
ApplianceName(name='dishwasher', instance=1): 221.145976367
ApplianceName(name='microwave', instance=1): 223.294611141






    









    




f score






    




ApplianceName(name='fridge', instance=1): 0.486390231493
ApplianceName(name='washer dryer', instance=1): 0.692607003891
ApplianceName(name='kitchen outlets', instance=2): 0.0
ApplianceName(name='kitchen outlets', instance=1): 0.0
ApplianceName(name='lighting', instance=1): 0.517505876531
ApplianceName(name='dishwasher', instance=1): 0.236493374108
ApplianceName(name='microwave', instance=1): 0.25






    









    




fraction energy assigned correctly






    



0.771400439401







    




mean normalized error power






    




ApplianceName(name='fridge', instance=1): 1.60618649799
ApplianceName(name='washer dryer', instance=1): 0.703450115919
ApplianceName(name='kitchen outlets', instance=2): 0.714609954031
ApplianceName(name='kitchen outlets', instance=1): 0.516469554455
ApplianceName(name='lighting', instance=1): 1.36871998661
ApplianceName(name='dishwasher', instance=1): 2.85568760678
ApplianceName(name='microwave', instance=1): 3.57270358602

End

nilmtk website: nilmtk.github.io

Contact: jack.kelly@imperial.ac.uk



In [1]:

    
# CSS styling
from IPython.core.display import display, HTML
display(HTML(open('static/styles.css', 'r').read()));

	(power, active)
2011-04-18 09:22:13-04:00	6
2011-04-18 09:22:16-04:00	6
2011-04-18 09:22:20-04:00	6
2011-04-18 09:22:23-04:00	6
2011-04-18 09:22:26-04:00	6

	hours_on	kwh
2011-04-18	NaN	NaN
2011-04-19	0.000000	0.000063
2011-04-20	0.000000	0.000010
2011-04-21	0.000000	0.000014
2011-04-22	0.000000	0.000004
2011-04-23	0.000000	0.007393
2011-04-24	0.308056	0.510096

	(fridge, 1)	(kitchen outlets, 1)	(kitchen outlets, 2)
2011-04-27 19:10:00-04:00	185	21	28
2011-04-27 19:11:00-04:00	185	21	28
2011-04-27 19:12:00-04:00	185	21	28
2011-04-27 19:13:00-04:00	185	21	28
2011-04-27 19:14:00-04:00	185	21	28