In [1]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import os
from os.path import join, isfile
import fnmatch
import re
from nilmtk.utils import get_datastore
from nilmtk.datastore import Key
from nilmtk.measurement import LEVEL_NAMES
from nilmtk.utils import check_directory_exists
from nilm_metadata import convert_yaml_to_hdf5, save_yaml_to_datastore

column_mapping = {
    'frequency': ('frequency', ''),
    'voltage': ('voltage', ''),
    'W': ('power', 'active'),
    'active': ('power', 'active'),
    'energy': ('energy', 'apparent'),
    'A': ('current', ''),
    'reactive_power': ('power', 'reactive'),
    'apparent_power': ('power', 'apparent'),
    'power_factor': ('pf', ''),
    'PF': ('pf', ''),
    'phase_angle': ('phi', ''),
    'VA': ('power', 'apparent'),
    'VAR': ('power', 'reactive'),
    'reactive': ('power', 'reactive'),
    'VLN': ('voltage', ''),
    'V': ('voltage', ''),
    'f': ('frequency', '')
}
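# each entry above maps a raw gjw column header to the two-level (physical_quantity, type)
# column index used by nilmtk; the LEVEL_NAMES imported above supplies the level names
# when the columns are renamed in _prepare_data_for_toolkit below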
# data for file name manipulation
TYPE_A = "active"
TYPE_R = "reactive"

filename_prefix_mapping = {
    TYPE_A : ('4-POWER_REAL_FINE '),
    TYPE_R : ('5-POWER_REACTIVE_STANDARD ')
}
filename_suffix_mapping = {
    TYPE_A : (' Dump'),
    TYPE_R : (' Dump')
}

# DataFrame column names
TIMESTAMP_COLUMN_NAME = "timestamp"
ACTIVE_COLUMN_NAME = "active"
REACTIVE_COLUMN_NAME = "reactive"

type_column_mapping = {
    TYPE_A : (ACTIVE_COLUMN_NAME),
    TYPE_R : (REACTIVE_COLUMN_NAME) 
}


TIMEZONE = "Europe/London" # local time zone
home_dir = '/Users/GJWood/nilm_gjw_data' # path to input data

# regular expression matching
bld_re = re.compile(r'building\d+') # used to pull the building name from the directory path
bld_nbr_re = re.compile(r'\d+') # used to pull the building number from the name
iso_date_re = re.compile(r'\d{4}-\d{2}-\d{2}') # used to pull the date from the file name
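# e.g. a path such as c:\Users\GJWood\nilm_gjw_data\building1\elec yields the building
# name 'building1' and hence building number 1; iso_date_re pulls the YYYY-MM-DD date
# out of each dump file name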

def convert_gjw(gjw_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix).
        Defaults to <home_dir>/HDF5/nilm_gjw_data.hdf5 if not specified.

    Expected directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir,'HDF5','nilm_gjw_data.hdf5')
    # Open data store
    print( 'opening datastore', output_filename)
    store = get_datastore(output_filename, format, mode='w')
    # walk the directory tree from the dataset home directory
    #clear dataframe & add column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, dirs_in_current_dir, files in os.walk(gjw_path):
        if '.git' in current_dir or '.ipynb' in current_dir:
            #print( 'Skipping ', current_dir)
            continue
        print( 'checking', current_dir)
        m = bld_re.search(current_dir)
        if m: #The csv files may be further down the tree so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
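            # nilmtk Key locating this meter's data in the datastore (one meter per building here)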
            key = Key(building=building_nbr, meter=meter_nbr)
        for fname in fnmatch.filter(files, "4*.csv"):
            # process each active power .csv file found
            found = True
            ds = iso_date_re.search(fname).group()
            # print('found files for date:', ds, end=" ")
            df1 = _read_file_pair(current_dir, ds) # read the matching pair of csv files into one dataframe
            df = pd.concat([df, df1]) # concatenate the results into one long dataframe
        if found:
            found = False
            df = _prepare_data_for_toolkit(df)
            _summarise_dataframe(df,'Prepared for tool kit')
            store.put(str(key), df)
            #clear dataframe & add column headers
            #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
            break # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'),output_filename)
    print("Done converting gjw to HDF5!")

def _read_and_standardise_file(cdir,ds,mtype):   
    """
    Parameters
        cdir  - the directory path where the file may be found
        ds    - the date string which identifies the pair of files
        mtype - the type of data to be read (TYPE_A or TYPE_R)
    The filename is constructed using the prefix and suffix appropriate to the type.
    The data is then read, de-duplicated, converted to the local time zone,
    indexed as a time series and resampled to one reading per second.
    """
    fn = filename_prefix_mapping[mtype]+ds+filename_suffix_mapping[mtype]+'.csv'
    ffn = join(cdir,fn)
    df = pd.read_csv(ffn,names=[TIMESTAMP_COLUMN_NAME,type_column_mapping[mtype]])
    df.drop_duplicates(subset=[TIMESTAMP_COLUMN_NAME], inplace=True) # remove duplicate rows with same timestamp
    df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) # convert the index to time based
    df = df.tz_convert(TIMEZONE) #deal with summertime etc. for London timezone
    # re-sample on single file only as there may be gaps between dumps            
    df = df.resample('S').ffill() # make sure we have a reading for every second
    # resample seems to remove the timestamp column so put it back
    df[TIMESTAMP_COLUMN_NAME] = df.index
    df.drop_duplicates(subset=TIMESTAMP_COLUMN_NAME, inplace=True)
    return df

def _read_file_pair(cdir,ds):
    """"
    parameters 
        cdir - the directory path where the files may be found
        ds  - the date string which identifies the pair of files
    The files are processed individually then the columns merged on matching timestamps   
    """
    df1 = _read_and_standardise_file(cdir,ds,TYPE_A)
    #_summarise_dataframe(df1,'read file: '+TYPE_A)
    df2 = _read_and_standardise_file(cdir,ds,TYPE_R)
    #_summarise_dataframe(df2,'read file: '+TYPE_R)  
    df3 = pd.merge(df1,df2,on=TIMESTAMP_COLUMN_NAME, how='outer') #merge the two column types into 1 frame
    df3.fillna(value=0, inplace=True) # the reactive sequence may start later than the active one, so fill gaps with 0
    #_summarise_dataframe(df3,'return from merge and fillna')
    first_ts = pd.Timestamp(df3[TIMESTAMP_COLUMN_NAME][0])
    last_ts = pd.Timestamp(df3[TIMESTAMP_COLUMN_NAME][df3.index[-1]])
    print(first_ts,"to",last_ts) #print first and last entries
    return df3

def _prepare_data_for_toolkit(df):
    #remove any duplicate timestamps between files
    df.drop_duplicates(subset=["timestamp"], inplace=True) # remove duplicate rows with same timestamp
    df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) # convert the index to time based
    df = df.tz_convert(TIMEZONE) #deal with summertime etc. for London timezone
    df = df.drop(TIMESTAMP_COLUMN_NAME,1) # remove the timestamp column  
    df.rename(columns=lambda x: column_mapping[x], inplace=True) # Renaming from gjw header to nilmtk controlled vocabulary
    df.columns.set_names(LEVEL_NAMES, inplace=True) # Needed for column levelling (all converter need this line)
    df = df.convert_objects(convert_numeric=True) # make sure everything is numeric
    df = df.dropna() # drop rows with empty cells
    df = df.astype(np.float32) # Change float 64 (default) to float 32 
    df = df.sort_index() # Ensure that time series index is sorted
    return df

def _summarise_dataframe(df,loc):
    print(df.head(4))
    print("...", len(df.index),"rows at", loc)
    print(df.tail(4))
    
def main():
    convert_gjw('c:/Users/GJWood/nilm_gjw_data', None)

if __name__ == '__main__':
    main()


opening datastore /Users/GJWood/nilm_gjw_data\HDF5\nilm_gjw_data.hdf5
checking c:\Users\GJWood\nilm_gjw_data
checking c:\Users\GJWood\nilm_gjw_data\building1
checking c:\Users\GJWood\nilm_gjw_data\building1\elec
2013-10-17 17:19:07 to 2013-11-20 14:26:53
2013-10-21 19:30:26 to 2013-11-28 11:03:40
2013-10-23 23:12:05 to 2013-12-01 20:06:23
2013-11-08 12:16:21 to 2013-12-04 15:33:05
2013-11-16 16:14:40 to 2013-12-13 11:57:52
2013-12-02 13:15:47 to 2013-12-27 20:36:46
2013-12-15 22:15:33 to 2014-01-09 16:04:49
2014-01-12 13:02:22 to 2014-02-09 13:29:49
2014-03-13 21:59:56 to 2014-03-26 11:19:40
2014-04-26 23:17:57 to 2014-05-26 19:19:31
2015-04-17 11:48:40 to 2015-05-12 14:52:28
2015-05-03 04:44:28 to 2015-05-27 14:26:35
2015-05-10 12:48:52 to 2015-06-03 09:04:05
2015-05-17 04:39:18 to 2015-06-10 16:41:16
2015-05-23 19:46:35 to 2015-06-18 10:46:51
2015-05-28 18:33:33 to 2015-06-23 16:53:12
2015-06-06 16:33:20 to 2015-07-02 21:34:41
2015-06-14 14:25:52 to 2015-07-17 14:19:15
2015-06-27 11:36:52 to 2015-07-28 21:01:34
2015-07-18 22:00:50 to 2015-08-08 15:05:37
2015-08-05 07:09:06 to 2015-08-26 19:02:24
2015-08-19 10:16:33 to 2015-09-09 15:47:03
2015-08-28 09:34:04 to 2015-09-21 14:47:00
2015-09-07 22:27:21 to 2015-10-13 13:02:38
physical_quantity          power         
type                      active reactive
2013-10-17 18:19:07+01:00    382        0
2013-10-17 18:19:08+01:00    449        0
2013-10-17 18:19:09+01:00    449        0
2013-10-17 18:19:10+01:00    449        0
... 28806510 rows at Prepared for tool kit
physical_quantity          power         
type                      active reactive
2015-10-13 14:02:35+01:00   1947        0
2015-10-13 14:02:36+01:00   1969        0
2015-10-13 14:02:37+01:00   1969        0
2015-10-13 14:02:38+01:00   1927        0
Done converting YAML metadata to HDF5!
Done converting gjw to HDF5!
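
A quick sanity check, not part of the original run: a minimal sketch of opening the converted store with the standard nilmtk DataSet API, assuming the default output path used above.

In [ ]:
from nilmtk import DataSet
from os.path import join

# open the HDF5 store written by convert_gjw (path assumes the defaults above)
gjw = DataSet(join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5'))
print(gjw.buildings[1].elec) # summary of the meters found in building 1
gjw.store.close()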

In [ ]: