notebook.community

Edit and run



In [16]:

    
import sys
import os.path
sys.path.append(os.path.abspath(os.path.join(os.pardir,os.pardir)))
import disaggregator as da
import disaggregator.PecanStreetDatasetAdapter as psda
import pandas as pd
import numpy as np
import copy

# Connection string for the Pecan Street database. Real credentials should not
# be committed with the notebook: set PECAN_STREET_DB_URL in the environment to
# override the placeholder below. (`os` is bound by the `import os.path` above.)
db_url = os.environ.get(
    "PECAN_STREET_DB_URL",
    "postgresql://USERNAME:PASSWORD@db.wiki-energy.org:5432/postgres")
psda.set_url(db_url)

# Schema name and monthly tables handed to the psda query helpers below.
schema = 'shared'
tables = [u'validated_01_2014',
          u'validated_02_2014',
          u'validated_03_2014',
          u'validated_04_2014',
          u'validated_05_2014',]

Get two traces from March and one from April, all sampled every 15 minutes (`15T`), plus one more from April sampled every minute (`1T`).



In [2]:

    
# Look up the data ids available in the March (tables[2]) and April (tables[3])
# tables; a generator expression keeps the two lookups in the original order.
march_dataids, april_dataids = (
    psda.get_table_dataids(schema, monthly_table)
    for monthly_table in tables[2:4]
)



In [11]:

    
# Two March traces (first two data ids) at 15-minute sampling.
march_trace1, march_trace2 = psda.generate_traces_for_appliance_by_dataids(
    schema, tables[2], 'use', march_dataids[:2], sample_rate='15T')

# The same April data id twice: once at 15-minute and once at 1-minute sampling.
april_trace1 = psda.generate_appliance_trace(
    schema, tables[3], 'use', april_dataids[1], sample_rate='15T')
april_trace2 = psda.generate_appliance_trace(
    schema, tables[3], 'use', april_dataids[1], sample_rate='1T')









    



select use,localminute from "PecanStreet_SharedData".validated_03_2014 where dataid=744
select use,localminute from "PecanStreet_SharedData".validated_03_2014 where dataid=2129
select use,localminute from "PecanStreet_SharedData".validated_04_2014 where dataid=2129
select use,localminute from "PecanStreet_SharedData".validated_04_2014 where dataid=2129



In [12]:

    
# Show the time index of every loaded trace (range, length and frequency).
print([trace.series.index
       for trace in [march_trace1, march_trace2, april_trace1, april_trace2]])









    



[<class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-31 18:45:00]
Length: 2956, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-31 18:45:00]
Length: 2956, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-01 00:00:00, ..., 2014-04-30 23:45:00]
Length: 2880, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-01 00:00:00, ..., 2014-04-30 23:59:00]
Length: 43200, Freq: T, Timezone: None]

Determine whether a set of traces is temporally aligned, i.e. whether all of their time indices are identical.



In [13]:

    
def traces_aligned(traces):
    """
    Report whether every trace shares the same time index.

    Each trace's `series.index` is compared against the first trace's index
    with `Index.equals`. An empty or single-element list counts as aligned.
    """
    indices = [trace.series.index for trace in traces]
    # The generator is lazy, so indices[0] is never touched for an empty list.
    return all(indices[0].equals(other) for other in indices[1:])



In [14]:

    
# Same table and sample rate -> identical indices.
assert traces_aligned([march_trace1, march_trace2])
# Different months -> different timestamps.
assert not traces_aligned([march_trace1, april_trace1])
# Same month, different sample rates -> different indices.
assert not traces_aligned([april_trace1, april_trace2])



In [15]:

    
# Print the indices again after the alignment checks above.
print([trace.series.index
       for trace in [march_trace1, march_trace2, april_trace1, april_trace2]])









    



[<class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-31 18:45:00]
Length: 2956, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-31 18:45:00]
Length: 2956, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-01 00:00:00, ..., 2014-04-30 23:45:00]
Length: 2880, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-01 00:00:00, ..., 2014-04-30 23:59:00]
Length: 43200, Freq: T, Timezone: None]

Align two misaligned traces.



In [17]:

    
def align_traces(traces,to=None,how="front"):
    """
    Temporally align copies of `traces` and return them.

    All traces (and `to`, when given) are resampled to the lowest sampling
    rate found among them, shifted so that they start at the first timestamp
    of the reference trace, and truncated to the length of the shortest one.

    Parameters
    ----------
    traces : list of traces
        Each trace must expose a pandas `series` with a datetime index and a
        `resample(freq)` method. The return value of `resample` is discarded,
        so it is assumed to update `trace.series` in place -- TODO confirm
        against the trace class.
    to : trace, optional
        Reference trace whose first timestamp the others are shifted to. When
        omitted, the first shortest trace (after resampling) is the reference.
    how : str
        Only "front" (align the starts) is implemented.

    Returns
    -------
    list
        Deep copies of `traces`, aligned. Neither the input traces nor `to`
        are modified.

    Raises
    ------
    NotImplementedError
        If `how` is anything other than "front" and work is required.
    ValueError
        If none of the traces has a known index frequency.
    """
    # make copies so the caller's traces are left untouched
    traces = copy.deepcopy(traces)
    if not traces:
        return traces

    # if already aligned, don't do extra work. The reference takes part in
    # this check: traces that agree with each other but not with `to` still
    # have to be shifted.
    if traces_aligned(traces if to is None else list(traces) + [to]):
        return traces

    if how != 'front':
        raise NotImplementedError(
            "align_traces only supports how='front', got %r" % (how,))

    # Work on a private copy of the reference so it can be resampled too;
    # otherwise `cutoff` would compare sample counts taken at different rates.
    reference = None if to is None else copy.deepcopy(to)
    participants = list(traces)
    if reference is not None:
        participants.append(reference)

    # resample to the same frequency: the largest offset is the lowest rate
    frequencies = [pd.tseries.frequencies.to_offset(trace.series.index.freq)
                   for trace in participants if trace.series.index.freq]
    if not frequencies:
        raise ValueError("cannot align traces: no trace has a known frequency")
    new_freq = max(frequencies)
    for trace in participants:
        trace.resample(new_freq)

    # determine where to shift to and how much to cut off
    cutoff = min(trace.series.size for trace in participants)
    if reference is None:
        # min() returns the first minimum, so ties go to the earliest trace
        reference = min(traces, key=lambda trace: trace.series.size)

    # Read the target start before shifting anything: the reference may
    # itself be one of the traces being modified.
    start = reference.series.index[0]
    for trace in traces:
        trace.series.index = trace.series.index + (start - trace.series.index[0])
        # positional slice: keep only the first `cutoff` samples
        trace.series = trace.series.iloc[:cutoff]

    return traces



In [19]:

    
# Original, untouched traces for reference.
print([trace.series.index
       for trace in [march_trace1, march_trace2, april_trace1, april_trace2]])
print('')
# Already aligned: returned as copies with the same indices.
print([trace.series.index
       for trace in align_traces([march_trace1, march_trace2])])
print('')
# Different sample rates: both end up on the 15-minute grid.
print([trace.series.index
       for trace in align_traces([april_trace1, april_trace2])])
print('')
# April traces shifted to start where the March reference trace starts.
print([trace.series.index
       for trace in align_traces([april_trace1, april_trace2], to=march_trace1)])
print('')









    



[<class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-31 18:45:00]
Length: 2956, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-31 18:45:00]
Length: 2956, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-01 00:00:00, ..., 2014-04-30 23:45:00]
Length: 2880, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-01 00:00:00, ..., 2014-04-30 23:59:00]
Length: 43200, Freq: T, Timezone: None]

[<class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-31 18:45:00]
Length: 2956, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-31 18:45:00]
Length: 2956, Freq: 15T, Timezone: None]

[<class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-01 00:00:00, ..., 2014-04-30 23:45:00]
Length: 2880, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-01 00:00:00, ..., 2014-04-30 23:45:00]
Length: 2880, Freq: 15T, Timezone: None]

[<class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-30 23:45:00]
Length: 2880, Freq: 15T, Timezone: None, <class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:00:00, ..., 2014-03-30 23:45:00]
Length: 2880, Freq: 15T, Timezone: None]



In [12]:

    
# Exploratory: list the attributes available on a trace's time index.
print(dir(march_trace1.series.index))









    



['T', '__abs__', '__add__', '__and__', '__array__', '__array_finalize__', '__array_interface__', '__array_prepare__', '__array_priority__', '__array_struct__', '__array_wrap__', '__bytes__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__delslice__', '__dict__', '__div__', '__divmod__', '__doc__', '__eq__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getslice__', '__gt__', '__hash__', '__hex__', '__iadd__', '__iand__', '__idiv__', '__ifloordiv__', '__ilshift__', '__imod__', '__imul__', '__index__', '__init__', '__int__', '__invert__', '__ior__', '__ipow__', '__irshift__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__long__', '__lshift__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__oct__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rlshift__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rrshift__', '__rshift__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setslice__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__unicode__', '__weakref__', '__xor__', '_add_delta', '_array_values', '_arrmap', '_assert_can_do_setop', '_box_scalars', '_box_values', '_cached_range', '_can_fast_union', '_cleanup', '_constructor', '_engine', '_engine_type', '_ensure_compat_concat', '_fast_union', '_format_native_types', '_format_with_header', '_generate', '_get_duplicates', '_get_level_number', '_get_method', '_get_names', '_get_object_index', '_get_string_slice', '_get_time_micros', '_groupby', '_has_complex_internals', '_inner_indexer', '_join_level', '_join_monotonic', '_join_non_unique', '_join_precedence', '_left_indexer', '_left_indexer_unique', '_local_timestamps', '_maybe_utc_convert', '_mpl_repr', '_outer_indexer', 
'_partial_date_slice', '_possibly_promote', '_resolution', '_sarr_cache', '_set_names', '_shallow_copy', '_simple_new', '_view_like', '_wrap_joined_index', '_wrap_union_result', 'all', 'any', 'append', 'argmax', 'argmin', 'argpartition', 'argsort', 'asi8', 'asobject', 'asof', 'asof_locs', 'astype', 'base', 'byteswap', 'choose', 'clip', 'compress', 'conj', 'conjugate', 'copy', 'ctypes', 'cumprod', 'cumsum', 'data', 'date', 'day', 'dayofweek', 'dayofyear', 'delete', 'diagonal', 'diff', 'dot', 'drop', 'dtype', 'dump', 'dumps', 'equals', 'fill', 'flags', 'flat', 'flatten', 'format', 'freq', 'freqstr', 'get_duplicates', 'get_indexer', 'get_indexer_non_unique', 'get_level_values', 'get_loc', 'get_value', 'getfield', 'groupby', 'holds_integer', 'hour', 'imag', 'indexer_at_time', 'indexer_between_time', 'inferred_freq', 'inferred_type', 'insert', 'intersection', 'is_all_dates', 'is_lexsorted_for_tuple', 'is_monotonic', 'is_normalized', 'is_numeric', 'is_type_compatible', 'is_unique', 'isin', 'item', 'itemset', 'itemsize', 'join', 'map', 'max', 'mean', 'microsecond', 'min', 'minute', 'month', 'name', 'names', 'nanosecond', 'nbytes', 'ndim', 'newbyteorder', 'nlevels', 'nonzero', 'normalize', 'offset', 'order', 'partition', 'prod', 'ptp', 'put', 'quarter', 'ravel', 'real', 'reindex', 'repeat', 'reshape', 'resize', 'resolution', 'round', 'searchsorted', 'second', 'set_value', 'setfield', 'setflags', 'shape', 'shift', 'size', 'slice_indexer', 'slice_locs', 'snap', 'sort', 'squeeze', 'std', 'strides', 'sum', 'summary', 'swapaxes', 'take', 'time', 'to_datetime', 'to_native_types', 'to_period', 'to_pydatetime', 'to_series', 'tofile', 'tolist', 'tostring', 'trace', 'transpose', 'tz', 'tz_convert', 'tz_localize', 'tzinfo', 'union', 'union_many', 'unique', 'values', 'var', 'view', 'week', 'weekday', 'weekofyear', 'year']



In [39]:

    
# Offsets spanning the same duration compare equal even when spelled differently.
print(pd.tseries.frequencies.to_offset('60S')
      == pd.tseries.frequencies.to_offset('1T'))
# Sorting in descending order puts the longest period (lowest rate) first.
print(sorted([pd.tseries.frequencies.to_offset('1S'),
              pd.tseries.frequencies.to_offset('1T')],
             reverse=True)[0].freqstr)
# The frequency string reflects how the offset was written.
pd.tseries.frequencies.to_offset('60S').freqstr









    



True
T






    Out[39]:





'60S'



In [41]:

    
# Number of samples in the first March trace.
march_trace1.series.size









    Out[41]:





2956



In [19]:

    
# Adding an offset to a time index shifts every timestamp by that amount
# (here 60 seconds); this is the operation align_traces uses for shifting.
march_trace1.series.index + pd.tseries.frequencies.to_offset('60S')









    Out[19]:





<class 'pandas.tseries.index.DatetimeIndex'>
[2014-03-01 00:01:00, ..., 2014-03-31 18:46:00]
Length: 2956, Freq: 15T, Timezone: None



In [ ]: