In [183]:
import json
SENSORES = ['DHT22', 'DHT11']
PIN_DATA_DHT11 = 17
# TODO Read from disk or auto-detect the configuration:
PIN_DATA_DHT22 = 22
pinout = dict(zip(SENSORES, [PIN_DATA_DHT22, PIN_DATA_DHT11]))
pinout
Out[183]:
In [2]:
json.dumps(pinout)
Out[2]:
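The TODO above asks for the pinout to be read from disk. A minimal sketch of that round-trip via the json module (the filename 'enerpi_pinout.json' is hypothetical, not part of enerpi):

import json
import os

PINOUT_FILE = 'enerpi_pinout.json'  # hypothetical filename, for illustration only

def save_pinout(pinout, path=PINOUT_FILE):
    # persist the sensor -> GPIO pin mapping as JSON
    with open(path, 'w') as f:
        json.dump(pinout, f)

def load_pinout(path=PINOUT_FILE, default=None):
    # read the mapping back, falling back to a default if the file is absent
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    return default

save_pinout(pinout)
load_pinout(default=pinout)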
In [184]:
import os
import pandas as pd
import glob
os.chdir('/Users/uge/Dropbox/PYTHON/PYPROJECTS/enerpi/enerpi/DATA/')
glob.glob('*')
Out[184]:
In [7]:
pd.read_hdf('new.h5', 'rms', stop=1000)
In [24]:
# note: recursive=True only has an effect with '**' patterns
glob.glob('*/*.h5', recursive=True), glob.glob('*.h5', recursive=True)
Out[24]:
In [36]:
import re
YEAR_MASK = '{}_YEAR_{:%Y}'
RG_YEAR_MASK = re.compile(r'(?P<name>.*)_YEAR_(?P<year>\d{4})')
MONTH_MASK = '{}_{:%Y_MONTH_%m}'
RG_MONTH_MASK = re.compile(r'(?P<name>.*)_(?P<year>\d{4})_MONTH_(?P<month>\d{2})')
DAY_MASK = '{}_{:%Y_%m_DAY_%d}'
RG_DAY_MASK = re.compile(r'(?P<name>.*)_(?P<year>\d{4})_(?P<month>\d{2})_DAY_(?P<day>\d{2})')
files = [f for f in glob.glob(os.path.join(os.getcwd(), '**'), recursive=True) if f.endswith('.h5')]
Out[36]:
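A quick round-trip check of the masks and their regexes (illustrative values):

ts = pd.Timestamp('2016-08-11')
name = MONTH_MASK.format('DATA', ts)                 # -> 'DATA_2016_MONTH_08'
m = RG_MONTH_MASK.search(name)
m.group('name'), m.group('year'), m.group('month')   # -> ('DATA', '2016', '08')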
In [38]:
# DATA_PATH and STORE_EXT are defined elsewhere in the session (not shown here)
class HDFCatalog(object):

    def __init__(self, base_path=DATA_PATH, preffix='DATA'):
        self.base_path = os.path.abspath(base_path)
        self.name = preffix
        # self.intervalos = None
        self.min_ts = None
        self.tree = self._init_catalog_tree()

    def __repr__(self):
        # __repr__ must return a string, not print it
        return 'HDFCatalog[{}->{}]'.format(self.name, self.base_path)

    def _get_paths_interval(self, ts_ini, ts_fin=None):
        # TODO glob the tree from DATA_PATH identifying files and time spans.
        files = [f.replace(self.base_path, '')
                 for f in glob.glob(os.path.join(self.base_path, '**'), recursive=True)
                 if f.endswith(STORE_EXT)]
        print(files)
        return files

    def _init_catalog_tree(self):
        # TODO glob the tree from DATA_PATH identifying files and time spans.
        files = [f.replace(self.base_path, '')
                 for f in glob.glob(os.path.join(self.base_path, '**'), recursive=True)
                 if f.endswith(STORE_EXT)]
        print(files)
        return files

    def distribute_data(self, data):
        # TODO split into complete months + days + remainder. Create directories
        pass

    def info_catalog(self):
        # TODO catalog info table: path, file, n_rows, ts_ini, ts_fin, completeness measures
        pass

    def get(self):
        # TODO load & concat data from dt_ini to dt_fin. With a text parser? Returns data
        pass

    def append(self):
        # TODO Append new data
        pass

    def put(self):
        # TODO Put new data
        pass

    def map(self):
        # TODO apply a function to the data, store by store?
        pass

    def archive(self):
        # TODO MONTHLY / DAILY / HOURLY data compaction?
        # From now back to Today, by hours
        # From Today to DAY_XX
        # From all DAY_XX to MONTH_XX
        pass

    def backup(self, path_backup, compact_data=None):
        # TODO backup to an alternative path, with compression and an option to
        # replicate the tree or compact it (per year, or as a whole)
        pass

    def export(self, export_to='csv'):
        # TODO export to CSV / append to mysql?
        pass

    def _extract_info_store(self, file):
        # TODO summarize the store: n_rows, ts_ini, ts_fin
        with pd.HDFStore(os.path.join(self.base_path, file), 'r') as store:
            pass
Out[38]:
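A sketch of what `_extract_info_store` might return once filled in, assuming each store keeps a time-indexed table under a known key ('rms' in the stores used below); `info_catalog` could then just concatenate these dicts (the standalone name `_extract_store_info` is hypothetical):

def _extract_store_info(path, key='rms'):
    # summarize one HDF store: file name, row count and time span
    with pd.HDFStore(path, 'r') as st:
        df = st[key]
    return {'file': os.path.basename(path), 'n_rows': len(df),
            'ts_ini': df.index[0], 'ts_fin': df.index[-1]}

# info_catalog sketch: one row per store
# pd.DataFrame([_extract_store_info(p) for p in files])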
In [41]:
# paths: list of .h5 store paths (collected earlier in the session, not shown)
dfs = [pd.read_hdf(p, 'rms') for p in paths[1:]]
dfs[0].head()
Out[41]:
In [46]:
df_raw = pd.read_hdf(paths[0], 'raw')
print(df_raw.head())
df_raw.columns = dfs[0].columns
df_raw.index
Out[46]:
In [45]:
%matplotlib inline
df_raw.plot()
Out[45]:
In [133]:
from prettyprinting import *
#from enerpi.database import append_delta_y_consumo
def _compress_data(data, verbose=False):
    if verbose:
        data.info()
    if not data.empty:
        data = data.copy().astype('float32')
        data['ref'] = data['ref'].astype('int16')
        data['ldr'] = (data['ldr'] * 1000).round(0).astype('int16')
        if verbose:
            data.info()
    return data

#df1, con1 = process_data(df_raw)
#con1
_compress_data(df_raw, verbose=True).head()
Out[133]:
In [130]:
#print_info(df1.dtypes)
#df1.info()
df_tot = pd.concat([df_raw] + dfs)
df_tot_c = _compress_data(df_tot, verbose=True)
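To quantify what the float32/int16 downcasting buys, the two frames' in-memory footprints can be compared directly:

print('raw:        {:.1f} KB'.format(df_tot.memory_usage(deep=True).sum() / 1000))
print('compressed: {:.1f} KB'.format(df_tot_c.memory_usage(deep=True).sum() / 1000))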
In [185]:
from enerpi.base import timeit
use_fletcher32 = True  # fletcher32 adds an HDF5 checksum filter to detect data corruption

@timeit('_save_hdf')
def _save_hdf(data, path, **kwargs):
    data.to_hdf(path, 'rms', **kwargs)
    print_cyan('STORE "{}"\t->\t{:.1f} KB'.format(path, os.path.getsize(path) / 1000))

_save_hdf(df_tot, 'data_juan_iborra_3d_default_big.h5')
_save_hdf(df_tot_c, 'data_juan_iborra_3d_default.h5')
_save_hdf(df_tot_c, 'data_juan_iborra_3d_zlib.h5', complevel=9, complib='zlib', fletcher32=use_fletcher32)
_save_hdf(df_tot_c, 'data_juan_iborra_3d_bzip2.h5', complevel=9, complib='bzip2', fletcher32=use_fletcher32)
_save_hdf(df_tot_c, 'data_juan_iborra_3d.h5', complevel=9, complib='blosc', fletcher32=use_fletcher32)
In [186]:
@timeit('_load_hdf')
def _load_hdf(path, **kwargs):
    data = pd.read_hdf(path, 'rms', **kwargs)
    print_cyan('STORE "{}"\t->\t{:.1f} KB'.format(path, os.path.getsize(path) / 1000))
    return data

df = _load_hdf('data_juan_iborra_3d_default_big.h5')
df = _load_hdf('data_juan_iborra_3d_default.h5')
df = _load_hdf('data_juan_iborra_3d_zlib.h5')
df = _load_hdf('data_juan_iborra_3d_bzip2.h5')
df = _load_hdf('data_juan_iborra_3d.h5')
In [231]:
@timeit('_process_data')
def _process_data(data, verbose=False):
    if not data.empty:
        if verbose:
            data.info()
        data = data.copy()
        deltas = pd.Series(data.index).diff().fillna(method='bfill')
        frac_hora = deltas / pd.Timedelta(hours=1)
        data['Wh'] = data.power * frac_hora.values
        data['delta'] = deltas.values
        data.loc[data['delta'] > pd.Timedelta('3s'), 'high_delta'] = True
        data.loc[data['delta'] > pd.Timedelta('1min'), 'execution'] = 1
        resampler = data[['power', 'Wh', 'delta', 'high_delta', 'execution']].resample('1h', label='left')
        consumo = (pd.DataFrame(resampler['Wh'].sum().rename('kWh')).fillna(0.) / 1000.).astype('float32')
        consumo['t_ref'] = (resampler['delta'].sum() / pd.Timedelta(hours=1)).astype('float32')
        consumo['n_jump'] = resampler['high_delta'].sum().fillna(0).astype('int16')
        consumo['n_exec'] = resampler['execution'].sum().fillna(0).astype('int32')
        consumo['p_max'] = resampler['power'].max().round(0).astype('float16')
        consumo['p_mean'] = resampler['power'].mean().round(0).astype('float16')
        consumo['p_min'] = resampler['power'].min().round(0).astype('float16')
        print_info(consumo.head())
        # fillna(False) first: astype(bool) alone would turn the NaN rows into True
        data['high_delta'] = data['high_delta'].fillna(False).astype(bool)
        data['execution'] = data['execution'].fillna(False).astype(bool)
        data.drop(['delta', 'Wh'], axis=1, inplace=True)
        if verbose:
            data.info()
            consumo.info()
        return data, consumo
    return data, None

df_p, consumo = _process_data(df, verbose=True)
consumo.to_hdf('data_juan_iborra_3d.h5', 'horas')
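A quick sanity check one can run here (not part of the original session): integrating the power series directly, with the same backfilled deltas as above, should match the hourly kWh summary up to float32 rounding:

dt_h = pd.Series(df.index).diff().fillna(method='bfill') / pd.Timedelta(hours=1)
total_kwh = (df.power * dt_h.values).sum() / 1000.
total_kwh, consumo['kWh'].sum()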
In [188]:
path_st = 'data_juan_iborra_3d.h5'
with pd.HDFStore(path_st, 'r') as st:
    print_info(st)
    data = st['rms']
    dh = st['horas']
print_red('{:.1f} KB'.format(os.path.getsize(path_st) / 1000))
In [232]:
# Split and classify:
ahora = pd.Timestamp.now()
current_year = ahora.year
current_month = ahora.month
current_day = ahora.day
ahora, current_year, current_month, current_day
'''
YEAR_MASK = 'YEAR_{:%Y}'
RG_YEAR_MASK = re.compile(r'(?P<name>.*)_YEAR_(?P<year>\d{4})')
MONTH_MASK = '{}_{:%Y_MONTH_%m}'
RG_MONTH_MASK = re.compile(r'(?P<name>.*)_(?P<year>\d{4})_MONTH_(?P<month>\d{2})')
DAY_MASK = '{}_{:%Y_%m_DAY_%d}'
RG_DAY_MASK = re.compile(r'(?P<name>.*)_(?P<year>\d{4})_(?P<month>\d{2})_DAY_(?P<day>\d{2})')
'''
DIR_CURRENT_MONTH = 'CURRENT_MONTH'
preffix = 'ENERPI_DATA'
DAY_MASK = '{}_{:%Y_%m_DAY_%d}'

def _make_index_path(ts, w_day=False):
    if w_day:
        p = os.path.join(DIR_CURRENT_MONTH, DAY_MASK.format(preffix, ts))
    else:
        # YEAR_MASK (from In [36]) takes two arguments: prefix and timestamp
        p = os.path.join(YEAR_MASK.format(preffix, ts), MONTH_MASK.format(preffix, ts))
    return p

def _clasifica_data(df):
    ahora = pd.Timestamp.now()
    ts_ini, ts_fin = df.index[0], df.index[-1]
    gb_años = df.groupby(pd.TimeGrouper(freq='A'))
    for ts_year, d_year in gb_años:
        gb_meses = d_year.groupby(pd.TimeGrouper(freq='M'))
        for ts_month, d_month in gb_meses:
            if (ts_year.year == ahora.year) and (ts_month.month == ahora.month):
                # CURRENT MONTH
                print_red('# CURRENT MONTH')
                gb_dias = d_month.groupby(pd.TimeGrouper(freq='D', closed='left', label='left'))
                for ts_day, d_day in gb_dias:
                    if ts_day.day == ahora.day:
                        # TODAY
                        print_magenta('# TODAY')
                    else:
                        # ARCHIVE DAY
                        print_red('# ARCHIVE DAY {:%Y-%m-%d}'.format(ts_day))
                        p = _make_index_path(ts_day, w_day=True)
                        d_day_p, c_day = _process_data(d_day)
                        print_info(p)
                        print_cyan(d_day.head(2))
                        print_cyan(d_day_p.tail(2))
                        print_red(c_day.head(2))
            else:
                # ARCHIVE MONTH
                print_red('# ARCHIVE MONTH')
                p = _make_index_path(ts_month, w_day=False)
                d_month_p, c_month = _process_data(d_month)
    print('DONE!')

_clasifica_data(data)
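For reference, the relative paths `_make_index_path` builds with preffix='ENERPI_DATA' (separator as on the macOS session above):

_make_index_path(pd.Timestamp('2016-08-11'), w_day=True)
# -> 'CURRENT_MONTH/ENERPI_DATA_2016_08_DAY_11'
_make_index_path(pd.Timestamp('2016-07-01'))
# -> 'ENERPI_DATA_YEAR_2016/ENERPI_DATA_2016_MONTH_07'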
In [254]:
pd.Timestamp.fromtimestamp(os.path.getmtime(path_st))
Out[254]:
In [250]:
def _get_paths_interval(ts_ini, ts_fin=None):
    ts_ini = pd.Timestamp(ts_ini)
    ts_fin = pd.Timestamp(ts_fin) if ts_fin else pd.Timestamp.now()
    periods = (ts_fin.year * 12 + ts_fin.month) - (ts_ini.year * 12 + ts_ini.month)
    # TODO build the store paths from this month index (see the sketch below)
    return pd.date_range(start=ts_ini.date(), periods=periods + 1, freq='M')

_get_paths_interval('2016-02-03', ts_fin=None)
_get_paths_interval('2016-02-01', ts_fin=None)
_get_paths_interval('2016-01-31', ts_fin=None)
_get_paths_interval('2016-01-31', ts_fin='2016-02-01')
_get_paths_interval('2016-08-01', ts_fin='2016-08-11')
Out[250]:
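A sketch of the finished function, mapping the month index onto `_make_index_path` from the previous cell; this is a guess at the intent behind the unfinished `paths` comprehension, and the name `_get_month_paths` is hypothetical:

def _get_month_paths(ts_ini, ts_fin=None):
    # one relative store path per month in [ts_ini, ts_fin]
    ts_ini = pd.Timestamp(ts_ini)
    ts_fin = pd.Timestamp(ts_fin) if ts_fin else pd.Timestamp.now()
    periods = (ts_fin.year * 12 + ts_fin.month) - (ts_ini.year * 12 + ts_ini.month)
    index = pd.date_range(start=ts_ini.date(), periods=periods + 1, freq='M')
    return [_make_index_path(ts) for ts in index]

_get_month_paths('2016-06-15', '2016-08-11')
# -> ['ENERPI_DATA_YEAR_2016/ENERPI_DATA_2016_MONTH_06',
#     'ENERPI_DATA_YEAR_2016/ENERPI_DATA_2016_MONTH_07',
#     'ENERPI_DATA_YEAR_2016/ENERPI_DATA_2016_MONTH_08']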
In [279]:
#last = pd.read_hdf('2016_08_10.h5', 'rms')
p1 = last.copy()
p1.index = p1.index - pd.Timedelta('60D')
p2 = last.copy()
p2.index = p2.index - pd.Timedelta('180D')
p3 = last.copy()
# note: offsets from p2's already-shifted index, i.e. 540 days before 'last' (early 2015)
p3.index = p2.index - pd.Timedelta('360D')
p1.to_hdf('pru1_jun.h5', 'rms')
p2.to_hdf('pru2.h5', 'rms')
p3.to_hdf('pru3_15.h5', 'rms')
p3.info()
In [270]:
pd.read_hdf('DATA_YEAR_2015/DATA_2015_MONTH_02.h5', 'rms').info()
In [274]:
last.plot()
Out[274]:
In [277]:
last.loc['2016-08-11 00:00':'2016-08-11 11:00']
Out[277]:
In [221]:
for day, g in gb_años:
    print_red(day.year)
    print_info(g.head())
for day, g in gb_meses:
    print_red(day)
    print_red(day.month)
    print_info(g.head())
    print_cyan(g.tail())
In [294]:
horas = pd.read_hdf('CURRENT_MONTH/DATA_2016_08_DAY_11.h5', 'hours')
rms = pd.read_hdf('CURRENT_MONTH/DATA_2016_08_DAY_11.h5', 'rms')
rms.info()
In [326]:
# keep only the (data, key) pairs whose data is present
l = list(filter(lambda x: x[0] is not None, zip([data, None, None],
                                                ['key_raw', 'key_summary', 'key_summary_extra'])))
list(zip(*l))[0], list(zip(*l))[1]
Out[326]:
In [300]:
deltas = pd.Series(data.index).diff().fillna(method='bfill')
frac_hora = deltas / pd.Timedelta(hours=1)
data['Wh'] = data.power * frac_hora.values
data['delta'] = deltas.values
data.loc[data['delta'] > '3s', 'high_delta'] = True
data.dtypes
Out[300]:
In [306]:
data['delta'] = pd.Series(data.index).diff().fillna(method='bfill').dt.total_seconds().values
data['Wh'] = data.power * data.delta / 3600
data.loc[data['delta'] > 3, 'high_delta2'] = True
data.loc[data['delta'] > 60, 'execution'] = 1
print_cyan(data.head())
data[data['delta'] > 3]
Out[306]:
In [117]:
df_s = df1.copy()
df_s.noise = df_s.noise.round(5).astype('float32')
df_s.power = df_s.power.round(2).astype('float32')
df_s.Wh = df_s.Wh.astype('float32')
df_s.ref = df_s.ref.astype('int16')
df_s.ldr = (df_s.ldr * 1000).astype('int16')
df_s.delta = df_s.delta.dt.total_seconds().round(3).astype('float16')
df_s.high_delta = df_s.high_delta.astype(bool)
df_s.info()
In [122]:
#ts = df1[df1.delta > '1min'].index.values[0]
#df1.loc[ts - pd.Timedelta('18min'):ts + pd.Timedelta('1min')]
#df_s.high_delta.describe()
df1.drop(['Wh', 'delta', 'high_delta', 'execution'], axis=1).astype('float32').info()
In [123]:
df1.drop(['Wh', 'delta', 'high_delta', 'execution'], axis=1).astype('float32').plot()
Out[123]:
In [ ]: