In [6]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
from IPython.display import Image, HTML

import random
import math
import pysolar
import datetime as dt
import pytz
import matplotlib.dates as mpd
import scipy
import scipy.signal as signal
import sklearn
import numexpr as ne

from datacharm import *
from datacharm.dataplots import tableau20, ch_color

from enerpi.base import timeit
from enerpi.api import enerpi_data_catalog
from enerpi.enerplot import plot_tile_last_24h, plot_power_consumption_hourly
from enerpi.sunposition import sun_position


LAT = 38.631463
LONG = -0.866402
ELEV_M = 500
TZ = pytz.timezone('Europe/Madrid')
FS = (16, 10)
sns.set_style('ticks')
pd.set_option('display.width', 120)


def _tslim(ax, h0=12, hf=22, m0=0, mf=0):
    xlim = [mpd.num2date(x, tz=TZ) for x in ax.get_xlim()]
    new = [mpd.date2num(d.replace(hour=h, minute=m)) for d, h, m in zip(xlim, [h0, hf], [m0, mf])]
    ax.set_xlim(new)
    return ax



# Catálogo y lectura de todos los datos.
cat = enerpi_data_catalog()
data, data_s = cat.get_all_data()
print_info(data_s.tail())

LDR = pd.DataFrame(data.ldr).tz_localize(TZ)
print_cyan(LDR.describe().T.astype(int))
LDR.hist(bins=(LDR.max() - LDR.min() + 1).values[0] // 5, log=True, figsize=(18, 6), lw=0, color=tableau20[2])
plt.plot()

POWER = pd.DataFrame(data.power).tz_localize(TZ)
print_cyan(POWER.describe().T.astype(int))
POWER.hist(bins=int((POWER.max() - POWER.min() + 1).values[0] // 5), log=True, figsize=(18, 6), lw=0, color=tableau20[4])


***TIMEIT get_all_data TOOK: 2.065 s
                          kWh     t_ref  n_jump  n_exec  p_max  p_mean  p_min
ts                                                                           
2016-09-24 09:00:00  0.263024  0.999838       0       0  382.0   263.0  217.0
2016-09-24 10:00:00  0.272466  1.000088       0       0  857.0   272.0  236.0
2016-09-24 11:00:00  0.263264  0.999959       0       0  319.0   263.0  231.0
2016-09-24 12:00:00  0.288171  0.999889       0       0  909.0   288.0  235.0
2016-09-24 13:00:00  0.238090  0.828952       0       0  374.0   287.0  240.0
       count  mean  std  min  25%  50%  75%  max
ldr  3585678   249  231   11   32  161  474  748
         count  mean  std  min  25%  50%  75%   max
power  3585678   343  315  112  225  265  358  5304
Out[6]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1179db240>]], dtype=object)

In [1126]:
sns.kdeplot(POWER.power.values, shade=True, gridsize=6000)


/Users/uge/anaconda/envs/py35/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[1126]:
<matplotlib.axes._subplots.AxesSubplot at 0x190d8b4e0>

In [5]:
sns.palplot(tableau20)



In [34]:
homog_power = POWER.resample('1s').mean().fillna(method='bfill', limit=3).fillna(-1) #.round().astype('int16')
homog_power.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3726199 entries, 2016-08-12 10:46:25+02:00 to 2016-09-24 13:49:43+02:00
Freq: S
Data columns (total 1 columns):
power    float32
dtypes: float32(1)
memory usage: 42.6 MB

In [1268]:
POWER.power.round().value_counts()


Out[1268]:
225.0     40082
226.0     39897
227.0     39706
224.0     39609
223.0     39010
228.0     38825
222.0     38525
221.0     37295
229.0     37273
220.0     36522
230.0     35738
219.0     35256
218.0     34276
231.0     33899
217.0     32920
232.0     32222
216.0     31744
215.0     30570
233.0     30537
207.0     30457
208.0     30284
209.0     30049
214.0     30032
206.0     29996
210.0     29921
211.0     29740
213.0     29729
212.0     29493
234.0     29071
205.0     28588
          ...  
4065.0        1
4066.0        1
4069.0        1
4070.0        1
4072.0        1
4076.0        1
4083.0        1
4022.0        1
4019.0        1
4018.0        1
3972.0        1
3862.0        1
3865.0        1
3885.0        1
3893.0        1
3902.0        1
3928.0        1
3954.0        1
3963.0        1
3983.0        1
4016.0        1
3988.0        1
3995.0        1
3997.0        1
3999.0        1
4004.0        1
4005.0        1
4007.0        1
4012.0        1
112.0         1
Name: power, dtype: int64

In [41]:
power_day13 = homog_power.loc['2016-08-13']
ax = power_day13.plot(lw=1, alpha=.8)

power_day14 = homog_power.loc['2016-08-14']
power_day14.plot(ax=ax, lw=1, alpha=.8)


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x123fc60f0>

In [30]:
power_day13.resample('1s').mean().fillna(method='bfill', limit=3).fillna(-1).round().astype(int)


Out[30]:
power    5242
dtype: int64

In [42]:
power_standby = homog_power.loc['2016-08-29']
power_standby.plot(lw=1, alpha=.8)


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x126dfcd30>

In [103]:
mf5 = pd.Series(signal.medfilt(power_standby.power, kernel_size=5), index=power_standby.index, name='mf_5')
mf11 = pd.Series(signal.wiener(power_standby.power, 15), index=power_standby.index, name='wiener_11')

t0, tf = '3:10', '3:20'
ax = power_standby.between_time(t0, tf).plot(lw=1)
mf5.between_time(t0, tf).plot(ax=ax)
mf11.between_time(t0, tf).plot(ax=ax)
plt.show()


t0, tf = '3:20', '3:25'
ax = power_standby.between_time(t0, tf).plot(lw=1)
mf11.between_time(t0, tf).plot(ax=ax)
mf5.between_time(t0, tf).plot(ax=ax)


Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x14f6a8e10>

In [100]:
power_standby.hist(bins=200, lw=0, log=True, alpha=.5)
mf11.hist(bins=200, lw=0, log=True, alpha=.5)


Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x16f3594e0>

In [104]:
N = len(power_standby)
out_fft = np.fft.rfft((power_standby.power - power_standby.power.mean()).values)
out_fft_mf = np.fft.rfft((mf5 - mf5.mean()).values)
out_fft_mf2 = np.fft.rfft((mf11 - mf11.mean()).values)

plt.figure(figsize=(18, 15))
corte = 350 # seg

plt.subplot(3, 2, 1)
plt.plot(np.abs(out_fft[:corte + 1]), lw=1, alpha=.7)

plt.subplot(3, 2, 2)
plt.plot(np.abs(out_fft[corte:N//2]), lw=1, alpha=.7)

plt.subplot(3, 2, 3)
plt.plot(np.abs(out_fft_mf[:corte + 1]), lw=1, alpha=.7)

plt.subplot(3, 2, 4)
plt.plot(np.abs(out_fft_mf[corte:N//2]), lw=1, alpha=.7)

plt.subplot(3, 2, 5)
plt.plot(np.abs(out_fft_mf2[:corte + 1]), lw=1, alpha=.7)

plt.subplot(3, 2, 6)
plt.plot(np.abs(out_fft_mf2[corte:N//2]), lw=1, alpha=.7);



In [62]:
plt.plot(np.abs(out_fft_mf2[:60]), lw=1, alpha=.7);



In [ ]:

Extracting POWER events


In [312]:
@timeit('process_power', verbose=True)
def process_power(df_homog_power, 
                  kernel_size=15, roll_window_event=9, threshold_event=10,
                  margin_td=pd.Timedelta('120s'), 
                  margin_ant=pd.Timedelta('3s')):
    """
    Tomando una pd.DataFrame de 'POWER' homogéneo (con resample mean a 1s), 
        * aplica filtrado 'Wiener' a la señal raw,
        * calcula ∆_wiener, deltas de power entre samples
        * calcula ∆'_wiener con roll_window_event centrado, detectando máximos y mínimos locales 
        en los cambios de ∆_wiener, tomándolos como eventos de cambio.
        * Agrupa los eventos de cambio en subconjuntos separados, al menos, un pd.Timedelta 'margin_td'.
        (forma el evento desde inicio - pd.Timedelta 'margin_ant')
        
    return:
        df_power, list_index_events, df_feats_grupos
    """

    def _is_event(x):
        mid = len(x) // 2
        xmin = np.min(x)
        xmax = np.max(x)
        if ((xmax - xmin) > threshold_event):
            if (x[mid] == xmin):
                return -1
            elif (x[mid] == xmax):
                return 1
        return 0

    df_power = df_homog_power.copy()
    df_power['wiener'] = signal.wiener(df_power.power, kernel_size).astype('float32')
    df_power['medfilt'] = signal.medfilt(df_power.power, kernel_size).astype('float32')
    df_power['roll_std'] = df_power.wiener.rolling(11).std().astype('float32')
    
    df_power['deltas'] = df_power.wiener.diff().fillna(0)
    df_power['eventos'] = df_power['deltas'].diff().fillna(0).rolling(roll_window_event, center=True
                                                                     ).apply(_is_event).fillna(0).astype('int16')

    list_index_events = []
    last_event_init = None
    events_in_interval = []
    for t in df_power[df_power['eventos'] != 0].index:
        if last_event_init is None:
            events_in_interval.append(t)
        elif t > last_event_init + margin_td:
            list_index_events.append(events_in_interval)
            events_in_interval = [t]
        else:
            events_in_interval.append(t)
        last_event_init = t
    print_info('Nº de grupos de eventos: {} (para un nº de eventos total de: {})'
               .format(len(list_index_events), len(df_power[df_power['eventos'] != 0])))
    
    sub_events = []
    cols = ['power', 'medfilt', 'wiener', 'deltas', 'eventos']
    cols_feat_grupos = ['index_ge', 't0', 'duracion', 'wiener_min', 'wiener_mean', 'wiener_max', 'wiener_std', 'wiener_median', 
                        'eventos_sum', 'eventos_abs_sum', 'deltas_max', 'deltas_min', 'deltas_sum', 'deltas_abs_sum']    
    for i, ts in enumerate(list_index_events):
        t0, tf = ts[0] - margin_ant, ts[-1] + margin_td
        d_i = df_power.loc[t0:tf, cols]
        resumen_i = (i, d_i.index[0], len(d_i), 
                     d_i.wiener.min(), d_i.wiener.mean(), d_i.wiener.max(), d_i.wiener.std(), d_i.wiener.median(), 
                     d_i.eventos.sum(), d_i.eventos.abs().sum(), 
                     d_i.deltas.max(), d_i.deltas.min(), d_i.deltas.sum(), d_i.deltas.abs().sum())
        sub_events.append(resumen_i)
    df_feats_grupos = pd.DataFrame(sub_events, columns=cols_feat_grupos).set_index(cols_feat_grupos[0])
    return df_power, list_index_events, df_feats_grupos


df_power, list_index_events, df_feats_grupos = process_power(power_standby)
df_feats_grupos.describe()


Nº de grupos de eventos: 17 (para un nº de eventos total de: 76)
process_power TOOK: 0.617 s
Out[312]:
duracion wiener_min wiener_mean wiener_max wiener_std wiener_median eventos_sum eventos_abs_sum deltas_max deltas_min deltas_sum deltas_abs_sum
count 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000
mean 135.411765 199.682035 223.403936 625.716531 48.622832 217.295883 -0.411765 3.823529 300.269726 -321.312862 11.041497 917.991433
std 13.647613 7.877142 12.715661 271.574096 30.633446 10.808972 0.618347 1.878673 206.774374 216.144354 10.972383 546.265832
min 125.000000 183.800476 195.313873 199.724258 1.908688 195.211060 -1.000000 2.000000 4.854843 -538.085815 -7.720901 60.890289
25% 126.000000 191.858337 214.732132 228.876953 3.838038 206.394714 -1.000000 2.000000 6.840714 -475.280762 6.910400 210.743942
50% 133.000000 201.769867 221.992096 782.892639 65.986412 222.654984 0.000000 3.000000 332.166199 -411.596222 14.278900 1240.151489
75% 139.000000 202.746536 235.968719 797.213623 67.981680 226.001053 0.000000 5.000000 468.395691 -7.878113 19.508942 1260.545532
max 184.000000 211.386139 236.496643 833.998474 72.090804 229.557693 1.000000 9.000000 534.843872 -1.787735 25.805740 1352.484741

In [510]:
# Análisis frecuencial de eventos / comparación de fft's

# Separación de eventos por duración. KMeans++ de k clusters
k = 9
km_dur_event = KMeans(n_clusters=k, n_jobs=-1).fit(df_feats_grupos_all_2.duracion.values.reshape(-1, 1))
print_info(sorted(km_dur_event.cluster_centers_))

amplitudes = []
grupo_durac = []
for i, ts in enumerate(lista_grupos_eventos_total_2):
    t0, tf = ts[0] - margin_ant, ts[-1] + margin_td
    d_i = POWER_FILT_2.loc[t0:tf, cols]
    if len(d_i) > 5:
        w = blackman(len(d_i))
        amplitudes_i = np.sqrt(np.abs(scipy.fftpack.rfft(d_i.power * w)))[:(len(d_i) // 2)]
        #amplitudes_i = np.sqrt(np.abs(np.fft.rfft(d_i.power)))[:(len(d_i) // 2)]
        
        amplitudes.append(amplitudes_i)
        grupo_durac.append(km_dur_event.labels_[i])
        

colors = tableau20[::2][:len(km_dur_event.cluster_centers_)]
plt.figure(figsize=(18, 12))
for a, g in zip(amplitudes, grupo_durac):
    color = colors[g]
    if len(a) > 200:
        params = dict(lw=.75, alpha=.3, color=color)
    else:
        params = dict(lw=.1, alpha=.01, color=color)
    plt.plot(a[2:], **params)
plt.xlim(0, 600);


[array([ 44.27913015]), array([ 94.02152318]), array([ 206.76315789]), array([ 478.75609756]), array([ 937.25]), array([ 2040.]), array([ 3634.2]), array([ 4933.]), array([ 6104.])]

In [511]:
f, axes = plt.subplots(1, 2, figsize=(20, 12))

ymax, ne = 0, 0
for a, g in zip(amplitudes, grupo_durac):
    color = colors[g]
    if len(a) > 300:
        params = dict(lw=.75, alpha=.35, color=color)
        axes[0].plot(a[2:], **params)
        ne += 1
        ymax = max(ymax, max(a[2:]))
axes[0].annotate('{} Eventos (de ∆T > {} s)'.format(ne, 300), (2000, ymax * .75), ha='left')

ymax, ne = 0, 0
for a, g in zip(amplitudes, grupo_durac):
    color = colors[g]
    if len(a) <= 300:
        if len(a) <= 100:
            params = dict(lw=.25, alpha=.03, color=color)
        else:
            params = dict(lw=.5, alpha=.1, color=color)
        axes[1].plot(a[2:], **params)
        ne += 1
        ymax = max(ymax, max(a[2:]))
axes[1].annotate('{} Eventos (de ∆T ≤ {} s)'.format(ne, 300), (200, ymax * .75), ha='left')
axes[1].set_xlim(0, 300);