In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
from IPython.display import Image, HTML

import random
import math
import pysolar
import datetime as dt
import pytz
import matplotlib.dates as mpd
import scipy
from scipy import signal
from scipy.signal import medfilt
import numexpr as ne

from datacharm import *
from datacharm.dataplots import tableau20, ch_color

from enerpi.base import timeit
from enerpi.api import enerpi_data_catalog
from enerpi.enerplot import plot_tile_last_24h, plot_power_consumption_hourly
from enerpi.process import separa_ldr_artificial_natural, get_solar_day
from enerpi.radsolar import ajuste_ldr_con_barrido_irrad_inc, local_maxima_minima


LAT = 38.631463
LONG = -0.866402
ELEV_M = 500
TZ = pytz.timezone('Europe/Madrid')
FS = (16, 10)


def _tslim(ax, h0=12, hf=22, m0=0, mf=0):
    xlim = [mpd.num2date(x, tz=TZ) for x in ax.get_xlim()]
    new = [mpd.date2num(d.replace(hour=h, minute=m)) for d, h, m in zip(xlim, [h0, hf], [m0, mf])]
    ax.set_xlim(new)
    return ax


def set_sun_times(df, tz=TZ, lat_deg=LAT, long_deg=LONG, offset='5min'):
    if df.index[-1].date() == df.index[0].date():
        # Same day:
        ts_day = pd.Timestamp(df.index[0].date(), tz=tz)
        sunrise, sunset = pysolar.util.get_sunrise_sunset(lat_deg, long_deg, ts_day + pd.Timedelta('12h'))
        delta = pd.Timedelta(offset)
        sunrise, sunset = [sunrise - delta, sunset + delta]
        df['sun'] = False
        df.loc[sunrise:sunset, 'sun'] = True
        return df
    else:
        print_err('IMPLEMENTAR!')
        assert()


sns.set_style('whitegrid')        

# Catálogo y lectura de todos los datos.
cat = enerpi_data_catalog()
data, data_s = cat.get_all_data()
print_info(data_s.tail())

LDR = pd.DataFrame(data.ldr).tz_localize(TZ)
print_cyan(LDR.describe().T.astype(int))
LDR.hist(bins=(LDR.max() - LDR.min() + 1).values[0] // 5, log=True, figsize=(18, 8))

Image('/Users/uge/Desktop/LDR_analysis/LDR_análisis_día_2016_09_12.png')


 ==> Librerías, clases y métodos cargados:
 +++ "dt"                           +++ "os" 
 +++ "json" v:2.0.9                 +++ "pd" (pandas) v:0.18.1
 +++ "locale"                       +++ "plt" 
 +++ "math"                         +++ "re" v:2.2.1
 +++ "np" (numpy) v:1.11.1          +++ "sns" (seaborn) v:0.7.0
 +++ "sys" 
 ** "Colormap", "Line2D", "Normalize", "OrderedDict", "PathPatch", "namedtuple", "time"
 ==> Pretty printing funcs:
print_blue, print_bold, print_boldu, print_cyan, print_err, print_green, print_grey, print_greyb, print_info, print_infob, print_magenta, print_ok, print_red, print_redb, print_secc, print_tree_dict, print_warn, print_white, print_yellow, print_yellowb, printcolor
If you are in a jupyter notebook, insert this:
%matplotlib inline
%config InlineBackend.figure_format='retina'
If you are working with GEO data, insert this:
import geopandas as gpd
import shapely.geometry as sg
import cartopy.crs as ccrs
/Users/uge/anaconda/envs/py35/lib/python3.5/site-packages/matplotlib/__init__.py:1350: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)
***TIMEIT get_all_data TOOK: 1.085 s
                          kWh     t_ref  n_jump  n_exec   p_max  p_mean  p_min
ts                                                                            
2016-09-14 09:00:00  0.280975  0.999865       0       0   865.0   281.0  185.0
2016-09-14 10:00:00  0.348163  1.000116       0       0   422.0   348.0  273.0
2016-09-14 11:00:00  0.347181  1.000035       0       0   883.0   347.0  277.0
2016-09-14 12:00:00  0.312149  0.999837       0       0   391.0   312.0  269.0
2016-09-14 13:00:00  0.144251  0.331557       0       0  2564.0   435.0  284.0
       count  mean  std  min  25%  50%  75%  max
ldr  2725278   251  234   11   33  162  478  748
Out[2]:

In [14]:
pois = pd.read_csv('/Users/uge/Desktop/LDR_analysis/POIS_labeled.h5_.csv').set_index('ts_fin')

labeled = pois[pois.solo_natural.notnull()].copy()
labeled.solo_natural = labeled.solo_natural.astype(bool)
labeled.artificial_princ = labeled.artificial_princ.astype(bool)

labeled.head()


Out[14]:
intervalo fr_sun ini_sun fin_sun seconds altitud ldr_mean ldr_median pendiente_tot inicio fin pendiente_i pendiente_f pendiente_last_mf ts_ini step is_poi_to_label solo_natural artificial_princ
ts_fin
2016-08-12 11:36:50 1.0 1.0 1.0 1.0 3005.0 44.0 648.910150 649.0 -0.051581 656.0 641.0 -0.010000 -0.171717 -0.048414 2016-08-12 10:46:50 656.0 True False False
2016-08-12 11:42:55 2.0 1.0 1.0 1.0 365.0 49.0 627.041096 634.0 -1.780822 610.0 636.0 -1.780822 -1.780822 -0.944444 2016-08-12 11:36:55 -31.0 True False False
2016-08-12 11:59:50 4.0 1.0 1.0 1.0 965.0 51.0 630.113990 630.0 0.020725 636.0 632.0 -0.030000 0.040404 0.104167 2016-08-12 11:43:50 198.0 True False False
2016-08-12 12:07:15 6.0 1.0 1.0 1.0 355.0 54.0 422.042254 426.0 0.281690 411.0 426.0 0.281690 0.281690 0.157143 2016-08-12 12:01:25 -203.0 True True False
2016-08-12 12:19:05 10.0 1.0 1.0 1.0 310.0 55.0 469.903226 470.5 -1.354839 482.0 464.0 -1.354839 -1.354839 -0.803279 2016-08-12 12:14:00 56.0 True True False

In [21]:
#sns.regplot(['altitud', 'step'], 'solo_natural', data=labeled, logistic=True)
labeled.groupby('solo_natural').mean()


Out[21]:
intervalo fr_sun ini_sun fin_sun seconds altitud ldr_mean ldr_median pendiente_tot inicio fin pendiente_i pendiente_f pendiente_last_mf step is_poi_to_label artificial_princ
solo_natural
False 468.128205 0.320399 0.350427 0.290598 989.487179 4.760684 373.583583 378.901709 -3.623279 383.179487 375.170940 -3.483649 -3.596936 -1.701882 194.880342 1.0 0.435897
True 440.365639 0.667182 0.665198 0.709251 7505.947137 26.995595 285.235240 281.715859 0.724429 281.700441 293.295154 0.551183 1.009082 0.364263 -88.766520 1.0 0.185022

In [24]:
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score

labeled.columns


Out[24]:
Index(['intervalo', 'fr_sun', 'ini_sun', 'fin_sun', 'seconds', 'altitud', 'ldr_mean', 'ldr_median', 'pendiente_tot', 'inicio', 'fin', 'pendiente_i', 'pendiente_f',
       'pendiente_last_mf', 'ts_ini', 'step', 'is_poi_to_label', 'solo_natural', 'artificial_princ'],
      dtype='object')

In [72]:
y_art_p, X_art_p = dmatrices('''artificial_princ ~ step + pendiente_i + pendiente_f + altitud + ldr_median + inicio + fin +
                  C(solo_natural) + C(fin_sun)''',
                  labeled, return_type="dataframe")
# flatten y into a 1-D array
#y = np.ravel(y)
print(list(X.columns), y_art_p.shape, X_art_p.shape)

# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X_art_p, y_art_p.iloc[:,1])

# check the accuracy on the training set
print_ok(model.score(X_art_p, y_art_p.iloc[:,0].values))
print_ok(model.score(X_art_p, y_art_p.iloc[:,1].values))
y_art_p.mean()


['Intercept', 'C(solo_natural)[T.True]', 'C(fin_sun)[T.1.0]', 'step', 'pendiente_i', 'pendiente_f', 'altitud', 'ldr_median', 'inicio', 'fin'] (344, 2) (344, 10)
0.125
0.875
Out[72]:
artificial_princ[False]    0.729651
artificial_princ[True]     0.270349
dtype: float64

In [88]:
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_art_p, y_art_p.iloc[:,1], test_size=0.15, random_state=0)
model2 = LogisticRegression()
print_red(model2.fit(X_train, y_train))

# predict class labels for the test set
predicted = model2.predict(X_test)
print (predicted.astype(bool))

# generate class probabilities
probs = model2.predict_proba(X_test)
print((probs[:,1] * 100).astype(int))

# generate evaluation metrics
print_ok(metrics.accuracy_score(y_test, predicted))
print_ok(metrics.roc_auc_score(y_test, probs[:, 1]))

print(metrics.confusion_matrix(y_test, predicted))
print_info(metrics.classification_report(y_test, predicted))

# evaluate the model using 10-fold cross-validation
scores = cross_val_score(LogisticRegression(), X_art_p, y_art_p.iloc[:,1], scoring='accuracy', cv=10)
print_info(scores)
print_ok(scores.mean())


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
[False False False  True False  True False False False False False  True
  True False False False False False False False  True False False False
  True False False False False False False False False False  True False
 False False False  True False False False  True  True False False False
 False  True False False]
[10  0 14 90 13 90  0  6  2  0  0 90 83  1 12 16 14 28  3  0 88  1 47  3 88
  2  0 46  3  0 31  6 14 35 91 44 44  0 42 51  7 34 44 98 84  0  0  3  6 76
 25 33]
0.846153846154
0.853781512605
[[34  1]
 [ 7 10]]
             precision    recall  f1-score   support

        0.0       0.83      0.97      0.89        35
        1.0       0.91      0.59      0.71        17

avg / total       0.86      0.85      0.84        52

[ 0.86111111  0.85714286  0.88571429  0.82352941  0.85294118  0.85294118
  0.70588235  0.88235294  0.97058824  0.76470588]
0.845690943044

In [59]:
pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_))))


Out[59]:
0 1
0 Intercept [0.992081919337]
1 C(solo_natural)[T.True] [-0.855045760952]
2 C(fin_sun)[T.1.0] [0.872525675758]
3 step [-0.000418968057043]
4 pendiente_i [0.636800669707]
5 pendiente_f [-0.662661369119]
6 altitud [0.109164114236]
7 ldr_median [-0.0153136404396]
8 inicio [0.00183072161387]
9 fin [0.00677104774147]

In [65]:
pd.DataFrame(list(zip(X.columns, model.coef_.T[:,0])))
#model.coef_.T


Out[65]:
0 1
0 Intercept 0.992082
1 C(solo_natural)[T.True] -0.855046
2 C(fin_sun)[T.1.0] 0.872526
3 step -0.000419
4 pendiente_i 0.636801
5 pendiente_f -0.662661
6 altitud 0.109164
7 ldr_median -0.015314
8 inicio 0.001831
9 fin 0.006771

In [5]:
# MULTI Clustering - EVENTOS
eventos = pd.read_hdf('../enerpi/eventos_multi_clustering.h5', 'eventos').iloc[1:-1]
clusters = pd.read_hdf('../enerpi/eventos_multi_clustering.h5', 'clustering')
print('eventos.shape, clusters.shape: ', eventos.shape, clusters.shape)

grupo = clusters.groupby('super_cluster').get_group('cluster_subidas_7').drop('super_cluster', axis=1).reset_index()
grupo.set_index(['k_km', 'k_pca', 'k_dbscan', 'k_afp', 'k_ag']).sort_index().head(10)


eventos.shape, clusters.shape:  (1379, 23) (1379, 6)
Out[5]:
ts
k_km k_pca k_dbscan k_afp k_ag
0 0 -1 3 5 2016-09-13 11:24:27+02:00
0 3 5 2016-08-12 13:04:05+02:00
5 2016-08-24 10:08:55+02:00
5 2016-08-30 11:48:14+02:00
5 2016-09-13 10:23:28+02:00
5 2016-09-14 11:38:51+02:00
3 -1 1 5 2016-08-12 16:02:24+02:00
5 2016-08-12 16:02:30+02:00
5 2016-08-22 09:42:07+02:00
5 2016-09-09 11:46:17+02:00

In [6]:
num_cluster_combined = grupo.groupby(['k_km', 'k_pca', 'k_dbscan', 'k_afp', 'k_ag']).ts.count().sort_values(ascending=False)
identif_acum = num_cluster_combined.cumsum()
principales = num_cluster_combined[identif_acum < len(grupo) // 10 * 8]

print_cyan(principales)
ax = identif_acum.plot()
ax.hlines([len(grupo) // 10 * 8], 0, 100)

idx_comb_i = grupo.set_index(principales.index.names).sort_index().loc[principales.index[0], 'ts']
eventos.loc[pd.DatetimeIndex(idx_comb_i.reset_index(drop=True), tz=TZ)].head()


k_km  k_pca  k_dbscan  k_afp  k_ag
0     3       0        9      5       112
3     0       0        3      5        85
5     5       0        7      0        69
6     5       0        10     0        59
      1       0        10     0        37
2     4       3        5      4        26
6     5       0        7      0        26
5     1       0        7      0        25
6     3       0        9      5        20
1     2       2        4      1        18
4     6      -1        8      3        15
0     5       0        7      0        15
5     1       0        10     0        11
1     2      -1        4      1        11
Name: ts, dtype: int64
Out[6]:
steps median_filter slope_roll_-11 mf_roll_-11 slope_roll_-8 mf_roll_-8 slope_roll_-5 mf_roll_-5 slope_roll_-2 mf_roll_-2 ... mf_roll_4 slope_roll_7 mf_roll_7 slope_roll_10 mf_roll_10 es_subida altitude azimuth es_fin es_init
2016-08-12 11:43:37+02:00 80.5 472.0 9.142857 553.0 4.142857 528.0 4.857143 525.0 12.428571 513.0 ... 438.0 0.000000 438.0 0.000000 438.0 True 50.3 -63.2 False False
2016-08-12 12:07:19+02:00 11.5 436.0 0.000000 442.0 0.285714 442.0 0.857143 442.0 2.000000 439.0 ... 427.0 0.000000 427.0 0.142857 427.0 True 54.3 -56.5 False False
2016-08-12 12:13:15+02:00 16.5 405.0 0.714286 419.0 0.857143 417.0 1.142857 416.0 3.000000 410.0 ... 394.0 1.714286 394.0 3.142857 387.0 True 55.2 -54.7 False False
2016-08-12 12:13:51+02:00 43.0 444.0 0.571429 482.0 1.428571 479.0 2.857143 473.0 6.571429 468.0 ... 426.0 0.000000 426.0 0.000000 426.0 True 55.3 -54.5 False False
2016-08-12 12:28:38+02:00 14.5 434.0 1.142857 460.0 1.857143 457.0 2.714286 448.0 2.857143 441.0 ... 426.0 4.571429 420.0 8.000000 404.0 True 57.6 -49.5 False False

5 rows × 23 columns


In [3]:
homog = LDR.resample('1s').mean().fillna(-1).astype(int)
homog['median_filter'] = medfilt(homog.ldr, kernel_size=7)
homog.head(10)


Out[3]:
ldr median_filter
ts
2016-08-12 10:46:25+02:00 661 0.0
2016-08-12 10:46:26+02:00 -1 658.0
2016-08-12 10:46:27+02:00 660 658.0
2016-08-12 10:46:28+02:00 659 658.0
2016-08-12 10:46:29+02:00 658 657.0
2016-08-12 10:46:30+02:00 657 657.0
2016-08-12 10:46:31+02:00 657 657.0
2016-08-12 10:46:32+02:00 657 657.0
2016-08-12 10:46:33+02:00 657 657.0
2016-08-12 10:46:34+02:00 656 656.0

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [7]:
def plot_events_on_day(eventos, data_ldr_mf):
    delta = pd.Timedelta('2min')
    delta_big = pd.Timedelta('10min')
    ax, day_plot = None, None
    for t, row in eventos.iterrows():
        #print_secc(str(t))
        roundings = data_ldr_mf.loc[t-delta:t+delta].copy()
        roundings_big = data_ldr_mf.loc[t-delta_big:t+delta_big].copy()
        if not roundings.empty:
            if ax is None:
                day_plot = t.date()
            else:
                t = dt.datetime.combine(day_plot, t.time())
                new_index = pd.DatetimeIndex([dt.datetime.combine(day_plot, t) for t in roundings.index.time])
                new_index_big = pd.DatetimeIndex([dt.datetime.combine(day_plot, t) for t in roundings_big.index.time])
                roundings.index = new_index
                roundings_big.index = new_index_big
            if ax is None:
                ax = roundings['ldr'].plot(figsize=FS, lw=1, alpha=.8, color=tableau20[1])
            else:
                roundings['ldr'].plot(ax=ax, lw=1, alpha=.8, color=tableau20[1])
            ini = roundings['median_filter'].loc[:t]
            fin = roundings['median_filter'].loc[t:]
            if not ini.empty:
                ini.plot(ax=ax, lw=1.5, alpha=.8, color=tableau20[6])
            else:
                print_magenta('No hay INIT')
            if not fin.empty:
                fin.plot(ax=ax, lw=1.5, alpha=.8, color=tableau20[4])
            else:
                print_red('No hay FIN')
            #ax.vlines([t], 0, 800, lw=1, linestyle='--', alpha=.6)
            #roundings_big.ldr.plot(ax=ax, lw=.5, alpha=.5, color=tableau20[1])
    plt.show()



idx_comb_i = grupo.set_index(principales.index.names).sort_index().loc[principales.index[3], 'ts']
plot_events_on_day(eventos.loc[pd.DatetimeIndex(idx_comb_i.reset_index(drop=True), tz=TZ)], homog)



In [179]:
principales.index[3]


Out[179]:
(5, 0, 0, 7, 0)

In [181]:
homog['es_evento'] = False
homog.loc[eventos.index, 'es_evento'] = True
homog.head(10)


Out[181]:
ldr median_filter es_evento
ts
2016-08-12 10:46:25+02:00 661 0.0 False
2016-08-12 10:46:26+02:00 -1 658.0 False
2016-08-12 10:46:27+02:00 660 658.0 False
2016-08-12 10:46:28+02:00 659 658.0 False
2016-08-12 10:46:29+02:00 658 657.0 False
2016-08-12 10:46:30+02:00 657 657.0 False
2016-08-12 10:46:31+02:00 657 657.0 False
2016-08-12 10:46:32+02:00 657 657.0 False
2016-08-12 10:46:33+02:00 657 657.0 False
2016-08-12 10:46:34+02:00 656 656.0 False

In [6]:
import statsmodels.api as sm

data_stats = homog.loc[eventos.index[0]:eventos.index[-1]].iloc[1:-1]
data_stats
#sm.tsa.acf(data_stats.median_filter, nlags=60)


Out[6]:
(2855834, 2)

In [31]:
#def ewma_outlier(tsdf, stdlimit=5, span=15) :

# EWMA:
tsdf = data_stats.iloc[:10000].copy()
tsdf.loc[tsdf.ldr == -1, 'ldr'] = np.nan
stdlimit = 5
span = 7

ewm = tsdf['ldr'].ewm(adjust=True,ignore_na=True,min_periods=3,span=5)
tsdf['ldr_binpred'] = ewm.mean().shift(-1)
tsdf['ldr_binstd'] = ewm.std().shift(-1)
tsdf['ldr_stds'] = ((tsdf['ldr'] - tsdf['ldr_binpred']) / tsdf['ldr_binstd'])
tsdf['ldr_outlier'] = (tsdf['ldr_stds'].abs() > stdlimit)
tsdf.head(20)


Out[31]:
ldr median_filter ldr_binpred ldr_binstd ldr_stds ldr_outlier
ts
2016-08-12 11:42:40+02:00 616.0 616.0 NaN NaN NaN False
2016-08-12 11:42:41+02:00 611.0 611.0 611.578947 2.924488 -0.197965 False
2016-08-12 11:42:42+02:00 610.0 611.0 611.338462 2.148816 -0.622883 False
2016-08-12 11:42:43+02:00 611.0 610.0 610.440758 2.109213 0.265142 False
2016-08-12 11:42:44+02:00 609.0 609.0 608.087218 3.920855 0.232802 False
2016-08-12 11:42:45+02:00 604.0 604.0 602.745508 8.770131 0.143041 False
2016-08-12 11:42:46+02:00 593.0 593.0 593.121491 16.498555 -0.007364 False
2016-08-12 11:42:47+02:00 575.0 575.0 581.443900 22.579449 -0.285388 False
2016-08-12 11:42:48+02:00 559.0 559.0 570.777630 24.795965 -0.474982 False
2016-08-12 11:42:49+02:00 550.0 550.0 562.759051 23.773139 -0.536700 False
2016-08-12 11:42:50+02:00 547.0 547.0 554.441931 23.370380 -0.318434 False
2016-08-12 11:42:51+02:00 538.0 538.0 544.577267 24.591285 -0.267463 False
2016-08-12 11:42:52+02:00 525.0 525.0 534.349811 25.743339 -0.363193 False
2016-08-12 11:42:53+02:00 514.0 514.0 523.875955 26.733528 -0.369422 False
2016-08-12 11:42:54+02:00 503.0 503.0 510.897545 29.939369 -0.263785 False
2016-08-12 11:42:55+02:00 485.0 485.0 495.249147 34.766092 -0.294803 False
2016-08-12 11:42:56+02:00 464.0 464.0 480.155885 37.074373 -0.435770 False
2016-08-12 11:42:57+02:00 450.0 450.0 466.097582 37.548582 -0.428713 False
2016-08-12 11:42:58+02:00 438.0 438.0 452.060833 37.843708 -0.371550 False
2016-08-12 11:42:59+02:00 424.0 424.0 438.704545 37.423352 -0.392924 False

In [21]:
tsdf.plot(figsize=FS)


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x106b78550>

In [32]:
tsdf.between_time('12:11', '12:21')[['ldr', 'ldr_binpred', 'median_filter']].plot(figsize=FS)


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1032d0208>

In [33]:
tsdf.between_time('12:11', '12:21')[['ldr', 'ldr_binpred']].plot(figsize=FS)


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1193f0c50>

In [45]:
np.all(np.isnan(np.abs(np.fft.rfft(tsdf.ldr, n=1000))))


Out[45]:
True

In [48]:
ldr_min = LDR.ldr.resample('1s').mean().fillna(-1).resample('5min').median()
ldr_min.head()


Out[48]:
ts
2016-08-12 10:45:00+02:00    656.0
2016-08-12 10:50:00+02:00    655.0
2016-08-12 10:55:00+02:00    654.0
2016-08-12 11:00:00+02:00    653.0
2016-08-12 11:05:00+02:00    651.0
Freq: 5T, Name: ldr, dtype: float64

In [9]:
ldr_min = LDR.ldr.resample('1s').mean().resample('5min').apply(lambda x: np.nanmedian(x) if np.any(np.isfinite(x)) else np.nan)
ldr_min.head()


Out[9]:
ts
2016-08-12 10:45:00+02:00    656.0
2016-08-12 10:50:00+02:00    656.0
2016-08-12 10:55:00+02:00    654.0
2016-08-12 11:00:00+02:00    653.0
2016-08-12 11:05:00+02:00    651.0
Freq: 5T, Name: ldr, dtype: float64

In [11]:
ldr_min.groupby(lambda x: x.time).std().plot()


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x106c348d0>

In [59]:
gb_time = ldr_min.groupby(lambda x: x.time)
def _f_perc(y):
    def __perc(x):
        return np.percentile(x[np.isfinite(x)], y) if np.isfinite(x).sum() > 5 else np.nan
    __perc.__name__ = 'perc_{}'.format(y)
    return __perc

#f1 = lambda x: _f_perc(x, 90)
##f1.__name__ = 'perc_90'
#f2 = lambda x: _f_perc(x, 10)
#f2.__name__ = 'perc_10'
typical_day = gb_time.agg(['min', 'max', 'mean', 'median', 'std', 
                           f1, f2])
typical_day.plot(figsize=FS)


Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x12398a160>

In [56]:
plt.fill_between?
#pd.DataFrame(ldr_min).apply(lambda x: x - typical_day.loc[x.index.time, 'median'].values).iloc[:1000].plot()

In [1]:
from enerpi.sunposition import sun_position
sun_position(ldr_min)
#ldr_min


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-2ee6f2bab002> in <module>()
      1 from enerpi.sunposition import sun_position
----> 2 sun_position(ldr_min)
      3 #ldr_min

NameError: name 'ldr_min' is not defined

In [63]:
import hdbscan


ldr_min


Out[63]:
ts
2016-08-12 10:45:00+02:00    656.0
2016-08-12 10:50:00+02:00    656.0
2016-08-12 10:55:00+02:00    654.0
2016-08-12 11:00:00+02:00    653.0
2016-08-12 11:05:00+02:00    651.0
2016-08-12 11:10:00+02:00    649.0
2016-08-12 11:15:00+02:00    647.0
2016-08-12 11:20:00+02:00    645.0
2016-08-12 11:25:00+02:00    645.0
2016-08-12 11:30:00+02:00    641.0
2016-08-12 11:35:00+02:00    634.0
2016-08-12 11:40:00+02:00    635.0
2016-08-12 11:45:00+02:00    632.0
2016-08-12 11:50:00+02:00    627.0
2016-08-12 11:55:00+02:00    630.0
2016-08-12 12:00:00+02:00    426.0
2016-08-12 12:05:00+02:00    426.0
2016-08-12 12:10:00+02:00    302.0
2016-08-12 12:15:00+02:00    467.0
2016-08-12 12:20:00+02:00    342.0
2016-08-12 12:25:00+02:00    450.0
2016-08-12 12:30:00+02:00    458.0
2016-08-12 12:35:00+02:00    421.0
2016-08-12 12:40:00+02:00    415.0
2016-08-12 12:45:00+02:00    552.0
2016-08-12 12:50:00+02:00    425.0
2016-08-12 12:55:00+02:00    417.0
2016-08-12 13:00:00+02:00    448.0
2016-08-12 13:05:00+02:00    532.0
2016-08-12 13:10:00+02:00    501.0
                             ...  
2016-09-14 10:50:00+02:00    578.0
2016-09-14 10:55:00+02:00    577.0
2016-09-14 11:00:00+02:00    570.0
2016-09-14 11:05:00+02:00    566.0
2016-09-14 11:10:00+02:00    562.0
2016-09-14 11:15:00+02:00    559.0
2016-09-14 11:20:00+02:00    556.0
2016-09-14 11:25:00+02:00    557.0
2016-09-14 11:30:00+02:00    557.0
2016-09-14 11:35:00+02:00    550.0
2016-09-14 11:40:00+02:00    555.0
2016-09-14 11:45:00+02:00    550.0
2016-09-14 11:50:00+02:00    540.0
2016-09-14 11:55:00+02:00    537.0
2016-09-14 12:00:00+02:00    536.0
2016-09-14 12:05:00+02:00    527.0
2016-09-14 12:10:00+02:00    524.0
2016-09-14 12:15:00+02:00    510.0
2016-09-14 12:20:00+02:00    383.0
2016-09-14 12:25:00+02:00    507.0
2016-09-14 12:30:00+02:00    505.0
2016-09-14 12:35:00+02:00    413.5
2016-09-14 12:40:00+02:00    397.0
2016-09-14 12:45:00+02:00    457.5
2016-09-14 12:50:00+02:00    487.0
2016-09-14 12:55:00+02:00    434.0
2016-09-14 13:00:00+02:00    474.0
2016-09-14 13:05:00+02:00    464.0
2016-09-14 13:10:00+02:00    456.0
2016-09-14 13:15:00+02:00    426.0
Freq: 5T, Name: ldr, dtype: float64

In [77]:
def _step(x):
    __name__ = 'step'
    return x[-1] - x[0]

features = homog.median_filter.iloc[:1000000].resample('1min').agg(['min', 'max', 'median', 'std', _step])
print_cyan(features.head())
f_sol = sun_position(features.index)
features = f_sol.join(features)
features['minuteofday'] = features.index.hour * 60 + features.index.minute
features['dayofyear'] = features.index.dayofyear
features.tail()


                             min    max  median         std  _step
ts                                                                
2016-08-12 10:46:00+02:00   -1.0  658.0   656.0  233.318562   -1.0
2016-08-12 10:47:00+02:00   -1.0  658.0   328.0  331.944664  658.0
2016-08-12 10:48:00+02:00  656.0  657.0   656.0    0.129099   -1.0
2016-08-12 10:49:00+02:00  653.0  656.0   654.0    0.911167   -2.0
2016-08-12 10:50:00+02:00  651.0  654.0   654.0    0.936395   -1.0
Out[77]:
altitude azimuth min max median std _step minuteofday dayofyear
ts
2016-08-24 00:29:00+02:00 -35.428864 150.403626 27.0 32.0 31.0 1.344060 0.0 29 237
2016-08-24 00:30:00+02:00 -35.525142 150.680822 26.0 31.0 31.0 1.477133 0.0 30 237
2016-08-24 00:31:00+02:00 -35.620594 150.958783 26.0 31.0 31.0 1.536284 0.0 31 237
2016-08-24 00:32:00+02:00 -35.715216 151.237510 25.0 31.0 31.0 1.580245 0.0 32 237
2016-08-24 00:33:00+02:00 -35.809005 151.516974 31.0 31.0 31.0 0.000000 0.0 33 237

In [79]:
model = hdbscan.HDBSCAN().fit(features)
model


Out[79]:
HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, core_dist_n_jobs=4, gen_min_span_tree=False,
    leaf_size=40, memory=Memory(cachedir=None), metric='euclidean',
    min_cluster_size=5, min_samples=None, p=None)

In [81]:
model.condensed_tree_.plot()


Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x119e06f98>

In [83]:
sns.distplot(model.outlier_scores_[np.isfinite(model.outlier_scores_)])


/Users/uge/anaconda/envs/py35/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[83]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a5ab8d0>

In [84]:
model.outlier_scores_[np.isfinite(model.outlier_scores_)]


Out[84]:
array([ 0.3093807 ,  0.22379943,  0.03867117, ...,  0.01148965,
        0.01768995,  0.08133789])

In [94]:
plt.scatter(features.T.ix[0], features.T.ix[3], c='b', lw=0, s=5, alpha=.1)
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)



In [103]:
from sklearn.manifold import TSNE
X = features.drop(['min', 'max'], axis=1).values
tsne = TSNE(n_components=2, random_state=0)
#np.set_printoptions(suppress=True)
tsne_reduced = tsne.fit_transform(X)
tsne_reduced[:10]


Out[103]:
array([[  1.30982160e+08,   2.50803557e+07],
       [  2.94328709e+07,   4.92886635e+07],
       [  1.03547426e+07,  -7.74192798e+04],
       [  2.93293363e+07,  -1.78001124e+07],
       [  6.96536245e+05,   3.84192842e+06],
       [  7.11628434e+06,  -6.22430751e+05],
       [  7.46185531e+06,   5.28138109e+06],
       [  6.55247459e+07,   2.23949017e+07],
       [  2.65212748e+08,   7.67494978e+06],
       [  1.45916860e+08,   3.76250872e+07]])

In [124]:
#frame = plt.gca()
#frame.axes.get_xaxis().set_visible(False)
#frame.axes.get_yaxis().set_visible(False)

In [ ]:


In [101]:
features.corr()


Out[101]:
altitude azimuth min max median std _step minuteofday dayofyear
altitude 1.000000 -0.012503 0.850954 0.850405 0.851051 0.022836 0.001321 0.386252 -0.072215
azimuth -0.012503 1.000000 0.035843 0.045389 0.040668 0.055951 -0.017556 0.540687 -0.034865
min 0.850954 0.035843 1.000000 0.987248 0.994077 -0.042512 -0.000487 0.315131 -0.010941
max 0.850405 0.045389 0.987248 1.000000 0.994085 0.112119 -0.001233 0.323431 -0.014805
median 0.851051 0.040668 0.994077 0.994085 1.000000 0.035016 -0.005029 0.318545 -0.012935
std 0.022836 0.055951 -0.042512 0.112119 0.035016 1.000000 -0.045698 0.056644 -0.028044
_step 0.001321 -0.017556 -0.000487 -0.001233 -0.005029 -0.045698 1.000000 -0.015437 -0.000361
minuteofday 0.386252 0.540687 0.315131 0.323431 0.318545 0.056644 -0.015437 1.000000 -0.069810
dayofyear -0.072215 -0.034865 -0.010941 -0.014805 -0.012935 -0.028044 -0.000361 -0.069810 1.000000

In [ ]:


In [212]:
features.head()


Out[212]:
altitude azimuth min max median std _step minuteofday dayofyear
ts
2016-08-12 10:46:00+02:00 39.711430 -76.190926 -1.0 658.0 656.0 233.318562 -1.0 646 225
2016-08-12 10:47:00+02:00 39.900770 -75.995521 -1.0 658.0 328.0 331.944664 658.0 647 225
2016-08-12 10:48:00+02:00 40.089949 -75.799303 656.0 657.0 656.0 0.129099 -1.0 648 225
2016-08-12 10:49:00+02:00 40.278964 -75.602273 653.0 656.0 654.0 0.911167 -2.0 649 225
2016-08-12 10:50:00+02:00 40.467812 -75.404416 651.0 654.0 654.0 0.936395 -1.0 650 225

In [168]:
from sklearn.preprocessing import StandardScaler

features_valid = features.drop(['min', 'max', 'minuteofday', 'dayofyear'], axis=1).drop(features.index[features['min'] < 0])
features_valid.loc[features_valid.altitude < -5, 'altitude'] = -5
print_red(features_valid.describe())

X = features_valid.values
X_std = StandardScaler().fit_transform(features_valid)
features_std = pd.DataFrame(X_std, columns=features_valid.columns, index=features_valid.index)
features_std.describe()


           altitude       azimuth        median           std         _step   minuteofday     dayofyear
count  16256.000000  16256.000000  16256.000000  16256.000000  16256.000000  16256.000000  16256.000000
mean      19.464209      0.386808    274.202664      2.987711     -0.064345    724.815637    230.763164
std       25.751859    106.938261    240.523119     13.126513     31.662052    411.117055      3.365742
min       -5.000000   -179.981709     27.000000      0.000000   -563.000000      0.000000    225.000000
25%       -5.000000    -98.707888     35.000000      0.950022     -1.000000    370.000000    228.000000
50%       10.304846      2.192448    229.000000      1.351291      0.000000    732.000000    231.000000
75%       44.189494     98.213242    519.000000      1.799168      1.000000   1076.250000    234.000000
max       66.121340    179.976951    730.000000    277.401361    553.000000   1439.000000    237.000000
Out[168]:
altitude azimuth median std _step minuteofday dayofyear
count 1.625600e+04 1.625600e+04 1.625600e+04 1.625600e+04 1.625600e+04 1.625600e+04 1.625600e+04
mean 3.781834e-16 -8.590296e-17 -6.146658e-16 3.332855e-18 -6.204427e-17 -7.477068e-17 1.363994e-13
std 1.000031e+00 1.000031e+00 1.000031e+00 1.000031e+00 1.000031e+00 1.000031e+00 1.000031e+00
min -9.500270e-01 -1.686712e+00 -1.027803e+00 -2.276159e-01 -1.778005e+01 -1.763094e+00 -1.712354e+00
25% -9.500270e-01 -9.266818e-01 -9.945407e-01 -1.552394e-01 -2.955220e-02 -8.630791e-01 -8.209927e-01
50% -3.556887e-01 1.688540e-02 -1.879406e-01 -1.246691e-01 2.032321e-03 1.747576e-02 7.036871e-02
75% 9.601654e-01 9.148217e-01 1.017802e+00 -9.054801e-02 3.361684e-02 8.548543e-01 9.617301e-01
max 1.811852e+00 1.679433e+00 1.895083e+00 2.090594e+01 1.746827e+01 1.737234e+00 1.853091e+00

In [125]:
tsne = TSNE(n_components=2, random_state=0)
tsne_reduced = tsne.fit_transform(X_std)
print_red(tsne_reduced[:10])

plt.scatter(tsne_reduced[:,0], tsne_reduced[:,1], c=tableau20[4], lw=0, s=5, alpha=.4)


[[  7.79396690e+08   5.93574991e+08]
 [  2.60908618e+09   8.67096119e+08]
 [  2.00054623e+08   1.40657944e+08]
 [ -4.44121644e+07   8.77604124e+07]
 [ -1.02722929e+08   2.69859072e+08]
 [ -5.01517877e+07   1.64084576e+07]
 [  3.24349944e+08   5.91601679e+07]
 [  1.46952871e+08   2.02462449e+07]
 [ -4.41277019e+07   1.76492424e+07]
 [  1.67422645e+09   3.90217070e+07]]
Out[125]:
<matplotlib.collections.PathCollection at 0x1503bf3c8>

In [208]:
from sklearn.decomposition import PCA, KernelPCA, MiniBatchSparsePCA, IncrementalPCA
from itertools import cycle


def _plot_scatter(x, labels=None):
    cmap_ctableau = cycle(tableau20[::2])
    f, axes = plt.subplots(1, 2, figsize=(12.5, 6))
    if labels is None:
        axes[0].scatter(x[:,0], x[:,1], c=tableau20[4], lw=0, s=2, alpha=.8)
        axes[1].scatter(x[:,0], x[:,1], c=tableau20[4], lw=0, s=10, alpha=.05)
    else:
        for k, col in zip(sorted(set(labels)), cmap_ctableau):
            mask = labels == k
            axes[0].scatter(x[mask,0], x[mask,1], c=col, lw=0, s=2, alpha=.8)
            axes[1].scatter(x[mask,0], x[mask,1], c=col, lw=0, s=10, alpha=.05)
    return axes


X_pca = PCA(n_components=2).fit_transform(X)
_plot_scatter(X_pca)

X_pca_std = PCA(n_components=2).fit_transform(X_std)
_plot_scatter(X_pca_std)


Out[208]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x159bcdcc0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x159fd8400>], dtype=object)

In [138]:
_plot_scatter(tsne_reduced)


Out[138]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x153b89588>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1557a6ac8>], dtype=object)

In [139]:
X_kpca = KernelPCA(n_components=2).fit_transform(X)
_plot_scatter(X_kpca)

X_kpca_std = KernelPCA(n_components=2).fit_transform(X_std)
_plot_scatter(X_kpca_std)


Out[139]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x15252c048>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1528bb240>], dtype=object)

In [149]:
from hdbscan import HDBSCAN

clusterer_hdbscan = HDBSCAN(min_cluster_size=60, min_samples=20).fit(X_std)
print_red(len(set(clusterer_hdbscan.labels_)))
clusterer_hdbscan


75
Out[149]:
HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, core_dist_n_jobs=4, gen_min_span_tree=False,
    leaf_size=40, memory=Memory(cachedir=None), metric='euclidean',
    min_cluster_size=60, min_samples=20, p=None)

In [150]:
_plot_scatter(X_kpca_std, clusterer_hdbscan.labels_)


Out[150]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x1546faf98>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1547794e0>], dtype=object)

In [210]:
for x in [X, X_kpca_std, X_pca, tsne_reduced]:
    print_info(x.shape)
    clusterer_hdbscan = HDBSCAN(min_cluster_size=10, min_samples=100).fit(x)
    print_red(len(set(clusterer_hdbscan.labels_)))
    _plot_scatter(x, clusterer_hdbscan.labels_)
    _plot_scatter(X_pca, clusterer_hdbscan.labels_)
    plt.show()


(16256, 7)
8
(16256, 2)
12
(16256, 2)
9
(16256, 2)
4

In [159]:
features_std.corr()


Out[159]:
altitude azimuth median std _step minuteofday dayofyear
altitude 1.000000 0.006053 0.867180 0.012862 0.000229 0.290303 -0.073101
azimuth 0.006053 1.000000 0.067808 0.068802 -0.021213 0.539340 -0.023976
median 0.867180 0.067808 1.000000 0.030743 0.003553 0.354406 -0.025322
std 0.012862 0.068802 0.030743 1.000000 -0.104736 0.062399 -0.025908
_step 0.000229 -0.021213 0.003553 -0.104736 1.000000 -0.019271 0.003178
minuteofday 0.290303 0.539340 0.354406 0.062399 -0.019271 1.000000 -0.055530
dayofyear -0.073101 -0.023976 -0.025322 -0.025908 0.003178 -0.055530 1.000000

In [211]:
clusterer_hdbscan.labels_


Out[211]:
array([-1, -1, -1, ..., -1, -1, -1])

In [204]:
from sklearn.linear_model import LinearRegression

data_ls = features_valid.copy() #.drop(['_step'], axis=1)
y_ls = data_ls['median'].values
X_ls = data_ls.drop(['median'], axis=1).values

LR = LinearRegression(n_jobs=-1)
LR.fit(X_ls, y_ls)
LR.score(X_ls, y_ls)


Out[204]:
0.76552867912601141

In [ ]:


In [205]:
ax = plt.plot(y_ls[:4000])
plt.plot(LR.predict(X_ls)[:4000])


Out[205]:
[<matplotlib.lines.Line2D at 0x152565f28>]

In [213]:
features_valid.head()


Out[213]:
altitude azimuth median std _step minuteofday dayofyear
ts
2016-08-12 10:48:00+02:00 40.089949 -75.799303 656.0 0.129099 -1.0 648 225
2016-08-12 10:49:00+02:00 40.278964 -75.602273 654.0 0.911167 -2.0 649 225
2016-08-12 10:50:00+02:00 40.467812 -75.404416 654.0 0.936395 -1.0 650 225
2016-08-12 10:51:00+02:00 40.656493 -75.205719 656.0 0.945402 3.0 651 225
2016-08-12 10:52:00+02:00 40.844999 -75.006168 656.0 0.342803 0.0 652 225

In [ ]: