In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
from IPython.display import Image, HTML
import random
import math
import pysolar
import datetime as dt
import pytz
import matplotlib.dates as mpd
import scipy
from scipy import signal
from scipy.signal import medfilt
import numexpr as ne
from datacharm import *
from datacharm.dataplots import tableau20, ch_color
from enerpi.base import timeit
from enerpi.api import enerpi_data_catalog
from enerpi.enerplot import plot_tile_last_24h, plot_power_consumption_hourly
from enerpi.process import separa_ldr_artificial_natural, get_solar_day
from enerpi.radsolar import ajuste_ldr_con_barrido_irrad_inc, local_maxima_minima
LAT = 38.631463
LONG = -0.866402
ELEV_M = 500
TZ = pytz.timezone('Europe/Madrid')
FS = (16, 10)
def _tslim(ax, h0=12, hf=22, m0=0, mf=0):
    xlim = [mpd.num2date(x, tz=TZ) for x in ax.get_xlim()]
    new = [mpd.date2num(d.replace(hour=h, minute=m)) for d, h, m in zip(xlim, [h0, hf], [m0, mf])]
    ax.set_xlim(new)
    return ax
def set_sun_times(df, tz=TZ, lat_deg=LAT, long_deg=LONG, offset='5min'):
    if df.index[-1].date() == df.index[0].date():
        # Same day:
        ts_day = pd.Timestamp(df.index[0].date(), tz=tz)
        sunrise, sunset = pysolar.util.get_sunrise_sunset(lat_deg, long_deg, ts_day + pd.Timedelta('12h'))
        delta = pd.Timedelta(offset)
        sunrise, sunset = sunrise - delta, sunset + delta
        df['sun'] = False
        df.loc[sunrise:sunset, 'sun'] = True
        return df
    else:
        raise NotImplementedError('Multi-day spans are not handled yet')
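# Sketch (hypothetical helper): the multi-day case is not implemented above;
# a minimal extension would delegate each calendar day to the single-day logic:
def set_sun_times_multiday(df, **kwargs):
    # Split on calendar date and concatenate the per-day results.
    return pd.concat([set_sun_times(g, **kwargs) for _, g in df.groupby(df.index.date)])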
sns.set_style('whitegrid')
# Catalog and load all the data.
cat = enerpi_data_catalog()
data, data_s = cat.get_all_data()
print_info(data_s.tail())
LDR = pd.DataFrame(data.ldr).tz_localize(TZ)
print_cyan(LDR.describe().T.astype(int))
LDR.hist(bins=(LDR.max() - LDR.min() + 1).values[0] // 5, log=True, figsize=(18, 8))
Image('/Users/uge/Desktop/LDR_analysis/LDR_análisis_día_2016_09_12.png')
Out[2]:
In [14]:
pois = pd.read_csv('/Users/uge/Desktop/LDR_analysis/POIS_labeled.h5_.csv').set_index('ts_fin')
labeled = pois[pois.solo_natural.notnull()].copy()
labeled.solo_natural = labeled.solo_natural.astype(bool)
labeled.artificial_princ = labeled.artificial_princ.astype(bool)
labeled.head()
Out[14]:
In [21]:
#sns.regplot(['altitud', 'step'], 'solo_natural', data=labeled, logistic=True)
labeled.groupby('solo_natural').mean()
Out[21]:
In [24]:
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
labeled.columns
Out[24]:
In [72]:
y_art_p, X_art_p = dmatrices('''artificial_princ ~ step + pendiente_i + pendiente_f + altitud + ldr_median + inicio + fin +
C(solo_natural) + C(fin_sun)''',
labeled, return_type="dataframe")
# flatten y into a 1-D array
#y = np.ravel(y)
print(list(X_art_p.columns), y_art_p.shape, X_art_p.shape)
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X_art_p, y_art_p.iloc[:,1])
# check the accuracy on the training set
print_ok(model.score(X_art_p, y_art_p.iloc[:,0].values))
print_ok(model.score(X_art_p, y_art_p.iloc[:,1].values))
y_art_p.mean()
Out[72]:
In [88]:
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_art_p, y_art_p.iloc[:,1], test_size=0.15, random_state=0)
model2 = LogisticRegression()
print_red(model2.fit(X_train, y_train))
# predict class labels for the test set
predicted = model2.predict(X_test)
print (predicted.astype(bool))
# generate class probabilities
probs = model2.predict_proba(X_test)
print((probs[:,1] * 100).astype(int))
# generate evaluation metrics
print_ok(metrics.accuracy_score(y_test, predicted))
print_ok(metrics.roc_auc_score(y_test, probs[:, 1]))
print(metrics.confusion_matrix(y_test, predicted))
print_info(metrics.classification_report(y_test, predicted))
# evaluate the model using 10-fold cross-validation
scores = cross_val_score(LogisticRegression(), X_art_p, y_art_p.iloc[:,1], scoring='accuracy', cv=10)
print_info(scores)
print_ok(scores.mean())
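As a visual complement to the scalar scores, the ROC curve for the held-out split can be plotted (a sketch, reusing the probs and y_test computed above):
In [ ]:
# Sketch: ROC curve on the test split, with the chance diagonal for reference.
fpr, tpr, _ = metrics.roc_curve(y_test, probs[:, 1])
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, lw=2, label='AUC = {:.3f}'.format(metrics.roc_auc_score(y_test, probs[:, 1])))
plt.plot([0, 1], [0, 1], '--', lw=1)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')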
In [59]:
pd.DataFrame(list(zip(X_art_p.columns, np.transpose(model.coef_))))
Out[59]:
In [65]:
pd.DataFrame(list(zip(X_art_p.columns, model.coef_.T[:,0])))
#model.coef_.T
Out[65]:
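The raw coefficients are easier to read as odds ratios (a sketch: exponentiate the coefficients of the model fitted above):
In [ ]:
# Sketch: odds ratio per unit increase of each predictor.
pd.DataFrame({'coef': model.coef_.ravel(),
              'odds_ratio': np.exp(model.coef_).ravel()}, index=X_art_p.columns)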
In [5]:
# MULTI clustering - EVENTS
eventos = pd.read_hdf('../enerpi/eventos_multi_clustering.h5', 'eventos').iloc[1:-1]
clusters = pd.read_hdf('../enerpi/eventos_multi_clustering.h5', 'clustering')
print('eventos.shape, clusters.shape: ', eventos.shape, clusters.shape)
grupo = clusters.groupby('super_cluster').get_group('cluster_subidas_7').drop('super_cluster', axis=1).reset_index()
grupo.set_index(['k_km', 'k_pca', 'k_dbscan', 'k_afp', 'k_ag']).sort_index().head(10)
Out[5]:
In [6]:
num_cluster_combined = grupo.groupby(['k_km', 'k_pca', 'k_dbscan', 'k_afp', 'k_ag']).ts.count().sort_values(ascending=False)
identif_acum = num_cluster_combined.cumsum()
principales = num_cluster_combined[identif_acum < len(grupo) // 10 * 8]  # combinations covering ~80% of the events
print_cyan(principales)
ax = identif_acum.plot()
ax.hlines([len(grupo) // 10 * 8], 0, 100)
idx_comb_i = grupo.set_index(principales.index.names).sort_index().loc[principales.index[0], 'ts']
eventos.loc[pd.DatetimeIndex(idx_comb_i.reset_index(drop=True), tz=TZ)].head()
Out[6]:
In [3]:
homog = LDR.resample('1s').mean().fillna(-1).astype(int)
homog['median_filter'] = medfilt(homog.ldr, kernel_size=7)
homog.head(10)
Out[3]:
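To eyeball what the length-7 median filter removes, the raw and filtered signals can be overlaid on a short window (a sketch on the first ten minutes):
In [ ]:
# Sketch: raw 1 Hz LDR signal vs. its median-filtered version, first 10 minutes.
homog[['ldr', 'median_filter']].iloc[:600].plot(figsize=FS, lw=1, alpha=.8)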
In [7]:
def plot_events_on_day(eventos, data_ldr_mf):
    # Overlay the LDR signal around each event, remapping every event onto the day of the first one.
    delta = pd.Timedelta('2min')
    delta_big = pd.Timedelta('10min')
    ax, day_plot = None, None
    for t, row in eventos.iterrows():
        #print_secc(str(t))
        roundings = data_ldr_mf.loc[t - delta:t + delta].copy()
        roundings_big = data_ldr_mf.loc[t - delta_big:t + delta_big].copy()
        if not roundings.empty:
            if ax is None:
                day_plot = t.date()
            t = dt.datetime.combine(day_plot, t.time())
            new_index = pd.DatetimeIndex([dt.datetime.combine(day_plot, tt) for tt in roundings.index.time])
            new_index_big = pd.DatetimeIndex([dt.datetime.combine(day_plot, tt) for tt in roundings_big.index.time])
            roundings.index = new_index
            roundings_big.index = new_index_big
            if ax is None:
                ax = roundings['ldr'].plot(figsize=FS, lw=1, alpha=.8, color=tableau20[1])
            else:
                roundings['ldr'].plot(ax=ax, lw=1, alpha=.8, color=tableau20[1])
            ini = roundings['median_filter'].loc[:t]
            fin = roundings['median_filter'].loc[t:]
            if not ini.empty:
                ini.plot(ax=ax, lw=1.5, alpha=.8, color=tableau20[6])
            else:
                print_magenta('No INIT segment')
            if not fin.empty:
                fin.plot(ax=ax, lw=1.5, alpha=.8, color=tableau20[4])
            else:
                print_red('No FIN segment')
            #ax.vlines([t], 0, 800, lw=1, linestyle='--', alpha=.6)
            #roundings_big.ldr.plot(ax=ax, lw=.5, alpha=.5, color=tableau20[1])
    plt.show()
idx_comb_i = grupo.set_index(principales.index.names).sort_index().loc[principales.index[3], 'ts']
plot_events_on_day(eventos.loc[pd.DatetimeIndex(idx_comb_i.reset_index(drop=True), tz=TZ)], homog)
In [179]:
principales.index[3]
Out[179]:
In [181]:
homog['es_evento'] = False
homog.loc[eventos.index, 'es_evento'] = True
homog.head(10)
Out[181]:
In [6]:
import statsmodels.api as sm
data_stats = homog.loc[eventos.index[0]:eventos.index[-1]].iloc[1:-1]
data_stats
#sm.tsa.acf(data_stats.median_filter, nlags=60)
Out[6]:
In [31]:
#def ewma_outlier(tsdf, stdlimit=5, span=15) :
# EWMA:
tsdf = data_stats.iloc[:10000].copy()
tsdf.loc[tsdf.ldr == -1, 'ldr'] = np.nan
stdlimit = 5
span = 7
ewm = tsdf['ldr'].ewm(adjust=True, ignore_na=True, min_periods=3, span=span)
tsdf['ldr_binpred'] = ewm.mean().shift(-1)
tsdf['ldr_binstd'] = ewm.std().shift(-1)
tsdf['ldr_stds'] = ((tsdf['ldr'] - tsdf['ldr_binpred']) / tsdf['ldr_binstd'])
tsdf['ldr_outlier'] = (tsdf['ldr_stds'].abs() > stdlimit)
tsdf.head(20)
Out[31]:
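The commented-out signature at the top of the cell suggests packaging this as a reusable helper; a minimal sketch with the same steps:
In [ ]:
def ewma_outlier(tsdf, column='ldr', stdlimit=5, span=15):
    # Sketch: EWMA prediction band; flag points more than stdlimit stds away.
    out = tsdf.copy()
    ewm = out[column].ewm(adjust=True, ignore_na=True, min_periods=3, span=span)
    out[column + '_binpred'] = ewm.mean().shift(-1)
    out[column + '_binstd'] = ewm.std().shift(-1)
    out[column + '_stds'] = (out[column] - out[column + '_binpred']) / out[column + '_binstd']
    out[column + '_outlier'] = out[column + '_stds'].abs() > stdlimit
    return out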
In [21]:
tsdf.plot(figsize=FS)
Out[21]:
In [32]:
tsdf.between_time('12:11', '12:21')[['ldr', 'ldr_binpred', 'median_filter']].plot(figsize=FS)
Out[32]:
In [33]:
tsdf.between_time('12:11', '12:21')[['ldr', 'ldr_binpred']].plot(figsize=FS)
Out[33]:
In [45]:
np.all(np.isnan(np.abs(np.fft.rfft(tsdf.ldr, n=1000))))
Out[45]:
In [48]:
ldr_min = LDR.ldr.resample('1s').mean().fillna(-1).resample('5min').median()
ldr_min.head()
Out[48]:
In [9]:
ldr_min = LDR.ldr.resample('1s').mean().resample('5min').apply(lambda x: np.nanmedian(x) if np.any(np.isfinite(x)) else np.nan)
ldr_min.head()
Out[9]:
In [11]:
ldr_min.groupby(lambda x: x.time).std().plot()
Out[11]:
In [59]:
gb_time = ldr_min.groupby(lambda x: x.time)
def _f_perc(y):
    # Named percentile aggregator (agg needs a distinct __name__ per function).
    def __perc(x):
        return np.percentile(x[np.isfinite(x)], y) if np.isfinite(x).sum() > 5 else np.nan
    __perc.__name__ = 'perc_{}'.format(y)
    return __perc
typical_day = gb_time.agg(['min', 'max', 'mean', 'median', 'std',
                           _f_perc(90), _f_perc(10)])
typical_day.plot(figsize=FS)
Out[59]:
In [56]:
plt.fill_between?
#pd.DataFrame(ldr_min).apply(lambda x: x - typical_day.loc[x.index.time, 'median'].values).iloc[:1000].plot()
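A natural use of fill_between here is shading the perc_10..perc_90 band of the typical day around its median (a sketch, assuming typical_day from above):
In [ ]:
# Sketch: typical-day median with the 10th-90th percentile band shaded.
x = np.arange(len(typical_day))
plt.figure(figsize=FS)
plt.fill_between(x, typical_day['perc_10'].values, typical_day['perc_90'].values, alpha=.3)
plt.plot(x, typical_day['median'].values, lw=1.5)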
In [1]:
from enerpi.sunposition import sun_position
sun_position(ldr_min)
#ldr_min
In [63]:
import hdbscan
ldr_min
Out[63]:
In [77]:
def _step(x):
    # Net change over the resampling window: last sample minus first.
    return x[-1] - x[0]
_step.__name__ = 'step'  # so the aggregated column is named 'step'
features = homog.median_filter.iloc[:1000000].resample('1min').agg(['min', 'max', 'median', 'std', _step])
print_cyan(features.head())
f_sol = sun_position(features.index)
features = f_sol.join(features)
features['minuteofday'] = features.index.hour * 60 + features.index.minute
features['dayofyear'] = features.index.dayofyear
features.tail()
Out[77]:
In [79]:
model = hdbscan.HDBSCAN().fit(features)
model
Out[79]:
In [81]:
model.condensed_tree_.plot()
Out[81]:
In [83]:
sns.distplot(model.outlier_scores_[np.isfinite(model.outlier_scores_)])
Out[83]:
In [84]:
model.outlier_scores_[np.isfinite(model.outlier_scores_)]
Out[84]:
In [94]:
plt.scatter(features.iloc[:, 0], features.iloc[:, 3], c='b', lw=0, s=5, alpha=.1)
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)
In [103]:
from sklearn.manifold import TSNE
X = features.drop(['min', 'max'], axis=1).values
tsne = TSNE(n_components=2, random_state=0)
#np.set_printoptions(suppress=True)
tsne_reduced = tsne.fit_transform(X)
tsne_reduced[:10]
Out[103]:
In [101]:
features.corr()
Out[101]:
In [212]:
features.head()
Out[212]:
In [168]:
from sklearn.preprocessing import StandardScaler
features_valid = features.drop(['min', 'max', 'minuteofday', 'dayofyear'], axis=1).drop(features.index[features['min'] < 0])
features_valid.loc[features_valid.altitude < -5, 'altitude'] = -5
print_red(features_valid.describe())
X = features_valid.values
X_std = StandardScaler().fit_transform(features_valid)
features_std = pd.DataFrame(X_std, columns=features_valid.columns, index=features_valid.index)
features_std.describe()
Out[168]:
In [125]:
tsne = TSNE(n_components=2, random_state=0)
tsne_reduced = tsne.fit_transform(X_std)
print_red(tsne_reduced[:10])
plt.scatter(tsne_reduced[:,0], tsne_reduced[:,1], c=tableau20[4], lw=0, s=5, alpha=.4)
Out[125]:
In [208]:
from sklearn.decomposition import PCA, KernelPCA, MiniBatchSparsePCA, IncrementalPCA
from itertools import cycle
def _plot_scatter(x, labels=None):
    cmap_ctableau = cycle(tableau20[::2])
    f, axes = plt.subplots(1, 2, figsize=(12.5, 6))
    if labels is None:
        axes[0].scatter(x[:, 0], x[:, 1], c=tableau20[4], lw=0, s=2, alpha=.8)
        axes[1].scatter(x[:, 0], x[:, 1], c=tableau20[4], lw=0, s=10, alpha=.05)
    else:
        for k, col in zip(sorted(set(labels)), cmap_ctableau):
            mask = labels == k
            axes[0].scatter(x[mask, 0], x[mask, 1], c=col, lw=0, s=2, alpha=.8)
            axes[1].scatter(x[mask, 0], x[mask, 1], c=col, lw=0, s=10, alpha=.05)
    return axes
X_pca = PCA(n_components=2).fit_transform(X)
_plot_scatter(X_pca)
X_pca_std = PCA(n_components=2).fit_transform(X_std)
_plot_scatter(X_pca_std)
Out[208]:
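It is worth checking how much variance the 2-D projection retains (a sketch on the standardized features):
In [ ]:
# Sketch: variance explained by the first two principal components.
pca = PCA(n_components=2).fit(X_std)
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())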
In [138]:
_plot_scatter(tsne_reduced)
Out[138]:
In [139]:
X_kpca = KernelPCA(n_components=2).fit_transform(X)
_plot_scatter(X_kpca)
X_kpca_std = KernelPCA(n_components=2).fit_transform(X_std)
_plot_scatter(X_kpca_std)
Out[139]:
In [149]:
from hdbscan import HDBSCAN
clusterer_hdbscan = HDBSCAN(min_cluster_size=60, min_samples=20).fit(X_std)
print_red(len(set(clusterer_hdbscan.labels_)))
clusterer_hdbscan
Out[149]:
In [150]:
_plot_scatter(X_kpca_std, clusterer_hdbscan.labels_)
Out[150]:
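Cluster sizes and the noise fraction give a quick sanity check of the HDBSCAN result (a sketch; label -1 is noise):
In [ ]:
# Sketch: samples per cluster; label -1 collects HDBSCAN noise points.
pd.Series(clusterer_hdbscan.labels_).value_counts().sort_index()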
In [210]:
for x in [X, X_kpca_std, X_pca, tsne_reduced]:
    print_info(x.shape)
    clusterer_hdbscan = HDBSCAN(min_cluster_size=10, min_samples=100).fit(x)
    print_red(len(set(clusterer_hdbscan.labels_)))
    _plot_scatter(x, clusterer_hdbscan.labels_)
    _plot_scatter(X_pca, clusterer_hdbscan.labels_)
    plt.show()
In [159]:
features_std.corr()
Out[159]:
In [211]:
clusterer_hdbscan.labels_
Out[211]:
In [204]:
from sklearn.linear_model import LinearRegression
data_ls = features_valid.copy()  #.drop(['step'], axis=1)
y_ls = data_ls['median'].values
X_ls = data_ls.drop(['median'], axis=1).values
LR = LinearRegression(n_jobs=-1)
LR.fit(X_ls, y_ls)
LR.score(X_ls, y_ls)
Out[204]:
In [205]:
ax = plt.plot(y_ls[:4000])
plt.plot(LR.predict(X_ls)[:4000])
Out[205]:
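Residuals show where the linear fit fails more clearly than the overlay above (a sketch over the same window):
In [ ]:
# Sketch: residuals of the linear fit.
resid = y_ls - LR.predict(X_ls)
plt.figure(figsize=FS)
plt.plot(resid[:4000], lw=.8, alpha=.8)
plt.axhline(0, lw=1)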
In [213]:
features_valid.head()
Out[213]: