In [ ]:
import copy
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
from datetime import datetime,timedelta
from scipy.stats import norm,rayleigh
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
In [ ]:
with open('../data/df_demand.pkl','rb') as f:
    df_orig = pickle.load(f)
To begin exploring the dataset, we first plot a histogram of demand values, both overall and for each year. The histogram clearly shows a bimodal distribution. Further analysis shows that this bimodality is largely explained by a binary peak vs. off-peak variable (peak being 9AM-10PM on weekdays).
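As a quick check on the bimodality claim, the sketch below (assuming a scikit-learn version that provides sklearn.mixture.GaussianMixture) fits a two-component Gaussian mixture to the demand values; two well-separated component means with comparable weights support the peak/off-peak interpretation.
In [ ]:
from sklearn.mixture import GaussianMixture

# Sketch only: fit a two-component Gaussian mixture to the demand series.
# Well-separated means (with comparable weights) are consistent with a bimodal distribution.
demand = df_orig['Demanda'].dropna().values.reshape(-1, 1)
gmm = GaussianMixture(n_components=2, random_state=0).fit(demand)
print(gmm.means_.ravel())   # approximate locations of the two modes
print(gmm.weights_)         # fraction of observations assigned to each mode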
In [9]:
start_date = datetime(2012,1,1)
end_date = datetime(2014,12,31)
df = copy.deepcopy(df_orig)[start_date:end_date]
#Year-by-Year
bins = 100
high_lim = max(df['Demanda'])
plt.close('all')
fig = plt.figure()
ax1 = plt.subplot2grid((4,4),(0,0),rowspan=3,colspan=3)
ax1.hist(df[start_date:end_date]['Demanda'],bins,range=[0,high_lim])
plt.title('2012-2014')
plt.xlim([0 ,high_lim])
plt.xlabel('kW')
ax2 = plt.subplot2grid((4,4),(0,3),rowspan=1,colspan=1)
year = 2012
plt.title(str(year))
ax2.hist(df['Demanda'][df.index.year == year],bins,range=[0,high_lim])
plt.xlim([0 ,high_lim])
plt.ylim([0,400])
plt.xlabel('kW')
ax3 = plt.subplot2grid((4,4),(1,3),rowspan=1,colspan=1)
year = 2013
plt.title(str(year))
ax3.hist(df['Demanda'][df.index.year == year],bins,range=[0,high_lim])
plt.xlim([0 ,high_lim])
plt.ylim([0,400])
plt.xlabel('kW')
ax4 = plt.subplot2grid((4,4),(2,3),rowspan=1,colspan=1)
year = 2014
plt.title(str(year))
ax4.hist(df['Demanda'][df.index.year == year],bins,range=[0,high_lim])
plt.xlim([0 ,high_lim])
plt.ylim([0,400])
plt.xlabel('kW')
plt.tight_layout()
In [13]:
high_lim=max(df['Demanda'])
bins=100
start_date = datetime(2011,1,1)
end_date = datetime(2014,12,31)
plt.hist(df[start_date:end_date]['Demanda'],bins,range=[0,high_lim],label='all')
#off-peak is 10PM-9AM on weekdays, plus all weekend hours
off_peak_mask = (((df['HORA'] < 9) | (df['HORA'] >= 22)) & (df['weekend'] == 0)) | (df['weekend'] == 1)
df_hora_off_peak = df[off_peak_mask].sort_index()
#peak is 9AM-10PM on weekdays
df_hora_peak = df[(df['HORA'] >= 9) & (df['HORA'] < 22) & (df['weekend'] == 0)]
#super-peak (evening) is 6PM-10PM on weekdays
df_hora_super_peak = df[(df['HORA'] >= 18) & (df['HORA'] < 22) & (df['weekend'] == 0)]
plt.hist(df_hora_off_peak[start_date:end_date]['Demanda'],bins,range=[0,high_lim],label='off-peak (10PM-9AM and weekends)',alpha=.6)
plt.hist(df_hora_peak[start_date:end_date]['Demanda'],bins,range=[0,high_lim],label='peak (9AM-10PM weekdays only)',alpha=.5)
plt.xlim([0 ,high_lim])
plt.title('Demand Distribution (Histogram), by feature')
plt.xlabel('kW')
plt.ylabel('#')
lgd = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pass
The boxplots show a median that increases each year, indicating greater overall energy usage as development in Nicaragua continues. Five outliers fall below the lower fence:
In [10]:
df_bp = {}
demand_2012 = df['Demanda'][df.index.year == 2012]
demand_2013 = df['Demanda'][df.index.year == 2013]
demand_2014 = df['Demanda'][df.index.year == 2014]
bp = plt.boxplot([df['Demanda'],demand_2012,demand_2013,demand_2014])
plt.xticks([1, 2, 3, 4], ['All','2012','2013','2014'])
# Read the summary statistics for the 'All' box back off the boxplot artists
df_bp['median'] = bp['medians'][0].get_ydata()[0]
df_bp['fences'] = [bp['caps'][0].get_ydata()[0], bp['caps'][1].get_ydata()[0]]
df_bp['IQR'] = [bp['whiskers'][0].get_ydata()[0], bp['whiskers'][1].get_ydata()[0]]
print('Median: ' + str(df_bp['median']))
print('Fences: ' + str(df_bp['fences']))
print('IQR: ' + str(df_bp['IQR']))
In [12]:
df['Demanda'][df['Demanda']<200]
Out[12]:
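For reference, the same statistics can be computed directly from quantiles rather than read back off the boxplot artists; the sketch below uses the 1.5 × IQR (Tukey) rule. Note that the boxplot caps sit at the most extreme observations inside the fences, so the numbers can differ slightly.
In [ ]:
# Sketch only: Tukey fences for the 2012-2014 demand series (1.5 * IQR rule).
q1, q3 = df['Demanda'].quantile([0.25, 0.75])
iqr = q3 - q1
lower_fence, upper_fence = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print('IQR: [' + str(q1) + ', ' + str(q3) + ']')
print('Fences: [' + str(lower_fence) + ', ' + str(upper_fence) + ']')
print('Low outliers: ' + str((df['Demanda'] < lower_fence).sum()))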
In [17]:
#peak months (Feb-Apr) vs. off-peak months (Jan, May-Dec), split by weekday/weekend
df_pk_wd = copy.deepcopy(df_orig[(df_orig['month']>1) & (df_orig['month']<5) & (df_orig['weekend']==0)])
df_opk_month_wd = copy.deepcopy(df_orig[((df_orig['month']==1) | (df_orig['month']>=5)) & (df_orig['weekend']==0)].sort_index())
df_pk_month_we = copy.deepcopy(df_orig[(df_orig['month']>1) & (df_orig['month']<5) & (df_orig['weekend']==1)])
df_opk_month_we = copy.deepcopy(df_orig[((df_orig['month']==1) | (df_orig['month']>=5)) & (df_orig['weekend']==1)].sort_index())
In [21]:
def k_means_cluster(X,n_clusters):
    np.random.seed(3)
    estimator = KMeans(n_clusters=n_clusters,n_init=100)
    estimator.fit(X)
    sample_size = len(X)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    # Plot the decision boundary by assigning a color to each point in the mesh.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in the mesh using the trained model.
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')
    plt.scatter(X[:, 0], X[:, 1], c=estimator.labels_.astype(float))
    print(metrics.silhouette_score(X, estimator.labels_, metric='euclidean', sample_size=sample_size))
    return estimator.labels_
In [25]:
df_wd = copy.deepcopy(df_orig[df_orig['weekend']==0])
df_we = copy.deepcopy(df_orig[df_orig['weekend']==1])
weekday_mean_hour = []
for val in range(0,24):
    hour_of_weekday_df = df_wd[df_wd['HORA']==val]['Demanda']
    weekday_mean_hour.append([val,np.mean(hour_of_weekday_df)])
plt.figure()
labels_month_2 = k_means_cluster(np.array(weekday_mean_hour),2)
plt.title('Weekday Cluster K=2 By Hour')
weekday_mean_hour = []
for val in range(0,24):
    hour_of_weekday_df = df_we[df_we['HORA']==val]['Demanda']
    weekday_mean_hour.append([val,np.mean(hour_of_weekday_df)])
plt.figure()
labels_month_2 = k_means_cluster(np.array(weekday_mean_hour),2)
plt.title('Weekend Cluster K=2 By Hour')
Out[25]:
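As a quick sanity check on the choice of K=2, the sketch below (an illustrative sweep, not part of the clustering above) rebuilds the weekday hourly profile and reruns k_means_cluster for K from 2 to 5, comparing the silhouette scores it prints (higher is better).
In [ ]:
# Sketch only: silhouette-score sweep over K for the weekday hourly demand profile.
X = np.array([[h, np.mean(df_wd[df_wd['HORA'] == h]['Demanda'])] for h in range(24)])
for k in range(2, 6):
    plt.figure()
    k_means_cluster(X, k)   # prints the silhouette score for this K
    plt.title('Weekday Cluster K=%d By Hour' % k)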
In [ ]: