In [ ]:
import copy
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
from datetime import datetime,timedelta
from scipy.stats import norm,rayleigh
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics

In [ ]:
with open('../data/df_demand.pkl','rb') as f:
    df_orig = pickle.load(f)

Plots for data Exploration (Histograms and Boxplots)

In order to begin exploring the dataset, we will first plot a histogram of demand values, as well as one for each year. This plot clearly indicates a bimodal distribution. Further analysis shows this bimodality is largely based on a binary variable of peak vs. offpeak (9AM-9PM weekdays being peak).


In [9]:
start_date = datetime(2012,1,1)
end_date = datetime(2014,12,31)
df = copy.deepcopy(df_orig)[start_date:end_date]

#Year-by-Year
bins = 100
high_lim = max(df['Demanda'])
plt.figure()
plt.close('all')
fig = plt.figure()
ax1 = plt.subplot2grid((4,4),(0,0),rowspan=3,colspan=3)
ax1.hist(df[start_date:end_date]['Demanda'],bins,range=[0,high_lim])
plt.title('2012-2014')
plt.xlim([0 ,high_lim])
plt.xlabel('kW')

ax2 = plt.subplot2grid((4,4),(0,3),rowspan=1,colspan=2)
year = 2012
plt.title(year)
start_date = datetime(year,1,1)
end_date = datetime(year,12,31)
ax2.hist(df['Demanda'][df.index.year == year],bins,range=[0,high_lim])
plt.xlim([0 ,high_lim])
plt.ylim([0,400])
plt.xlabel('kW')

ax3 = plt.subplot2grid((4,4),(1,3),rowspan=1,colspan=2)
year = 2013
plt.title(year)
start_date = datetime(year,1,1)
end_date = datetime(year,12,31)
ax3.hist(df['Demanda'][df.index.year == year],bins,range=[0,high_lim])
plt.xlim([0 ,high_lim])
plt.ylim([0,400])
plt.xlabel('kW')

ax4 = plt.subplot2grid((4,4),(2,3),rowspan=1,colspan=2)
year = 2014
plt.title(year)
start_date = datetime(year,1,1)
end_date = datetime(year,12,31)
ax4.hist(df['Demanda'][df.index.year == year],bins,range=[0,high_lim])
plt.xlim([0 ,high_lim])
plt.ylim([0,400])
plt.xlabel('kW')
plt.tight_layout()



In [13]:
high_lim=max(df['Demanda'])
bins=100
start_date = datetime(2011,1,1)
end_date = datetime(2014,12,31)

plt.hist(df[start_date:end_date]['Demanda'],bins,range=[0,high_lim],label='all')
#off-peak is between 10PM-9AM and all weekends
df_hora_off_peak = df[df['HORA']<9].append(df[df['HORA']>=22])[df['weekend']==0].append(df[df['weekend']==1]).sort()

#peak is between 9AM-10PM on weekdays
df_hora_peak = df[df['HORA']>=9][df['HORA']<22][df['weekend']==0]
#df_hora_peak = df[df['HORA']>=8][df['HORA']<14]
#df_hora_peak = df_hora_peak.append(df[df['HORA']>=18]).sort()

df_hora_super_peak = df[df['HORA']>=18][df['HORA']<22][df['weekend']==0]

plt.hist(df_hora_off_peak[start_date:end_date]['Demanda'],bins,range=[0,high_lim],label='off-peak (9PM-9AM and weekends)',alpha=.6)
plt.hist(df_hora_peak[start_date:end_date]['Demanda'],bins,range=[0,high_lim],label='peak (9AM-9PM only weekdays)',alpha=.5)
plt.xlim([0 ,high_lim])
plt.title('Demand Distribution (Histogram), by feature')
plt.xlabel('#')
plt.ylabel('kW')
lgd = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pass


/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py:1808: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)

Outlier Analysis

The boxplots show a mean that increases each year, indicating overall greater energy usage as development in Nicaragua increases. 5 outliers occur:

  • March 26th, 2012 at 13:00 (1.998 kWh)
  • May 25th, 2012 at 19:00 (165.765 kWh)
  • May 25th, 2013 at 7:00 (149.684 kWh)
  • December 5th, 2014 at 5:00 (8.080 kWh)
  • December 5th, 2014 at 6:00 (45.030 kWh)

In [10]:
df_bp = {}
demand_2012 = df['Demanda'][df.index.year == 2012]
demand_2013 = df['Demanda'][df.index.year == 2013]
demand_2014 = df['Demanda'][df.index.year == 2014]
bp = plt.boxplot([df['Demanda'],demand_2012,demand_2013,demand_2014])
plt.xticks([1, 2, 3,4,5], ['All','2012','2013','2014'])
df_bp['median'] = bp['medians'][0]._y[0]
df_bp['fences'] = [bp['caps'][1]._y[0], bp['caps'][0]._y[1]]
df_bp['IQR'] = [bp['whiskers'][0]._y[0], bp['whiskers'][1]._y[0]]
print 'Median: ' + str(df_bp['median'])
print 'Fences: ' + str(df_bp['fences'])
print 'IQR: ' + str(df_bp['IQR'])


Median: 437.56
Fences: [165.76499999999999, 636.07600000000002]
IQR: [374.774, 518.42099999999994]

In [12]:
df['Demanda'][df['Demanda']<200]


Out[12]:
datetime
2012-03-26 13:00:00      1.998
2012-05-25 19:00:00    165.765
2013-05-25 07:00:00    149.684
2014-12-05 05:00:00      8.080
2014-12-05 06:00:00     45.030
Name: Demanda, dtype: float64

Clustering to find peak vs. off-peak times

K-Means clustering was used to find peak vs off-peak times for both weekdays and weekends. This seperation of peak and off-peak confirms visual inspection of the average hourly demand data.


In [17]:
df_pk_wd = copy.deepcopy(df_orig[df_orig['month']>1][df_orig['month']<5][df['weekend']==0])
df_opk_month_wd = copy.deepcopy(df_orig[df_orig['month']==1].append(df_orig[df_orig['month']>=5]).sort()[df['weekend']==0])

df_pk_month_we = copy.deepcopy(df_orig[df_orig['month']>1][df_orig['month']<5][df['weekend']==1])
df_opk_month_we = copy.deepcopy(df_orig[df_orig['month']==1].append(df_orig[df_orig['month']>=5]).sort()[df['weekend']==1])


---------------------------------------------------------------------------
IndexingError                             Traceback (most recent call last)
<ipython-input-17-17db5c81b621> in <module>()
      1 df_pk_month_wd = copy.deepcopy(df_orig[df_orig['month']>1][df_orig['month']<5][df['weekend']==0])
----> 2 df_opk_month_wd = copy.deepcopy(df_orig[df_orig['month']==1].append(df_orig[df_orig['month']>=5]).sort()[df['weekend']==0])
      3 
      4 df_pk_month_we = copy.deepcopy(df_orig[df_orig['month']>1][df_orig['month']<5][df['weekend']==1])
      5 df_opk_month_we = copy.deepcopy(df_orig[df_orig['month']==1].append(df_orig[df_orig['month']>=5]).sort()[df['weekend']==1])

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1772         if isinstance(key, (Series, np.ndarray, Index, list)):
   1773             # either boolean or fancy integer index
-> 1774             return self._getitem_array(key)
   1775         elif isinstance(key, DataFrame):
   1776             return self._getitem_frame(key)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_array(self, key)
   1812             # _check_bool_indexer will throw exception if Series key cannot
   1813             # be reindexed to match DataFrame rows
-> 1814             key = _check_bool_indexer(self.index, key)
   1815             indexer = key.nonzero()[0]
   1816             return self.take(indexer, axis=0, convert=False)

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _check_bool_indexer(ax, key)
   1637         mask = com.isnull(result.values)
   1638         if mask.any():
-> 1639             raise IndexingError('Unalignable boolean Series key provided')
   1640 
   1641         result = result.astype(bool).values

IndexingError: Unalignable boolean Series key provided

In [21]:
def k_means_cluster(X,n_clusters):
    np.random.seed(3)
    estimator = KMeans(n_clusters=n_clusters,n_init=100)
    estimator.fit(X)
    sample_size = len(X)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, m_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = X[:, 0].min() + 1, X[:, 0].max() - 1
    y_min, y_max = X[:, 1].min() + 1, X[:, 1].max() - 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in mesh. Use last trained model.
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')
    plt.scatter(X[:, 0], X[:, 1], c=estimator.labels_.astype(np.float))
    print(metrics.silhouette_score(X, estimator.labels_,metric='euclidean',sample_size=sample_size))
    return estimator.labels_

In [25]:
df_wd = copy.deepcopy(df_orig[df_orig['weekend']==0])
df_we = copy.deepcopy(df_orig[df_orig['weekend']==1])
weekday_mean_hour = []
for val in range(0,24):
    hour_of_weekday_df = df_wd[df_wd['HORA']==val]['Demanda']
    weekday_mean_hour.append([val,np.mean(hour_of_weekday_df)])
plt.figure()
labels_month_2 = k_means_cluster(np.array(weekday_mean_hour),2)
plt.title('Weekday Cluster K=2 By Hour')

weekday_mean_hour = []
for val in range(0,24):
    hour_of_weekday_df = df_we[df_we['HORA']==val]['Demanda']
    weekday_mean_hour.append([vajl,np.mean(hour_of_weekday_df)])
plt.figure()
labels_month_2 = k_means_cluster(np.array(weekday_mean_hour),2)
plt.title('Weekend Cluster K=2 By Hour')


0.725894620573
0.654030304359
Out[25]:
<matplotlib.text.Text at 0x7f84cb32fb50>

In [ ]: