In [29]:
import copy
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
from datetime import datetime,timedelta
from scipy.stats import norm,rayleigh
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
sns.set(context="paper", font="monospace")
from sklearn import cross_validation
Sources
The demand/generation data from the National Dispactch Center (CNDC) is available as an html table where each column is a different generation source, with the last column being the overall national demand, and each row being an hour of the day. A seperate table is generated in a seperate web page for every day, so the following code was used to collect these tables, clean the data, and organize all of the data into a single pandas dataframe for future analysis. With the exception of some null/missing values, the data provided is hourly. This data is available as far back as June 21st, 2011. In order to capture yearly trends, only the full years of data (2012-2014) were used for analysis.
The weather data is taken from a free api that provides 3-hour weather (temperature,wind,barometric pressure,rainfall,etc.) data taken from the Managua Airport. This data is available as far back as July 1st, 2008.
The code for scraping and cleaning data is in the Data_Collection notebook. The end result of this cleaning is a single dataframe that is indexed by datetime and contains generation,weather, and demand data at hourly intervals. Since the weather data is only available at 3-hour intervals, it is interpolated to fit the hourly electricity data.
Missing Data
There are 52 missing data points from different generation facilities within the 2012-2014 demand/generation dataset, and none from the weather dataset:
Additional Variables
In addition to demand, generation, and weather data, the following variables were added to the dataset:
The boxplot shows that the mean has been gradually increasing over the years. It also shows the outliers that occur.
Outliers
The histogram shows a bimodal distribution. Through domain knowledge of the existence of 'peak' and 'off-peak' time, along with K-means clustering to find the exact hours for 'peak' and 'off-peak' usage, it can be seen that this bimodality is two distributions, one for 'peak' times and one for 'off-peak' times.
In [2]:
with open('data/df_demand.pkl','rb') as f:
df_orig = pickle.load(f)
with open('data/df_all.pkl','rb') as f:
df_orig_all = pickle.load(f)
In [3]:
df = copy.deepcopy(df_orig)
df_bp = {}
demand_2012 = df['Demanda'][df.index.year == 2012]
demand_2013 = df['Demanda'][df.index.year == 2013]
demand_2014 = df['Demanda'][df.index.year == 2014]
bp = plt.boxplot([df['Demanda'],demand_2012,demand_2013,demand_2014])
plt.xticks([1, 2, 3,4,5], ['All','2012','2013','2014'])
df_bp['median'] = bp['medians'][0]._y[0]
df_bp['fences'] = [bp['caps'][1]._y[0], bp['caps'][0]._y[1]]
df_bp['IQR'] = [bp['whiskers'][0]._y[0], bp['whiskers'][1]._y[0]]
print 'Median: ' + str(df_bp['median'])
print 'Fences: ' + str(df_bp['fences'])
print 'IQR: ' + str(df_bp['IQR'])
In [15]:
high_lim=max(df['Demanda'])
bins=100
start_date = datetime(2011,1,1)
end_date = datetime(2014,12,31)
plt.hist(df[start_date:end_date]['Demanda'],bins,range=[0,high_lim],label='all')
#off-peak is between 10PM-9AM and all weekends
df_hora_off_peak = df[df['HORA']<9].append(df[df['HORA']>=22])[df['weekend']==0].append(df[df['weekend']==1]).sort()
#peak is between 9AM-10PM on weekdays
df_hora_peak = df[df['HORA']>=9][df['HORA']<22][df['weekend']==0]
#df_hora_peak = df[df['HORA']>=8][df['HORA']<14]
#df_hora_peak = df_hora_peak.append(df[df['HORA']>=18]).sort()
df_hora_super_peak = df[df['HORA']>=18][df['HORA']<22][df['weekend']==0]
plt.hist(df_hora_off_peak[start_date:end_date]['Demanda'],bins,range=[0,high_lim],
label='off-peak (9PM-9AM and weekends)',alpha=.6)
plt.hist(df_hora_peak[start_date:end_date]['Demanda'],bins,range=[0,high_lim],
label='peak (9AM-9PM on weekdays)',alpha=.5)
plt.xlim([0 ,high_lim])
plt.title('Demand Distribution (Histogram), by feature')
plt.ylabel('#')
plt.xlabel('kW')
lgd = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pass
The TestedMethods python notebook has a larger number of MLRs, but below is the one that worked the best as well as the summary of the MLR without sub-group analysis in order to take a look at how important variables are. I used K-fold cross validation with 5 folds, and achieved MAPE (mean absolute percent error) of 2.6%, which is below most values I have found in the literature.
In [31]:
def mls(df_train,df_test,summary=False):
x_train = df_train.drop('Demanda_shift_1',1).drop('date',1)
x_test = df_test.drop('Demanda_shift_1',1).drop('date',1)
y_train = df_train['Demanda_shift_1']
y_test = df_test['Demanda_shift_1']
x_train = sm.add_constant(x_train, prepend=True) #add a constant
x_test = sm.add_constant(x_test, prepend=True) #add a constant
ols = sm.OLS(y_train,x_train)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test)
error = (y_test - pred_y)/y_test
mape = np.mean(abs(error))
#print "MAPE ERROR: " + str(mape)
print "MAX ERROR: " + str(max(error))
if summary:
print res.summary()
return [mape,error,y_test]
def cross_validate(df,k,summary=False):
mape_error = []
errors = []
mapes = []
kf = cross_validation.KFold(len(df),k)
for train_index, test_index in kf:
[mape,error,index] = mls(df.iloc[train_index],df.iloc[test_index],summary)
mapes.append(mape)
errors.append(error)
mape_error.append([mape,error,index])
print
print "MIN MAPE:" + str(min(mapes))
return [min(mape_error)[0], min(mape_error)[1],min(mape_error)[2]]
In [32]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
k=5
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)
df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df = df.drop('windspeedKmph',1).drop('windspeedMiles',1).drop('FeelsLikeF',1).drop('FeelsLikeC',1).drop('DewPointC',1)
df = df.drop('WindGustMiles',1).drop('WindGustKmph',1).drop('weatherCode',1).drop('payday',1).drop('tempF',1)
df = df.drop('tempC',1).drop('day',1).drop('month',1).drop('WindChillC',1).drop('HeatIndexC',1).drop('winddirDegree',1)
df = df.drop('cloudcover',1).drop('precipMM',1).drop('visibility',1).drop('timeinweek',1).drop('pressure',1)
df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (9PM-9AM)'
mapes = []
[mape,error,index] = cross_validate(df_wd_opk,k)
mapes.append(mape)
lengths.append(len(error))
print
df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error,index] = cross_validate(df_wd_pk,k)
mapes.append(mape)
lengths.append(len(error))
print
df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (9PM-6PM)'
[mape,error,index] = cross_validate(df_we_opk,k)
mapes.append(mape)
lengths.append(len(error))
print
df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error,index] = cross_validate(df_we_pk,k)
mapes.append(mape)
lengths.append(len(error))
print
mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string
pass
The primary purpose of the following work is to analyze how the grid uses certain generation sources. Generally, the utility company will plan out which forms of generation to use for forecasted demand about several hours in advance (grid planning). Minutes ahead of time, adjustments will be made to the power plants that are on to correct for any changes (primary/secondary reserves). There is also relatively real-time adjustments that take place to correct for any discrepencies (governor control).
The following analysis looks to see which forms of generation are used as primary/secondary reserves. As can be seen above, demand is a fairly predictable signal, which is why there was only a limited need for short-term flexibility. However, the stochastic nature of wind generation requires greater amounts of flexibility on a short-term basis. Bunker fuel and hydroelectricity are two of the primary sources of flexiblity at this level.
This summer, we will be implementing a pilot demand response program that allows for the control of thermostatic loads (freezers) in 30 small businesses in Managua. The purpose of this pilot is to showcase the ability for demand response programs to be used as a source of grid flexibility.
The recent drought in Nicaragua has reduced their hydroelectricity capacity, and thus placed an additional burden on bunker fuel for grid flexibility. This leads to increased costs, both economic and environmental. Our hypothesis for the following analysis is to show this phenonemon. Specifically, to show that the drought in 2014 has reduced the use of hydroelectricity as a form of grid flexibility. Therefore, hydroelectricity alone is not a resilient form of grid flexibility, and instead there should be other forms of grid flexibility available to the utility company. Essentially, we are proving the need for a demand response program for increased grid flexibility.
In [33]:
df_gen = copy.deepcopy(df_orig_all)
#changed EOLO to EOL, EEC50 to EEC, and TPT to TPC
#changed PMGA to PMG, PCH1 to PCHN
df_gen['Albanisa_Bunker'] = df_gen[['PCG1','PCG2','PCG3','PCG4','PCG5','PCG6','PCG7',
'PCG8','PCG9','PHC1','PHC2']].sum(axis=1)
df_gen['AEI_Wind'] = df_gen[['AMY1','AMY2']].sum(axis=1)
df_gen['Gruo_Terra_Wind'] = df_gen[['PBP']].sum(axis=1)
df_gen['Other_Wind'] = df_gen[['ABR','EOL']].sum(axis=1)
df_gen['AEI_Bunker'] = df_gen[['CEN','EEC20','EEC','TPC']].sum(axis=1)
df_gen['Enel_Hydro'] = df_gen[['PCA1','PCA2','PCF1','PCF2']].sum(axis=1)
df_gen['Enel_Bunker'] = df_gen[['PLB1','PLB2','PMG3','PMG4','PMG5']].sum(axis=1)
df_gen['Other_Bunker'] = df_gen[['PCHN','PNI1','PNI2','GSR']].sum(axis=1)
df_gen['Hidropanasma_Hydro'] = df_gen[['HPA1','HPA2']].sum(axis=1)
bunker_fuel = ['PCG1','PCG2','PCG3','PCG4','PCG5','PCG6','PCG7','PCG8','PCG9','PHC1','PHC2','CEN','EEC20',
'EEC','TPC','PLB1','PLB2','PMG3','PMG4','PMG5','PCHN','PNI1','PNI2','GSR']
interconnect = ['LNI-L9040','SND-L9090','AMY-L9030','TCPI-L9150']
hydro = ['PCA1','PCA2','PCF1','PCF2','HPA1','HPA2']
geothermal = ['PEN1 y 2','PEN3','PEN4','PMT1','PMT2']
biomass = ['MTR','NSL']
wind = ['AMY1','AMY2','ABR','EOL','PBP']
df_gen['bunker'] = df_gen[bunker_fuel].sum(axis=1)
df_gen['interconnect'] = df_gen[interconnect].sum(axis=1)
df_gen['hydro'] = df_gen[hydro].sum(axis=1)
df_gen['geothermal'] = df_gen[geothermal].sum(axis=1)
df_gen['biomass'] = df_gen[biomass].sum(axis=1)
df_gen['wind'] = df_gen[wind].sum(axis=1)
df_gen['all_power'] = df_gen['bunker']+df_gen['hydro']+df_gen['geothermal']+df_gen['biomass']+df_gen['wind']
df_big_gen = pd.DataFrame([df_gen['wind'],df_gen['biomass'],df_gen['interconnect'],df_gen['geothermal'],
df_gen['bunker'],df_gen['hydro'],df_gen['Demanda']]).T
Contribution Plots The following two plots show the contributions of different energy sources to the hourly Nicaraguan grid from 2012-2014.
The first plot shows the overall contribution for a 5 day period, with the largest signal being demand. An arbitrary 5-day period is used for this plot to show the variability of each energy source over the day.
The second plot shows the relative contribution of each energy source to the overall monthly demand. In this case, the interconnection and demand were added into a single signal, since the interconnection acts essentially as another source of demand. The installation of wind plants (PBP on July 25th, 2012, EOL on Nov. 10th, 2012, and ABR on Nov. 29th, 2013) can be seen in the plot as marginal increases in the contribution of wind.
The third plot focuses on the contribution of hydro and bunker fuel, and compares each souces monthly contribution to demand between each year.
In [34]:
fuel_types = ['bunker','interconnect','hydro','geothermal','biomass','wind','Demanda']
fuel_types_array = []
start_date = datetime(2013,1,1)
end_date = datetime(2013,1,5)
df_plot = df_gen[start_date:end_date]
#plt.plot(df_plot.index,df_plot['all_power'],label='all')
for key in fuel_types:
fuel_types_array.append(df_plot[key])
plt.plot(df_plot.index,df_plot[key],label=key)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('5 Days of Electricity by Generation Type (to show signal shape)')
plt.xlabel('Time')
plt.ylabel('kWh')
Out[34]:
In [35]:
df_flex = copy.deepcopy(df_big_gen)
df_flex['wind_diff'] = df_flex['wind'].diff(1)
df_flex['hydro_diff'] = df_flex['hydro'].diff(1)
df_flex['bunker_diff'] = df_flex['bunker'].diff(1)
df_flex['biomass_diff'] = df_flex['biomass'].diff(1)
df_flex['geothermal_diff'] = df_flex['geothermal'].diff(1)
df_flex['interconnect_diff'] = df_flex['interconnect'].diff(1)
df_flex['Demanda_diff'] = df_flex['Demanda'].diff(1)
df_flex['Demanda_diff_12'] = df_flex['Demanda'].diff(12)
df_flex['Demandcorrected'] = df_flex['Demanda'] - df_flex['wind'] + df_flex['interconnect']
df_flex['Demandinterconnect'] = df_flex['Demanda']+df_flex['interconnect']
df_flex['Dnowinddiff'] = df_flex['Demandcorrected'].diff(1)
df_flex['Dnowinddiff_12'] = df_flex['Demandcorrected'].diff(12)
df_hydro_contrib = copy.deepcopy(df_flex['hydro']/df_flex['Demandinterconnect']).dropna()
df_bunker_contrib = copy.deepcopy(df_flex['bunker']/df_flex['Demandinterconnect']).dropna()
df_wind_contrib = copy.deepcopy(df_flex['wind']/df_flex['Demandinterconnect']).dropna()
df_biomass_contrib = copy.deepcopy(df_flex['biomass']/df_flex['Demandinterconnect']).dropna()
df_geotherm_contrib = copy.deepcopy(df_flex['geothermal']/df_flex['Demandinterconnect']).dropna()
plt.plot(df_hydro_contrib.resample('M',how='mean').index,
df_hydro_contrib.resample('M',how='mean'),'b',label='Hydro')
plt.plot(df_bunker_contrib.resample('M',how='mean').index,
df_bunker_contrib.resample('M',how='mean'),'k',label='Bunker')
plt.plot(df_wind_contrib.resample('M',how='mean').index,
df_wind_contrib.resample('M',how='mean'),'r',label='Wind')
plt.plot(df_biomass_contrib.resample('M',how='mean').index,
df_biomass_contrib.resample('M',how='mean'),'g',label='Biomass')
plt.plot(df_geotherm_contrib.resample('M',how='mean').index,
df_geotherm_contrib.resample('M',how='mean'),'y',label='Geothermal')
plt.legend(title="Sources",bbox_to_anchor=(1.2, .48),
bbox_transform=plt.gcf().transFigure)
locs, labels = plt.xticks()
plt.setp(labels, rotation=90)
plt.ylabel('% of Demand+interconnection')
pass
In [36]:
start = datetime(2012,1,1)
end = datetime(2012,12,31)
plt.plot(range(1,13),df_flex['hydro'][start:end].resample('M',how='mean'),'b',label='Hydro 2012')
plt.plot(range(1,13),df_flex['bunker'][start:end].resample('M',how='mean'),'r',label='Bunker 2012')
start = datetime(2013,1,1)
end = datetime(2013,12,31)
plt.plot(range(1,13),df_flex['hydro'][start:end].resample('M',how='mean'),'b:',label='Hydro 2013')
plt.plot(range(1,13),df_flex['bunker'][start:end].resample('M',how='mean'),'r:',label='Bunker 2013')
start = datetime(2014,1,1)
end = datetime(2014,12,31)
plt.plot(range(1,13),df_flex['hydro'][start:end].resample('M',how='mean'),'b--',label='Hydro 2014')
plt.plot(range(1,13),df_flex['bunker'][start:end].resample('M',how='mean'),'r--',label='Bunker 2014')
plt.ylabel('% Contribution')
plt.xlabel('Month of Year')
plt.title('% Contributions of Hydro and Bunker Fuel Overall')
plt.legend(title="Years",bbox_to_anchor=(1.2, .92),
bbox_transform=plt.gcf().transFigure)
plt.xlim([1,12])
pass
#t-tests
Correlation Plot
The following correlation plot shows that the strongest correlation is between Demand and bunker fuel (0.732). This can be visually confirmed by the plot above, which shows the bunker signal following the peak and off-peak nature of the demand signal.
Further, there is a negative correlation for bunker vs. wind and hydro vs. wind, indicating that hydro and bunker fuel are the primary sources for dealing with decreases in wind generation.
In [37]:
df_big_gen.keys()
df_corr = copy.deepcopy(df_big_gen.dropna())
In [38]:
#The Quantities Correlated
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_big_gen.keys()).T
corr_df.columns = df_big_gen.keys()
print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),
yticklabels=df_big_gen.keys())
Out[38]:
In order to further look into how these sources are used with each other, it is important to see not only the overall use of these sources, but how they change over time. The timeframe for primary and secondary reserves is typically on the minutes scale, but the data is limited to 1-hour time intervals. However, by looking at how energy sources change from one hour to the next, information can be optained regarding what sources are being used as reserves. As is shown above, hydro and bunker fuel are the two energy sources inversely correlated with wind generation, which confirms the assertion that these sources are the ones primarily used as reserves for wind intermittency.
In order to incorporate wind intermittency into the demand requirements for hydro and bunker fuel, a signal X(t) = (Demand - wind + interconnect) is considered as a more important signal than demand alone. The addition of interconnection incorporates the needs put on the grid by the interconnection agreements. The subtraction of wind incorporates the demand that is remaning for the other energy sources after wind generation has been taken into account. The other singals, hydroelectricity H(t), and bunker fuel B(t) will also be used in the following explanation.
To further focus on the use of hydro as a form of grid reserve, the data is filtered to only include datapoints where the derivative of X(t) is positive, meaning that there was an increase in X(t) from the hour before. This indicates that a corresponding increase in supply was needed to match this increase in energy needs. The data is then further filtered to only include datapoints where the derivative of hydroelectricity (H'(t)) is positive, meaning that there was an increase in the amount of hydro generation from the hour before. This indicates that H(t) was being used to partially account for the increase in X(t).
The value of H'(t) indicates how much additional hydro was used during that hour. Similarly, the value of B'(t) indicates how much additional bunker fuel was used during that hour.
This data will be further analyzed to see how maintenence work on the various generators may have impacted their ability to be used for reserves, as well as how the drought in Nicaragua may have impacted the use of hydro for reserves.
In [39]:
df_filter = copy.deepcopy(df_flex[df_flex['Dnowinddiff']>0][df_flex['hydro_diff']>0])
df_hydro_contrib = copy.deepcopy(df_filter['hydro_diff']).dropna()
df_bunker_contrib = copy.deepcopy(df_filter['bunker_diff']).dropna()
df_geothermal_contrib = copy.deepcopy(df_filter['geothermal_diff']).dropna()
plt.plot(df_filter['Dnowinddiff'].resample('M',how='mean').index,
df_filter['Dnowinddiff'].resample('M',how='mean'),'g',label='Demand( -wind +interconnect)')
plt.plot(df_filter['hydro_diff'].resample('M',how='mean').index,
df_filter['hydro_diff'].resample('M',how='mean'),'b',label='Hydro')
plt.plot(df_filter['bunker_diff'].resample('M',how='mean').index,
df_filter['bunker_diff'].resample('M',how='mean'),'k',label='Bunker')
plt.legend(title="Sources",bbox_to_anchor=(1.3, .9),
bbox_transform=plt.gcf().transFigure)
locs, labels = plt.xticks()
plt.ylabel('kW')
plt.title('Use of Different Energy Sources as Grid Reserves')
plt.setp(labels, rotation=90)
pass