In [1]:
from epa_client import *
###################
# set up regions
###################
# these are approximations of geographic regions associated with the electricity markets
regions = {
'isone': ['ct', 'me', 'ma', 'vt', 'nh', 'ri'],
'ercot': ['tx'],
'bpa': ['wa', 'or', 'id'],
'pjm': ['pa', 'nj', 'dc', 'md', 'va', 'de', 'ky', 'wv', 'oh'],
'miso': ['ia', 'mo', 'il', 'mi', 'wi', 'nd', 'sd', 'ne', 'mn', 'in'],
'spp': ['ks', 'ok', 'ms', 'la', 'ar', 'mo'],
'caiso': ['ca'],
}
Consider the problem of predicting the marginal emissions on day $i$ ($M_i = E_i/G_i$) using the generation on day $i$ ($G_i$) and the generation and emissions of other "similar" days $j$ ($G_j$ and $E_j$). Let's start by seeing how well we can do simply by taking the average of $G_i \times E_j/G_j$ for the $n$ most similar days to day $i$. In other words, consider a $n$-term linear model with fixed coefficients of $1/n$, i.e., $\hat{E_i} = \frac{1}{n}\sum_{j=1}^{n} G_i \times E_j/G_j$. For a measure of similarity between days, we use the RMS error in the hourly generation timeseries. The plots show the RMS prediction error of this model for a range of values of $n$, grouped by month and by percentile of daily generation.
In [58]:
import calendar
yr = '2012'
n_prediction_days = 15
for reg in regions.keys():
# set up
gen = total_gen_by_hour(yr, regions[reg])
em = total_em_by_hour(yr, regions[reg])
daily_gen = gen.sum(level=['OP_DATE'])
daily_em = em.sum(level=['OP_DATE'])
day_labels = daily_gen.keys()
n_days = day_labels.size
# RMS error of generation timeseries between days (symmetrical)
rms_gen = np.zeros([n_days, n_days])
# pred_em_each[i1, i2] is prediction of emissions on day 1 based on day 2 data
pred_em_each = np.zeros([n_days, n_days])
# compute pairwise matrices for all pairs of days
for iday1, (day1, group1) in enumerate(gen.groupby(level=['OP_DATE'])):
for iday2, (day2, group2) in enumerate(gen.groupby(level=['OP_DATE'])):
if iday1 != iday2:
rms_gen[iday1, iday2] = error_between_groups(group1, group2)
pred_em_each[iday1, iday2] = daily_em[iday2] * daily_gen[iday1]/daily_gen[iday2]
# pred_em_by_n[i1, n] is predicted emissions on day 1 based on closest n days
pred_em_by_n = np.zeros([n_days, n_prediction_days])
# pred_em_err_by_n[i1, n] is actual - predicted emissions on day 1 based on closest n days
pred_em_err_by_n = np.zeros([n_days, n_prediction_days])
# pred_mef_err_by_n[i1, n] is actual - predicted MEF on day 1 based on emissions prediction from closest n days
pred_mef_err_by_n = np.zeros([n_days, n_prediction_days])
# compute pairwise matrices for all pairs of days
for iday in range(n_days):
# sort by rms
sorted_inds = np.argsort(rms_gen[iday])
# summarize day 1
for n in range(n_prediction_days):
# get vals to predict on (first value should be zero, so skip)
inds = sorted_inds[1:(n+1)]
# select prediction values for closest n days
vals = pred_em_each[iday, inds]
pred_em_by_n[iday, n] = np.mean(vals)
pred_em_err_by_n[iday, n] = daily_em[iday] - pred_em_by_n[iday, n]
pred_mef_err_by_n[iday, n] = pred_em_err_by_n[iday, n] / daily_gen[iday]
# bin errors by month
pred_mef_err_by_mo = np.zeros([12, n_prediction_days])
first_of_mo = 0
for imo in range(12):
wkd_in_mo, ndays_in_mo = calendar.monthrange(int(yr), imo+1)
vals_in_mo = pred_mef_err_by_n[first_of_mo : (first_of_mo+ndays_in_mo)]
pred_mef_err_by_mo[imo] = np.linalg.norm(vals_in_mo, axis=0)
first_of_mo += ndays_in_mo
# plot by month
plt.figure()
colors = cm.rainbow(np.linspace(0, 1, n_prediction_days))
for i in range(n_prediction_days):
plt.plot(pred_mef_err_by_mo[:,i], 'o-', c=colors[i], label=str(i+1))
plt.legend(fontsize='x-small', ncol=5, numpoints=1)
plt.title(' '.join([yr, reg.upper()]))
plt.xticks( arange(12), calendar.month_name[1:13], rotation=90 )
plt.ylabel('RMS prediction error of MEF [lb/MW]')
# bin errors by generation
nbins = 20
pred_mef_err_by_gen = np.zeros([nbins, n_prediction_days])
daily_gen_iles = np.percentile(daily_gen, range(0, 101, 100/nbins))
for ibin in range(nbins):
vals_in_bin = pred_mef_err_by_n[(daily_gen >= daily_gen_iles[ibin]) & (daily_gen < daily_gen_iles[ibin+1])]
pred_mef_err_by_gen[ibin] = np.linalg.norm(vals_in_bin, axis=0)
# plot by generation
plt.figure()
for i in range(n_prediction_days):
plt.plot(daily_gen_iles[:-1] + np.diff(daily_gen_iles), pred_mef_err_by_gen[:,i],
'o-', c=colors[i], label=str(i+1))
plt.legend(fontsize='x-small', ncol=5, numpoints=1)
plt.title(' '.join([yr, reg.upper()]))
plt.xlabel('total daily gen [MW]')
plt.ylabel('RMS prediction error of MEF [lb/MW]')