Predicting emissions

How well can we predict emissions based on other things?


In [1]:
from epa_client import *

###################
# set up regions
###################

# these are approximations of geographic regions associated with the electricity markets
regions = {
    'isone': ['ct', 'me', 'ma', 'vt', 'nh', 'ri'],
    'ercot': ['tx'],
    'bpa': ['wa', 'or', 'id'],
    'pjm': ['pa', 'nj', 'dc', 'md', 'va', 'de', 'ky', 'wv', 'oh'],
    'miso': ['ia', 'mo', 'il', 'mi', 'wi', 'nd', 'sd', 'ne', 'mn', 'in'],
    'spp': ['ks', 'ok', 'ms', 'la', 'ar', 'mo'],
    'caiso': ['ca'],
}

Consider the problem of predicting the marginal emissions on day $i$ ($M_i = E_i/G_i$) using the generation on day $i$ ($G_i$) and the generation and emissions of other "similar" days $j$ ($G_j$ and $E_j$). Let's start by seeing how well we can do simply by taking the average of $G_i \times E_j/G_j$ for the $n$ most similar days to day $i$. In other words, consider a $n$-term linear model with fixed coefficients of $1/n$, i.e., $\hat{E_i} = \frac{1}{n}\sum_{j=1}^{n} G_i \times E_j/G_j$. For a measure of similarity between days, we use the RMS error in the hourly generation timeseries. The plots show the RMS prediction error of this model for a range of values of $n$, grouped by month and by percentile of daily generation.


In [58]:
import calendar

yr = '2012'
n_prediction_days = 15

for reg in regions.keys():
    # set up
    gen = total_gen_by_hour(yr, regions[reg])
    em = total_em_by_hour(yr, regions[reg])
    daily_gen = gen.sum(level=['OP_DATE'])
    daily_em = em.sum(level=['OP_DATE'])
    day_labels = daily_gen.keys()
    n_days = day_labels.size
    
    # RMS error of generation timeseries between days (symmetrical)
    rms_gen = np.zeros([n_days, n_days])
    # pred_em_each[i1, i2] is prediction of emissions on day 1 based on day 2 data
    pred_em_each = np.zeros([n_days, n_days])
    
    # compute pairwise matrices for all pairs of days
    for iday1, (day1, group1) in enumerate(gen.groupby(level=['OP_DATE'])):
        for iday2, (day2, group2) in enumerate(gen.groupby(level=['OP_DATE'])):
            if iday1 != iday2:
                rms_gen[iday1, iday2] = error_between_groups(group1, group2)
                pred_em_each[iday1, iday2] = daily_em[iday2] * daily_gen[iday1]/daily_gen[iday2]
                

    # pred_em_by_n[i1, n] is predicted emissions on day 1 based on closest n days
    pred_em_by_n = np.zeros([n_days, n_prediction_days])
    # pred_em_err_by_n[i1, n] is actual - predicted emissions on day 1 based on closest n days
    pred_em_err_by_n = np.zeros([n_days, n_prediction_days])
    # pred_mef_err_by_n[i1, n] is actual - predicted MEF on day 1 based on emissions prediction from closest n days
    pred_mef_err_by_n = np.zeros([n_days, n_prediction_days])
            
    # compute pairwise matrices for all pairs of days
    for iday in range(n_days):
        # sort by rms
        sorted_inds = np.argsort(rms_gen[iday])
        
        # summarize day 1
        for n in range(n_prediction_days):
            # get vals to predict on (first value should be zero, so skip)
            inds = sorted_inds[1:(n+1)]

            # select prediction values for closest n days
            vals = pred_em_each[iday, inds]
            pred_em_by_n[iday, n] = np.mean(vals)
            pred_em_err_by_n[iday, n] = daily_em[iday] - pred_em_by_n[iday, n]
            pred_mef_err_by_n[iday, n] = pred_em_err_by_n[iday, n] / daily_gen[iday]
                
    # bin errors by month
    pred_mef_err_by_mo = np.zeros([12, n_prediction_days])
    first_of_mo = 0
    for imo in range(12):
        wkd_in_mo, ndays_in_mo = calendar.monthrange(int(yr), imo+1)
        vals_in_mo = pred_mef_err_by_n[first_of_mo : (first_of_mo+ndays_in_mo)]
        pred_mef_err_by_mo[imo] = np.linalg.norm(vals_in_mo, axis=0)
        first_of_mo += ndays_in_mo
                
    # plot by month
    plt.figure()
    colors = cm.rainbow(np.linspace(0, 1, n_prediction_days))
    for i in range(n_prediction_days):
        plt.plot(pred_mef_err_by_mo[:,i], 'o-', c=colors[i], label=str(i+1))
    plt.legend(fontsize='x-small', ncol=5, numpoints=1)
    plt.title(' '.join([yr, reg.upper()]))
    plt.xticks( arange(12), calendar.month_name[1:13], rotation=90 )
    plt.ylabel('RMS prediction error of MEF [lb/MW]')

    # bin errors by generation
    nbins = 20
    pred_mef_err_by_gen = np.zeros([nbins, n_prediction_days])
    daily_gen_iles = np.percentile(daily_gen, range(0, 101, 100/nbins))
    for ibin in range(nbins):
        vals_in_bin = pred_mef_err_by_n[(daily_gen >= daily_gen_iles[ibin]) & (daily_gen < daily_gen_iles[ibin+1])]
        pred_mef_err_by_gen[ibin] = np.linalg.norm(vals_in_bin, axis=0)
                
    # plot by generation
    plt.figure()
    for i in range(n_prediction_days):
        plt.plot(daily_gen_iles[:-1] + np.diff(daily_gen_iles), pred_mef_err_by_gen[:,i],
                 'o-', c=colors[i], label=str(i+1))
    plt.legend(fontsize='x-small', ncol=5, numpoints=1)
    plt.title(' '.join([yr, reg.upper()]))
    plt.xlabel('total daily gen [MW]')
    plt.ylabel('RMS prediction error of MEF [lb/MW]')