Critical Peak Pricing (CPP) programs are used by utilities to reduce electricity demand during the highest-demand hours on a handful of the highest-demand days each year. This is good for the utilities because it reduces the capacity they must maintain in expensive "peaker" plants to meet that excess demand. Conventional wisdom says that peaker plants are gas plants that are dirtier than the average gas plant. There are three possible scenarios: peak-hour generation could be dirtier than, about as dirty as, or cleaner than generation in a typical hour, and the carbon value of CPP depends on which one holds in a given region.
This notebook tests that underlying assumption and evaluates the potential carbon savings of CPP programs in several regions of the US, using EPA data. We conclude that WattTime should prioritize traditional CPP-like programs in ISONE and PJM, and should avoid them in BPA (at least until an analysis based on a better representation of the BPA generation portfolio can be done).
In [819]:
##########
# imports
##########
# this was run using pandas v0.11.0, scipy v0.12.0, numpy v1.8.0
import pandas as pd
from scipy import stats
import numpy as np
import os
import matplotlib.pyplot as plt  # plt is used in the plotting cells below (the original run may have relied on %pylab inline)
In [585]:
#############################
# initialize or clear data
#############################
data = {}
The data for this analysis was downloaded from the EPA Air Markets Program Data website (http://ampd.epa.gov/ampd/).
To download the data by hand, place the monthly CSV files in the ./data/epa/ directory. Or use the download function below to fetch the data programmatically.
We will focus on three key variables: tons of CO$_2$ emissions, MW of electricity generation, and fraction of running time. All of these quantities are available for every unit of every >25MW fossil fuel-powered plant for every hour of the year for every year since 1995.
In [746]:
######################
# read and parse data
######################
# path from ipynb to data
data_dir = 'data/epa'
def download(yr, st, mo):
"""download data from EPA and put in directory structure"""
# import
import ftplib
import zipfile
# set up
zipfname = '%s%s%s.zip' % (yr, st, mo)
ftpfpath = 'dmdnload/emissions/hourly/monthly'
# os.path.join takes separate string arguments, not a list
ftpfname = os.path.join(ftpfpath, yr, zipfname)
# open ftp client
client = ftplib.FTP('ftp.epa.gov')
client.login()
# if file is on the ftp server...
if ftpfname in client.nlst(ftpfpath):
# download
with open(zipfname, "wb") as f:
print '... downloading %s' % mo
client.retrbinary('RETR %s' % ftpfname , f.write)
# unzip into target directory
zipf = zipfile.ZipFile(zipfname)
zipf.extractall(path=data_dir)
zipf.close()
# clean up
os.remove(zipfname)
# if not, log
else:
print '... skipping %s' % mo
# always close the client
client.close()
def load(yr, st):
"""read year and state from csv into data[yr][st]"""
# add year if it's not there
if not yr in data.keys():
data[yr] = {}
# check if we've already read it
if not st in data[yr].keys():
print "loading %s %s" % (yr, st)
# storage
pieces = []
for mo in ['%02.f' % i for i in range(1,13)]:
# set up path
csvfname = os.path.join(data_dir, '%s%s%s.csv' % (yr, st, mo))
# download file if it doesn't exist
if not os.access(csvfname, os.F_OK):
download(yr, st, mo)
# csv -> dataframe
try:
pieces.append(pd.read_csv(csvfname, usecols=[0,1,3,4,5,6,7,17,18]))
except:
print '... skipping %s' % mo
# combine into one big dataframe
data[yr][st] = pd.concat(pieces)
def get_unit_group(yr, st):
"""group by facility name and unit id"""
# load and group
try:
this_data = data[yr][st].groupby(['FACILITY_NAME','UNITID'])
except KeyError:
load(yr, st)
this_data = data[yr][st].groupby(['FACILITY_NAME','UNITID'])
return this_data
def get_hr_group(yr, st):
"""group by date and hour"""
# load and group
try:
this_data = data[yr][st].groupby(['OP_DATE','OP_HOUR'])
except KeyError:
load(yr, st)
this_data = data[yr][st].groupby(['OP_DATE','OP_HOUR'])
return this_data
def get_emissions(df):
"""lb of CO2 emissions"""
try:
s = df['CO2_MASS (tons)']
except KeyError:
s = df['CO2_MASS']
return s * 2000.0
def get_generation(df):
"""MW of generation"""
try:
s = df['GLOAD (MW)']
except KeyError:
s = df['GLOAD']
return s
def get_runtime(df):
"""fraction of time it was running"""
return df['OP_TIME']
# test
load('2012', 'ca')
assert(list(data['2012']['ca'].keys()) == ['STATE', 'FACILITY_NAME', 'UNITID',
'OP_DATE', 'OP_HOUR', 'OP_TIME',
'GLOAD (MW)', 'CO2_MASS (tons)', 'CO2_MASS_MEASURE_FLG'])
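As a quick sanity check on the accessors above, the annual totals per unit for the California data loaded by the test can be inspected directly (a usage sketch added here, not part of the original analysis):
sums = get_unit_group('2012', 'ca').sum()  # annual sums for each (facility, unit)
print get_emissions(sums)[:5]   # annual lb CO2 for the first few units
print get_generation(sums)[:5]  # annual MWh of generation for the first few units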
In [ ]:
#####################
# load facility data
#####################
#facilities = pd.read_csv('data/epa/EPADownload/facility_01-16-2014_notrailing.csv', header=0,
# usecols=[0,1,2,3,4,11,12,13,15,17,18,21,22,23],
# names=['STATE', 'FACILITY_NAME', 'FAC_ID', 'UNITID', 'Year', 'EPA Region Code',
# 'NERC Region', 'County', 'Source Category', 'Latitude', 'Longitude',
# 'Unit Type', 'Primary Fuel', 'Secondary Fuel'])
#plt.scatter(facilities[' Facility Longitude'].where(facilities[' NERC Region']=='MAPP'),
# facilities[' Facility Latitude'].where(facilities[' NERC Region']=='MAPP'))
#mapp = facilities[facilities[' NERC Region']=='MAPP'].groupby(['State', ' Year', ' Facility ID (ORISPL)', ' Unit ID'])
#mapp_facilities = facilities[facilities['Year']==2006][facilities['NERC Region']=='MAPP']
#mapp_states = [s.lower() for s in mapp_facilities['STATE'].unique()]
#print mapp_states
#pieces = []
#for st in mapp_states:
# load('2006', st)
# pieces.append(pd.merge(mapp_facilities, data['2006'][st]))
#merges = pd.concat(pieces)
#print merged.keys()
#print merged.head()
In [743]:
###################
# set up regions
###################
# these are approximations of geographic regions associated with the electricity markets
regions = {
'isone': ['ct', 'me', 'ma', 'vt', 'nh', 'ri'],
'ercot': ['tx'],
'bpa': ['wa', 'or', 'id'],
'pjm': ['pa', 'nj', 'dc', 'md', 'va', 'de', 'ky', 'wv', 'oh'],
'miso': ['ia', 'mo', 'il', 'mi', 'wi', 'nd', 'sd', 'ne', 'mn', 'in'],
'spp': ['ks', 'ok', 'ms', 'la', 'ar', 'mo'],
'caiso': ['ca'],
}
For every hour $h$ and every unit $u$, we can compute the carbon emissions factor $EF$ as the ratio of the carbon emissions $E$ to the power generation $G$: $EF(u,h) = \frac{E(u,h)}{G(u,h)}$.
In practice, we aggregate the emissions factor in two ways (a sketch of the hourly regional aggregation appears after the next code cell):
annual_ef_by_unit gives $EF$ data with low (annual) temporal resolution and high (unit-level) spatial resolution, i.e., for year $Y$ and unit $u$, $\frac{\sum_{h \in Y} E(u,h)}{\sum_{h \in Y} G(u,h)}$.
hourly_ef_by_region gives $EF$ data with high (hourly) temporal resolution and low (regional) spatial resolution, i.e., for region $R$ and hour $h$, $\frac{\sum_{u \in R} E(u,h)}{\sum_{u \in R} G(u,h)}$.
In [1147]:
def annual_ef_by_unit(yr, region):
"""lb of CO2 emissions per MW fossil generation at each unit"""
pieces = []
for st in region:
# sums by unit
grouped = get_unit_group(yr,st)
sums = grouped.sum()
# compute
total_em = get_emissions(sums)#.fillna(0)
total_gen = get_generation(sums)#.fillna(0)
ratio = total_em / total_gen
# store
pieces.append(ratio)
return pd.concat(pieces).replace(np.inf, np.nan).fillna(0)
def total_gen_by_hour(yr, region):
"""total MW fossil generation over all units in an hour"""
gen = None
for st in region:
this_gen = get_generation(get_hr_group(yr,st).sum()).fillna(0)
if gen is None:
gen = this_gen
else:
gen += this_gen
return gen.fillna(0)
def total_em_by_hour(yr, region):
"""total lb CO2 over all units in an hour"""
em = None
for st in region:
this_em = get_emissions(get_hr_group(yr,st).sum()).fillna(0)
if em is None:
em = this_em
else:
em += this_em
return em.fillna(0)
# test: EF cannot be NaN or inf
test_ef = annual_ef_by_unit('2012', regions['pjm'])
assert(not np.any(np.isnan(test_ef)))
assert(not np.any(np.isinf(test_ef)))
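The prose above also refers to hourly_ef_by_region, which is not defined in this cell (only total_gen_by_hour and total_em_by_hour are). A minimal sketch of that helper, assuming it simply divides the two hourly regional totals and handles zero-generation hours the same way annual_ef_by_unit does:
def hourly_ef_by_region(yr, region):
    """lb of CO2 emissions per MWh of fossil generation in the region, for each hour"""
    em = total_em_by_hour(yr, region)    # lb CO2 summed over all units, by hour
    gen = total_gen_by_hour(yr, region)  # MWh summed over all units, by hour
    return (em / gen).replace(np.inf, np.nan).fillna(0)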
The capacity factor $CF$ is the fraction of the possible power generation that a plant actually generates. Here, we approximate it as the fraction of time a plant is on. The annual capacity factor is accessible via two functions:
annual_cf_by_unit gives $CF$ data with low (annual) temporal resolution and high (unit-level) spatial resolution.
annual_frac_off_by_unit is simply $1-CF$, a measure of how "peaker-y" a unit is.
In [615]:
def annual_cf_by_unit(yr, region):
"""fraction of time per year each unit is on"""
pieces = []
for st in region:
# group by unit
grouped = get_unit_group(yr,st)
# average fraction of each hour the unit is running, i.e., sum(runtime) / count(runtime)
cf = get_runtime(grouped).mean()
pieces.append(cf)
return pd.concat(pieces)
def annual_frac_off_by_unit(yr, region):
"""fraction of time per year each unit is off"""
return 1.0 - annual_cf_by_unit(yr, region)
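As a usage example (added here, not in the original notebook), the helpers above can be used to count the most "peaker-y" units in a region; the 0.95 cutoff is illustrative and matches the cf_threshold used in the plotting cell below:
fracs_off = annual_frac_off_by_unit('2012', regions['pjm'])
peakers = fracs_off[fracs_off > 0.95]
print '%d of %d PJM units were off more than 95%% of the time in 2012' % (len(peakers), len(fracs_off))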
To test the underlying assumption that peakers are dirty, we plot carbon emissions per MWh generated by every unit vs the fraction of the time it was off, for various regions in 2012. The left panel of each figure shows the full data set for each region, while the right panel zooms in on the peaker units. The colored lines are guides to the eye indicating percentiles of $EF$ (calculated for the full region). Note that many units have zero $EF$ but a fraction off below 1--these results should be taken as preliminary until this apparent discrepancy is resolved.
By eye, ISONE, PJM, and SPP have some peakers that are clearly dirtier than baseline units. The story is less clear in other regions.
In [1146]:
# set up
keys_to_plot = [('2012', reg) for reg in regions.keys()]
# loop over keys
for ik, (yr, reg) in enumerate(keys_to_plot):
# compute
fracs = annual_frac_off_by_unit(yr, regions[reg])
ems = annual_ef_by_unit(yr, regions[reg])
# set up
plt.figure()
labels = ['all', 'peaker']
cf_threshold = 0.95
ef_iles = [(q, np.percentile(ems, q)) for q in [50, 75, 95]]
for ip in range(2):
# plot
plt.subplot(1,2,ip+1)
plt.scatter(fracs, ems, c='k', s=3, label='units')
for q, val in ef_iles:
plt.plot([val for i in range(2)], label='%d%%ile' % q)
# apply zoom
if ip == 0:
plt.xlim([0, 1])
plt.legend(loc=2)
else:
plt.xlim([cf_threshold, 1])
# make legible
plt.xlabel('fraction of time unit is off')
plt.ylabel('EF [lb CO2 / MWh]')
# plt.ylim([0, 8e3])
plt.title(' '.join([yr, reg.upper(), labels[ip]]))
plt.tight_layout()
plt.savefig('figures/'+'_'.join([yr, reg.upper(), 'peaker'])+'.pdf')
We estimated the marginal emissions factor (MEF) as a function of total fossil generation by first binning by $G$, then regressing the hourly $\Delta E$ against the hourly $\Delta G$ within each bin (similar to Fig. 3 of Siler-Evans et al.). If the MEF for a generation range (red) is greater than the global-fit reference MEF (black), then hours with those values of generation are predicted to have higher emissions (i.e., to be dirtier) than a generic hour. This is the case at high $G$ for ISONE and CAISO. On the other hand, ERCOT, SPP, and MISO are all cleaner at peak hours, while PJM and BPA show interesting non-monotonicity. (Data points are centered within 5-percentile bins. Standard deviations were obtained by bootstrapping and refitting within each bin.)
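Concretely, for each generation bin $b$ the estimate is the ordinary-least-squares slope of the hour-to-hour differences, $\Delta E_h \approx \alpha_b + \widehat{\mathrm{MEF}}_b \, \Delta G_h$, fit over the hours $h$ whose total generation $G_h$ falls in bin $b$; the reference MEF (black) is the same slope fit to all hours at once.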
In [1155]:
def delta_hr(df):
"""difference in value between this hour and the last; first value is NaN"""
return pd.rolling_apply(df, 2, lambda x: x[1]-x[0])
# set up
keys_to_plot = [('2012', reg) for reg in regions.keys()]
# loop over keys
for ik, (yr, reg) in enumerate(keys_to_plot):
# compute
gen = total_gen_by_hour(yr, regions[reg])
dgen = delta_hr(gen)
dem = delta_hr(total_em_by_hour(yr, regions[reg]))
# set up generation bins
n_bin_centers = 20
# bins = np.linspace(np.min(gen), np.max(gen), n_bin_centers+1)
bins = np.percentile(gen, [i * 100.0/n_bin_centers for i in range(n_bin_centers+1)])
bin_centers = bins[:-1] + np.diff(bins)/2.0
# fit to get slopes within generation bins
slope_means = np.zeros(n_bin_centers)
slope_sds = np.zeros(n_bin_centers)
for ibin in range(n_bin_centers):
# create mask
dgen_masked = []
dem_masked = []
for ig, g in enumerate(gen.values):
# first hour's value of the deltas is NaN, so skip
if ig==0:
continue
else:
if g > bins[ibin] and g <= bins[ibin+1]:
dgen_masked.append(dgen.iloc[ig])
dem_masked.append(dem.iloc[ig])
# finish mask
n_vals = len(dgen_masked)
dgen_masked_arr = np.array(dgen_masked)
dem_masked_arr = np.array(dem_masked)
# bootstrap in bin
n_samples = 1000
sampled_slopes = np.zeros(n_samples)
for isample in range(n_samples):
# resample indices
sample_inds = np.random.randint(0, n_vals, n_vals)
# fit
lr = stats.linregress(dgen_masked_arr[sample_inds], dem_masked_arr[sample_inds])
# store
sampled_slopes[isample] = lr[0]
# summarize
slope_sds[ibin] = np.std(sampled_slopes)
slope_means[ibin] = np.mean(sampled_slopes)
# bootstrap overall
ref_slopes = np.zeros(1000)
for i in range(len(ref_slopes)):
inds = np.random.choice(range(dgen.size), size=dgen.size)
lr = stats.linregress(dgen[inds].dropna(), dem[inds].dropna())
ref_slopes[i] = lr[0]
ref_mean = np.mean(ref_slopes)
ref_sd = np.std(ref_slopes)
# plot
plt.figure()
plt.plot(bin_centers, [ref_mean for i in bin_centers], '-', label='ref mean', c='k')
plt.plot(bin_centers, [ref_mean + ref_sd for i in bin_centers], '--', label='ref SD', c='k')
plt.plot(bin_centers, [ref_mean - ref_sd for i in bin_centers], '--', label=None, c='k')
plt.plot(bin_centers, slope_means, 'o-', label='data mean', c='r')
plt.plot(bin_centers, slope_means + slope_sds, '--', label='data SD', c='r')
plt.plot(bin_centers, slope_means - slope_sds, '--', label=None, c='r')
plt.xlabel('$G$ [MW]')
plt.ylabel('MEF [lb/MWh]')
plt.legend(loc=3)
plt.title(' '.join([yr, reg.upper(),
'(ref %d lb/MWh)' % ref_mean]))
plt.savefig('figures/'+'_'.join([yr, reg.upper(), 'intensity'])+'.pdf')
Consider a simple critical peak shaving scenario where an hour in the 99th percentile of generation is replaced with an hour in a lower percentile. The marginal emissions benefit is then the difference between the predicted emissions during the "shaved" hour and the actual emissions during the 99th-percentile hour ($\Delta E$) divided by the difference in generation between those hours ($\Delta G$). The larger this ratio, the larger the carbon savings of implementing such a peak-shaving procedure. By this measure, ISONE and PJM have the best estimated mean marginal carbon savings, although all estimated means are positive (i.e., peak shaving is pretty good everywhere). Larger generation reductions (i.e., larger $-\Delta G$) have surprisingly little effect on the estimated mean marginal carbon savings, but they do increase the confidence in the estimate. [Data points (solid red) are centered within 1-percentile bins. Standard deviations (dashed) were obtained by resampling within each bin.]
This should be compared to the reference estimated mean carbon savings due to exchanging random hours. Specifically, I considered exchanging random hours that are within 10 percentiles of each other and above the 50th percentile in $G$; the estimate of the mean MEF for this quantity is shown as the black line. Only ISONE and CAISO have marginal carbon savings due to peak shaving that are better than random.
The bottom panel of each plot shows the probability that reducing generation leads to increased emissions (i.e., $\Delta G < 0$ while $\Delta E > 0$), written out below. Only BPA has a large risk of peak shaving being carbon-costly.
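In symbols: for a peak hour $p$ exchanged with a lower-generation hour $c$, the plotted quantity is $\mathrm{MEF} = \Delta E / \Delta G = \frac{E(c) - E(p)}{G(c) - G(p)}$. Since $\Delta G < 0$ for a shaved hour, a positive MEF means the exchange reduces emissions; the probability in the bottom panel is the fraction of resampled pairs with $\Delta E > 0$ despite $\Delta G < 0$.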
In [1210]:
a = np.linspace(0, 1, num=10).reshape([2,5])
print a.mean(axis=0)
print a.shape
In [1215]:
def hourly_diffs_to_ile(ile_ref, ile_cmp, gen, em, n_samples):
""" Get estimates of $\Delta E$, $\Delta G$, and $\Delta E$/$\Delta G$ for each hour in ile_ref
due to exchanging with a random hour in ile_cmp.
"""
# create masks to select indices in each percentile bin
mask_ref = (gen >= np.percentile(gen, ile_ref)) & (gen < np.percentile(gen, ile_ref+1))
inds_ref = np.where(mask_ref)[0]
mask_cmp = (gen >= np.percentile(gen, ile_cmp)) & (gen < np.percentile(gen, ile_cmp+1))
inds_cmp = np.where(mask_cmp)[0]
n_ref = inds_ref.size
# set up storage for bootstrap
dems = np.zeros([n_samples, n_ref])
dgens = np.zeros([n_samples, n_ref])
mefs = np.zeros([n_samples, n_ref])
# bootstrap
for isample in range(n_samples):
# resample indices in ref and cmp bins
sample_inds_ref = np.random.choice(inds_ref, n_ref)
sample_inds_cmp = np.random.choice(inds_cmp, n_ref)
# compute
sample_dgens = gen[sample_inds_cmp].values - gen[sample_inds_ref].values
sample_dems = em[sample_inds_cmp].values - em[sample_inds_ref].values
# store
mefs[isample] = sample_dems / sample_dgens
dems[isample] = sample_dems
dgens[isample] = sample_dgens
# return mean for each hour in ile_ref
# return dems.mean(axis=0), dgens.mean(axis=0), mefs.mean(axis=0)
return dems, dgens, mefs
# set up
keys_to_plot = [('2012', reg) for reg in regions.keys()]
peak_ile = 99
min_replace_ile = 90
max_replace_ile = 98
# loop over keys
for ik, (yr, reg) in enumerate(keys_to_plot):
# compute
gen = total_gen_by_hour(yr, regions[reg])
em = total_em_by_hour(yr, regions[reg])
# set up peak mask
peak_thresh = np.percentile(gen, peak_ile)
peak_inds = gen >= peak_thresh
n_peak = np.count_nonzero(peak_inds)
n_samples = 500
# compute reference
ref_mefs = []
for i in range(500):
ile_ref = np.random.randint(50, 99)
ile_cmp = np.random.randint(ile_ref-10, ile_ref)
try:
# estimate differences for each hour in ile_ref
dems, dgens, mefs = hourly_diffs_to_ile(ile_ref, ile_cmp, gen, em, n_samples)
# we're sampling the mean of the differences
ref_mefs.append(np.mean(mefs))
except ValueError as e:
print ile_ref, ile_cmp, e
continue
ref_mefs_arr = np.array(ref_mefs) #np.concatenate(ref_mefs)
mean_ref_mef = np.mean(ref_mefs_arr)
lo_ref_mef = np.percentile(ref_mefs_arr, 2.5) #mean_ref_mef - np.std(ref_mefs) #
hi_ref_mef = np.percentile(ref_mefs_arr, 97.5) #mean_ref_mef + np.std(ref_mefs) #
# set up bins
bin_iles = range(min_replace_ile, max_replace_ile+2)
bins = np.percentile(gen, bin_iles)
n_bin_centers = len(bins)-1
# set up data storage
mean_mef = np.zeros(n_bin_centers)
lo_mef = np.zeros(n_bin_centers)
hi_mef = np.zeros(n_bin_centers)
mean_dgen = np.zeros(n_bin_centers)
prob_harmful = np.zeros(n_bin_centers)
# get data
for ibin in range(n_bin_centers):
dems, dgens, mefs = hourly_diffs_to_ile(peak_ile, bin_iles[ibin], gen, em, n_samples)
# summarize
mean_dgen[ibin] = np.mean(dgens)
mean_mef[ibin] = np.mean(mefs)
lo_mef[ibin] = mean_mef[ibin] - np.std(mefs) #np.percentile(mefs, 2.5)
hi_mef[ibin] = mean_mef[ibin] + np.std(mefs) #np.percentile(mefs, 97.5)
prob_harmful[ibin] = np.count_nonzero(dems > 0) / float(dems.size)
# set up plot
plt.figure()
plt.subplot(2,1,1)
# plot reference
ref_xs = [np.min(-mean_dgen), np.max(-mean_dgen)]
plt.plot(ref_xs, [mean_ref_mef for i in ref_xs], '-', label='ref mean', c='k')
# plt.plot(ref_xs, [lo_ref_mef for i in ref_xs], '--', label='ref CI', c='k')
# plt.plot(ref_xs, [hi_ref_mef for i in ref_xs], '--', label=None, c='k')
# plot data
plt.plot(-mean_dgen, mean_mef, 'o-', label='data mean', c='r')
plt.plot(-mean_dgen, lo_mef, '--', label='data SD', c='r')
plt.plot(-mean_dgen, hi_mef, '--', label=None, c='r')
# decorate
plt.xlabel('-$\Delta G$ [MWh/h]')
plt.ylabel('MEF [lb/MWh]')
plt.title(' '.join([yr, reg.upper(),
'(ref %.f lb/MWh)' % mean_ref_mef
]))
# plot prob harmful
plt.subplot(2,1,2)
plt.plot(-mean_dgen, prob_harmful, 'o-', label='mean')
plt.xlabel('-$\Delta G$ [MWh/h]')
plt.ylabel('prob $\Delta E$>0')
plt.ylim([0,0.18])
# save
plt.savefig('figures/'+'_'.join([yr, reg.upper(), 'peakhours'])+'.pdf')
Now consider a more complex critical peak shaving scenario where a "critical peak" day (i.e., a day with its peak hour in the 99th percentile of generation) is replaced with a "shaved" day (i.e., a "similar" day with its peak hour in a lower percentile). The marginal emissions benefit is then the difference between the predicted total emissions during the "shaved" day and the actual total emissions during the 99th-percentile-containing day ($\sum_{h=0}^{23}\Delta E(h)$) over the difference in generation between those days ($\sum_{h=0}^{23}\Delta G(h)$). Again, the larger this ratio, the larger the carbon savings.
First, let's check that shaved days with the minimal root-mean-square hourly generation difference to the critical peak day ($\sqrt{\frac{1}{24}\sum_{h=0}^{23}\Delta G(h)^2}$) look sufficiently "similar." Looks pretty good to me! In this example, the shaved days are pulled from days with peaks in the 96th, 97th, and 98th percentiles, x-axis is hour of the day, y-axis is hourly generation normalized to the peak of the critical day, red line is the critical day, and green line is the shaved day.
In [1070]:
def peak_hour_each_day(s):
""" Return a set of (dy, hr) keys for the peak hour of each day.
s should be a Series with keys [dy, hr].
"""
return set([(dy, np.argmax(group)) for dy, group in s.groupby(level=['OP_DATE'])])
def vals_in_ile(s, n_ile_min, n_ile_max=None):
"""Return a set of keys for rows with values in the nth percentile (or in the range if max is given)."""
# default is 1 percentile bin
if n_ile_max is None:
n_ile_max = n_ile_min
# minimum and maximum values
val_min = np.percentile(s, n_ile_min)
val_max = np.percentile(s, n_ile_max+1)
# keys in range
return set(s[(s >= val_min) & (s < val_max)].keys())
def days_with_peak_in_ile(s, n_ile_min, n_ile_max=None):
""" Return a grouped Series of days whose peak is in the percentile range of hours.
s should be a Series with keys [dy, hr].
"""
# intersect sets of keys
keys = vals_in_ile(s, n_ile_min, n_ile_max) & peak_hour_each_day(s)
day_keys = [k[0] for k in keys]
# select
return s.select(lambda k: k[0] in day_keys)
def error_between_groups(g1, g2):
"""Error function is simple RMS for now."""
return np.sqrt(np.mean((g1.values - g2.values)**2))
# test
range_min = 96
range_max = 98
for reg in regions.keys():
gen = total_gen_by_hour('2012', regions[reg])
peak_days = days_with_peak_in_ile(gen, 99).groupby(level='OP_DATE')
shaved_days = days_with_peak_in_ile(gen, range_min, range_max).groupby(level='OP_DATE')
plt.figure()
plt.suptitle(' '.join([yr, reg.upper()]))
for i, (peak_label, peak_group) in enumerate(peak_days):
shaved_label, shaved_group = min(shaved_days, key=lambda x: error_between_groups(x[1], peak_group))
plt.subplot(4,5,i+1)
plt.plot(peak_group.values / np.max(peak_group), c='r')
plt.plot(shaved_group.values / np.max(peak_group), c='g')
plt.ylim([0,1.1])
plt.tight_layout()
plt.savefig('figures/'+'_'.join([yr, reg.upper(), 'matchdays', str(range_min), str(range_max)])+'.pdf')
Now let's check the marginal carbon savings for each day. Because there are fewer data points in each bin, I've shown the data distribution as a scatterplot rather than only summary statistics; the MEF values in the legends are the median estimated carbon savings per MWh of generation reduction for each scenario. As with the simple peak-hour shaving scenario, PJM and ISONE have the best marginal carbon savings over the full range of generation-reduction scenarios. The reference calculation here is still a bit wonky, sorry!
In [1196]:
def stats_for_percentile_pairs(range1, range2, gen, em):
"""Returns arrays of daily $\Delta G$ and daily $\Delta E$/$\Delta G$ for paired days within the percentile ranges."""
# get days to be paired up
days1_gen = days_with_peak_in_ile(gen, *range1).groupby(level='OP_DATE')
days2_gen = days_with_peak_in_ile(gen, *range2).groupby(level='OP_DATE')
# storage
mefs = np.zeros(len(days1_gen))
dgens = np.zeros(len(days1_gen))
for iday, (label1, group1_gen) in enumerate(days1_gen):
# find best match to this peak day
label2, group2_gen = min(days2_gen, key=lambda x: error_between_groups(x[1], group1_gen))
# emissions for each day
group1_em = em.select(lambda k: k[0] == label1)
group2_em = em.select(lambda k: k[0] == label2)
# carbon savings
dgen = (group1_gen.values - group2_gen.values).sum()
dem = (group1_em.values - group2_em.values).sum()
mefs[iday] = dem / dgen
dgens[iday] = dgen
# return
return dgens, mefs
ref_ranges = [(i, i+2) for i in range(0, 98, 3)]
ranges = ref_ranges[-3:]
peak_range = (99, 99)
yr = '2012'
for reg in regions.keys():
# compute
gen = total_gen_by_hour(yr, regions[reg])
em = total_em_by_hour(yr, regions[reg])
# set up plot
plt.figure()
plt.xlabel('-$\Delta G$ [MWh/h]')
plt.ylabel('MEF [lb/MWh]')
plt.title(' '.join([yr, reg.upper()]))
colors = ['g', 'b', 'r']
ref_dgens = []
for irange, shaved_range in enumerate(ranges):
# compute
dgens, mefs = stats_for_percentile_pairs(shaved_range, peak_range, gen, em)
# plot
plt.scatter(-dgens/24, mefs, c=colors[irange],
label='%d-%d%%ile (MEF %d lb/MWh)' % (shaved_range[0], shaved_range[1], np.median(mefs)))
ref_dgens.append(dgens)
ref_dgen_arr = np.concatenate(ref_dgens)
# compute reference
ref_mefs = []
for i in range(500):
# get reference ranges
irange1, irange2 = 0, 0
while irange1 == irange2:
irange1 = np.random.randint(len(ref_ranges))
irange2 = np.random.randint(len(ref_ranges))
# compute
try:
dgens, mefs = stats_for_percentile_pairs(ref_ranges[irange1], ref_ranges[irange2], gen, em)
ref_mefs.append(mefs)
except ValueError: # no days in range
continue
# concatenate and plot
ref_mef_arr = np.concatenate(ref_mefs)
ref_mef = np.mean(ref_mef_arr)
ref_mef_ci_lower = np.percentile(ref_mef_arr, 2.5)
ref_mef_ci_upper = np.percentile(ref_mef_arr, 97.5)
xs = -np.array([np.min(ref_dgen_arr), np.max(ref_dgen_arr)])/24
# plt.plot(xs, [ref_mef for i in xs], '-', c='k', label='ref (%d lb/MW)' % ref_mef)
# plt.plot(xs, [ref_mef_ci_lower for i in xs], '--', c='gray', label='95% CI of ref')
# plt.plot(xs, [ref_mef_ci_upper for i in xs], '--', c='gray', label=None)
plt.legend()
# plt.ylim([-1000, 6000])
plt.savefig('figures/'+'_'.join([yr, reg.upper(), 'matchdays_intensity'])+'.pdf')
We define critical peak days as the days that contain the hours in the 99th percentile of generation. In 2012, this was 11-19 days depending on the region, almost entirely in June, July, and August.
We define the critical peak period as the hour that is the peak hour of the day most often within the set of critical peak days, and the two hours on either side of that hour (i.e., the critical peak period is 5 hours long). In 2012, this was 1pm-5pm in all regions.
In [957]:
def critical_peak_day_labels(s, n_ile=99):
""" Return the labels for the days with the nth percentile of hours (default n=99).
s should be a Series with keys [OP_DATE, OP_HOUR].
Result is sorted by the daily generation peak, descending.
"""
# find highest hours
ile_val = np.percentile(s, n_ile)
top_hours = s[s > ile_val]
# get day labels for those hours
top_days = np.array([dy for dy, hr in top_hours.order(ascending=False).keys()])
# order unique labels by top hour
uniq_days_sort_by_time, uniq_inds_sort_by_time = np.unique(top_days, return_index=True)
uniq_days_sort_by_val = top_days[np.sort(uniq_inds_sort_by_time)]
# return
return uniq_days_sort_by_val
def select_days(s, day_labels, inverse=False):
""" Return a selection of the Series s with the given labels.
s should be a Series with keys [OP_DATE, OP_HOUR].
If inverse=True, return the inverse of the selection.
"""
if inverse:
return s.select(lambda k: k[0] not in day_labels)
else:
return s.select(lambda k: k[0] in day_labels)
def critical_peak_hour_labels(s):
""" Return the label for the hour that is the peak hour of the day most often.
s should be a Series with keys [OP_DATE, OP_HOUR].
"""
peak_hour_each_day = s.groupby(level='OP_DATE').aggregate(np.argmax)
mode = stats.mode(peak_hour_each_day)
hr_labels = ['%02.f' % (mode[0][0] + i) for i in [-2, -1, 0, 1, 2]]
return hr_labels
# test
for reg in regions.keys():
print '%s:' % reg.upper()
gen_hourly = total_gen_by_hour('2012', regions[reg])
cpd_labels = critical_peak_day_labels(gen_hourly)
print'\tn peak days : %d' % len(cpd_labels)
print '\tdays : %s' % cpd_labels
gen_cpd = select_days(gen_hourly, cpd_labels)
cph_labels = critical_peak_hour_labels(gen_cpd)
print '\tpeak hours : %s' % cph_labels
In [846]:
########################
# critical peak pricing
########################
# set up
region = regions['isone']
yr = '2012'
# get total generation and emissions for every hour
gen_hourly = total_gen_by_hour(yr, region)
em_hourly = total_em_by_hour(yr, region)
# sort by highest hour
days_by_peak = []
unused_days = set(day for day, hr in gen_hourly.keys())
for day, hr in gen_hourly.order(ascending=False).keys():
try:
unused_days.remove(day)
days_by_peak.append(day)
except KeyError:
pass
# plot ts for each day
n_peak_days = 12
if False:
for day in sorted(days_by_peak[:4]):
plt.plot(gen_hourly[day], label=day)
plt.xlabel('hour of day')
plt.ylabel('MWh/h')
plt.title('%s %s' % (yr, region))
#plt.legend()
# sanity check for peak ordering
#timeorder_days = sorted(days_by_peak)
#plt.plot([366-days_by_peak.index(dl) for dl in timeorder_days])
#plt.xlabel('day in year')
#plt.ylabel('generation peak rank')
# total gen and em for each day for on- and off-peak
gen_daily = gen_hourly.groupby(level='OP_DATE').sum()
em_daily = em_hourly.groupby(level='OP_DATE').sum()
gen_daily_peak = gen_daily.select(lambda x: x in days_by_peak[:n_peak_days])
em_daily_peak = em_daily.select(lambda x: x in days_by_peak[:n_peak_days])
gen_daily_offpeak = gen_daily.select(lambda x: x not in days_by_peak[:n_peak_days])
em_daily_offpeak = em_daily.select(lambda x: x not in days_by_peak[:n_peak_days])
# slopes for peak vs overall
if False:
lr_peak = stats.linregress(gen_daily_peak, em_daily_peak)
lr_offpeak = stats.linregress(gen_daily_offpeak, em_daily_offpeak)
print 'daily lb CO2/daily MW: peak %0.2f, offpeak %0.2f' % (lr_peak[0], lr_offpeak[0])
plt.scatter(gen_daily_peak, em_daily_peak, label='peak days', c='r')
plt.plot(gen_daily_peak, lr_peak[0] * gen_daily_peak + lr_peak[1], label='peak fit', c='r')
plt.scatter(gen_daily_offpeak, em_daily_offpeak, label='offpeak days', c='b')
plt.plot(gen_daily_offpeak, lr_offpeak[0] * gen_daily_offpeak + lr_offpeak[1], label='offpeak fit', c='b')
plt.xlabel('daily MW')
plt.ylabel('daily lb CO2')
plt.title('%s %s' % (yr, region))
plt.legend(loc=2)
# highest gen hour of day
peak_hr_label = gen_hourly.select(lambda x: x[0] in days_by_peak[:n_peak_days]).groupby(level='OP_HOUR').median().argmax()
peak_hr_label_list = [peak_hr_label+i for i in [-2, -1, 0, 1, 2]]
# similarity measure for offpeak times of day for peak days
gen_hourly_peak = gen_hourly.select(lambda x: x[1] in peak_hr_label_list)
gen_hourly_offpeak = gen_hourly.select(lambda x: x[1] not in peak_hr_label_list)
def mean_diff(dy1, dy2, flag):
"""mean of day 1 - day 2 (asymmetric)"""
if flag=='peak':
s = gen_hourly_peak
else:
s = gen_hourly_offpeak
diff = s.select(lambda x: x[0]==dy1).values - s.select(lambda x: x[0]==dy2).values
return np.mean(diff)
def rms_diff(dy1, dy2, flag):
"""rms of day 1 - day 2 (symmetric)"""
if flag=='peak':
s = gen_hourly_peak
else:
s = gen_hourly_offpeak
diff = s.select(lambda x: x[0]==dy1).values - s.select(lambda x: x[0]==dy2).values
return np.sqrt(np.mean(diff**2))
def order_param(dy1, dy2):
"""maximize diff in peak while minimizing diff in offpeak"""
peak_mean = mean_diff(dy1, dy2, 'peak')
offpeak_rms = rms_diff(dy1, dy2, 'offpeak')
return peak_mean - offpeak_rms*2
# candidate match days in time order (needed below; otherwise only defined in the commented-out sanity check above)
timeorder_days = sorted(days_by_peak)
best_match_days = []
for idl, ref_dl in enumerate(days_by_peak[:n_peak_days]):
# get match val between ref day and every other day
vals = [order_param(ref_dl, dl) for dl in timeorder_days]
try:
# take best match
best_val = sorted(vals, reverse=True)[0]
# get the label of match day
best_match_day = timeorder_days[vals.index(best_val)]
assert(best_match_day != ref_dl)
except AssertionError: # if best day was today
# take second best match
best_val = sorted(vals, reverse=True)[1]
# get the label of match day
best_match_day = timeorder_days[vals.index(best_val)]
assert(best_match_day != ref_dl)
# save
best_match_days.append(best_match_day)
if False:
# plot ref and match days to compare by eye
plt.figure()
for idl in range(n_peak_days):
plt.subplot(3,4,idl+1)
plt.plot(gen_hourly.select(lambda x: x[0]==days_by_peak[idl]))
plt.plot(gen_hourly.select(lambda x: x[0]==best_match_days[idl]))
plt.title('%2.1f' % best_val)
plt.tight_layout()
In [847]:
# compare excess emissions per excess MW on critical and matched non-critical days
if True:
vals = []
for idl in range(n_peak_days):
em_excess = em_hourly.select(lambda x: x[0] == days_by_peak[idl]).values - em_hourly.select(lambda x: x[0] == best_match_days[idl]).values
gen_excess = gen_hourly.select(lambda x: x[0] == days_by_peak[idl]).values - gen_hourly.select(lambda x: x[0] == best_match_days[idl]).values
vals.append(em_excess.sum() / gen_excess.sum())
plt.hist(vals)
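# quick numeric summary of the distribution plotted above (added sketch; vals is the list built just above)
print 'median excess emissions per excess generation: %.0f lb CO2/MWh' % np.median(vals)
print 'mean excess emissions per excess generation: %.0f lb CO2/MWh' % np.mean(vals)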
In [ ]: