In [5]:
from epa_client import *
###################
# set up regions
###################
# these are approximations of geographic regions associated with the electricity markets
regions = {
'isone': ['ct', 'me', 'ma', 'vt', 'nh', 'ri'],
'ercot': ['tx'],
'bpa': ['wa', 'or', 'id'],
'pjm': ['pa', 'nj', 'dc', 'md', 'va', 'de', 'ky', 'wv', 'oh'],
'miso': ['ia', 'mo', 'il', 'mi', 'wi', 'nd', 'sd', 'ne', 'mn', 'in'],
'spp': ['ks', 'ok', 'ms', 'la', 'ar', 'mo'],
'caliso': ['ca'],
'nyiso': ['ny'],
}
In [112]:
ferc_data = {}
enduser_data = {}
def ferc_load(reg):
"""read FERC region from csv into ferc_data[reg]"""
# add year if it's not there
if not reg in ferc_data.keys():
ferc_data[reg] = {}
# check if we've already read it
if not reg in ferc_data[reg].keys():
print "loading %s" % reg
# set up path
csvfname = os.path.join('data/ferc_714', '%s_data.csv' % reg.upper())
# csv -> dataframe
df = pd.read_csv(csvfname, usecols=[4]+range(6,30), index_col=0, parse_dates=True)
df.columns = range(24)
stacked = df.stack()
stacked.index.names = ['OP_DATE', 'OP_HOUR']
ferc_data[reg] = stacked
def enduser_load(reg):
"""read end-user price data from csv into enduser_data[reg]"""
# add year if it's not there
if not reg in enduser_data.keys():
enduser_data[reg] = {}
# check if we've already read it
if not reg in enduser_data[reg].keys():
print "loading %s" % reg
# set up path
csvfname = os.path.join('data/end_user', '%s_data.csv' % reg.upper())
# csv -> dataframe
df = pd.read_csv(csvfname, parse_dates=True, index_col=[0,1], header=0)
df.columns = ['Flat ($)', 'Day/night ($)', 'TOU ($)']
df.index.names = ['OP_DATE', 'OP_HOUR']
enduser_data[reg] = df
In [113]:
# load FERC data
ferc_load('nyiso')
print ferc_data['nyiso'].head()
print ferc_data['nyiso'].tail()
print ferc_data['nyiso'].describe()
# load end user price
enduser_load('nyseg')
print enduser_data['nyseg'].head()
print enduser_data['nyseg'].tail()
print enduser_data['nyseg'].describe()
In [8]:
# load EPA data
load('2011','ny')
load('2012','ny')
print data['2012']['ny'].head()
In [114]:
# join FERC data to EPA data
gen = pd.concat([total_gen_by_hour(yr,regions['nyiso']) for yr in ['2010', '2011', '2012']])
em = pd.concat([total_em_by_hour(yr,regions['nyiso']) for yr in ['2010', '2011', '2012']])
aoer = em / gen
def delta_hr(df):
"""difference in value between this hour and the last; first value is NaN"""
return pd.rolling_apply(df, 2, lambda x: x[1]-x[0])
def calc_moer(em, gen):
dem = delta_hr(em)
dgen = delta_hr(gen)
moer = dem / dgen
moer[dgen < 500] = np.nan
return moer
moer = calc_moer(em, gen)
to_concat = [gen, em, aoer, moer, ferc_data['nyiso']] + [enduser_data['nyseg'][s] for s in enduser_data['nyseg']]
nydf = pd.concat(to_concat, axis=1)
nydf.columns = ['GLOAD (MW)', 'CO2_MASS (lb)', 'AOER (lb/MW)', 'MOER (lb/MW)', 'Lambda ($/MW)'] + [s for s in enduser_data['nyseg']]
print nydf.describe()
In [115]:
print 'pearson'
print nydf.corr()
print 'spearman'
print nydf.corr(method='spearman')
In [72]:
# correlation between price and emissions
xname = 'Lambda ($/MW)'
for yname in ['AOER (lb/MW)', 'MOER (lb/MW)']:
# basic plot
plt.figure()
plt.scatter(nydf[xname], nydf[yname], s=3, alpha=0.5, label='data')
plt.xlabel(xname)
plt.ylabel(yname)
# fit to valid data
valid_mask = nydf[yname].notnull() & (nydf[xname] < 200)
pf = np.polyfit(nydf[valid_mask][xname], nydf[valid_mask][yname], 2)
lr = stats.linregress(nydf[valid_mask][xname], nydf[valid_mask][yname])
xs = np.linspace(np.percentile(nydf[xname], 0.5), np.percentile(nydf[xname], 99.5), 100)
plt.plot(xs, pf[0]*xs**2 + pf[1]*xs + pf[2], c='r', label='quadratic')#, label='r=%.3f' % lr[2])
plt.plot(xs, lr[0]*xs + lr[1], c='g', label='linear r=%.3f' % lr[2])
plt.legend()