In [5]:
from epa_client import *

###################
# set up regions
###################

# these are approximations of geographic regions associated with the electricity markets
regions = {
    'isone': ['ct', 'me', 'ma', 'vt', 'nh', 'ri'],
    'ercot': ['tx'],
    'bpa': ['wa', 'or', 'id'],
    'pjm': ['pa', 'nj', 'dc', 'md', 'va', 'de', 'ky', 'wv', 'oh'],
    'miso': ['ia', 'mo', 'il', 'mi', 'wi', 'nd', 'sd', 'ne', 'mn', 'in'],
    'spp': ['ks', 'ok', 'ms', 'la', 'ar', 'mo'],
    'caliso': ['ca'],
    'nyiso': ['ny'],
}

In [112]:
ferc_data = {}
enduser_data = {}

def ferc_load(reg):
    """read FERC region from csv into ferc_data[reg]"""
    # add year if it's not there
    if not reg in ferc_data.keys():
        ferc_data[reg] = {}
        
    # check if we've already read it
    if not reg in ferc_data[reg].keys():
        print "loading %s" % reg

        # set up path
        csvfname = os.path.join('data/ferc_714', '%s_data.csv' % reg.upper())
        
        # csv -> dataframe
        df = pd.read_csv(csvfname, usecols=[4]+range(6,30), index_col=0, parse_dates=True)
        df.columns = range(24)
        stacked = df.stack()
        stacked.index.names = ['OP_DATE', 'OP_HOUR']
        ferc_data[reg] = stacked
            
def enduser_load(reg):
    """read end-user price data from csv into enduser_data[reg]"""
    # add year if it's not there
    if not reg in enduser_data.keys():
        enduser_data[reg] = {}
        
    # check if we've already read it
    if not reg in enduser_data[reg].keys():
        print "loading %s" % reg

        # set up path
        csvfname = os.path.join('data/end_user', '%s_data.csv' % reg.upper())
        
        # csv -> dataframe
        df = pd.read_csv(csvfname, parse_dates=True, index_col=[0,1], header=0)
        df.columns = ['Flat ($)', 'Day/night ($)', 'TOU ($)']
        df.index.names = ['OP_DATE', 'OP_HOUR']
        enduser_data[reg] = df

In [113]:
# load FERC data
ferc_load('nyiso')
print ferc_data['nyiso'].head()
print ferc_data['nyiso'].tail()
print ferc_data['nyiso'].describe()

# load end user price
enduser_load('nyseg')
print enduser_data['nyseg'].head()
print enduser_data['nyseg'].tail()
print enduser_data['nyseg'].describe()


loading nyiso
OP_DATE     OP_HOUR
2010-01-01  0          50.49
            1           0.00
            2          21.11
            3          64.89
            4           3.83
dtype: float64
OP_DATE     OP_HOUR
2012-09-09  19         29.86
            20         28.85
            21         28.39
            22         26.97
            23         28.00
dtype: float64
count    26304.000000
mean        37.670601
std         26.869958
min        -97.340000
25%         27.900000
50%         34.400000
75%         41.630000
max       1148.250000
dtype: float64
loading nyseg
                    Flat ($)  Day/night ($)  TOU ($)
OP_DATE    OP_HOUR                                  
2010-01-01 0             127            101      103
           1             127            101      103
           2             127            101      103
           3             127            101      103
           4             127            101      103
                    Flat ($)  Day/night ($)  TOU ($)
OP_DATE    OP_HOUR                                  
2012-12-31 19            127            101      123
           20            127            101      123
           21            127            101      123
           22            127            101      123
           23            127            101      103
       Flat ($)  Day/night ($)       TOU ($)
count     26304   26304.000000  26304.000000
mean        127     107.708333    113.907847
std           0      10.454381     20.202940
min         127     101.000000    103.000000
25%         127     101.000000    103.000000
50%         127     101.000000    103.000000
75%         127     124.000000    123.000000
max         127     124.000000    236.000000

In [8]:
# load EPA data
load('2011','ny')
load('2012','ny')
print data['2012']['ny'].head()


loading 2011 ny
loading 2012 ny
  STATE      FACILITY_NAME UNITID             OP_DATE  OP_HOUR  OP_TIME  \
0    NY  Dynegy Danskammer      1 2012-01-01 00:00:00        0        0   
1    NY  Dynegy Danskammer      1 2012-01-01 00:00:00        1        0   
2    NY  Dynegy Danskammer      1 2012-01-01 00:00:00        2        0   
3    NY  Dynegy Danskammer      1 2012-01-01 00:00:00        3        0   
4    NY  Dynegy Danskammer      1 2012-01-01 00:00:00        4        0   

   GLOAD (MW)  CO2_MASS (tons) CO2_MASS_MEASURE_FLG  
0         NaN              NaN                  NaN  
1         NaN              NaN                  NaN  
2         NaN              NaN                  NaN  
3         NaN              NaN                  NaN  
4         NaN              NaN                  NaN  

In [114]:
# join FERC data to EPA data
gen = pd.concat([total_gen_by_hour(yr,regions['nyiso']) for yr in ['2010', '2011', '2012']])
em = pd.concat([total_em_by_hour(yr,regions['nyiso']) for yr in ['2010', '2011', '2012']])
aoer = em / gen

def delta_hr(df):
    """difference in value between this hour and the last; first value is NaN"""
    return pd.rolling_apply(df, 2, lambda x: x[1]-x[0])
def calc_moer(em, gen):
    dem = delta_hr(em)
    dgen = delta_hr(gen)
    moer = dem / dgen
    moer[dgen < 500] = np.nan
    return moer
moer = calc_moer(em, gen)

to_concat = [gen, em, aoer, moer, ferc_data['nyiso']] + [enduser_data['nyseg'][s] for s in enduser_data['nyseg']]
nydf = pd.concat(to_concat, axis=1)
nydf.columns = ['GLOAD (MW)', 'CO2_MASS (lb)', 'AOER (lb/MW)', 'MOER (lb/MW)', 'Lambda ($/MW)'] + [s for s in enduser_data['nyseg']]
print nydf.describe()


         GLOAD (MW)    CO2_MASS (lb)  AOER (lb/MW)  MOER (lb/MW)  \
count  26304.000000     26304.000000  26304.000000   4059.000000   
mean    7255.415564   8787696.127661   1214.871906   1089.140311   
std     2849.524593   3484051.977668    129.209919    215.857797   
min     2062.000000   2396000.000000    851.948151   -359.165303   
25%     5365.000000   6410948.500000   1119.888174    956.839181   
50%     6690.000000   8101168.000000   1201.013432   1083.968317   
75%     8254.000000  10171746.500000   1295.730724   1218.730556   
max    21200.000000  26574330.000000   2490.227602   2181.937107   

       Lambda ($/MW)  Flat ($)  Day/night ($)       TOU ($)  
count   26304.000000     26304   26304.000000  26304.000000  
mean       37.670601       127     107.708333    113.907847  
std        26.869958         0      10.454381     20.202940  
min       -97.340000       127     101.000000    103.000000  
25%        27.900000       127     101.000000    103.000000  
50%        34.400000       127     101.000000    103.000000  
75%        41.630000       127     124.000000    123.000000  
max      1148.250000       127     124.000000    236.000000  

In [115]:
print 'pearson'
print nydf.corr()
print 'spearman'
print nydf.corr(method='spearman')


pearson
               GLOAD (MW)  CO2_MASS (lb)  AOER (lb/MW)  MOER (lb/MW)  \
GLOAD (MW)       1.000000       0.969043     -0.072532      0.074268   
CO2_MASS (lb)    0.969043       1.000000      0.166691      0.152751   
AOER (lb/MW)    -0.072532       0.166691      1.000000      0.393952   
MOER (lb/MW)     0.074268       0.152751      0.393952      1.000000   
Lambda ($/MW)    0.396356       0.430813      0.136600      0.139563   
Flat ($)              NaN            NaN           NaN           NaN   
Day/night ($)    0.298334       0.266633     -0.107568      0.021963   
TOU ($)          0.187610       0.136870     -0.187119     -0.048302   

               Lambda ($/MW)  Flat ($)  Day/night ($)   TOU ($)  
GLOAD (MW)          0.396356       NaN       0.298334  0.187610  
CO2_MASS (lb)       0.430813       NaN       0.266633  0.136870  
AOER (lb/MW)        0.136600       NaN      -0.107568 -0.187119  
MOER (lb/MW)        0.139563       NaN       0.021963 -0.048302  
Lambda ($/MW)       1.000000       NaN       0.129416  0.084016  
Flat ($)                 NaN       NaN            NaN       NaN  
Day/night ($)       0.129416       NaN       1.000000  0.317338  
TOU ($)             0.084016       NaN       0.317338  1.000000  
spearman
               GLOAD (MW)  CO2_MASS (lb)  AOER (lb/MW)  MOER (lb/MW)  \
GLOAD (MW)       1.000000       0.938826     -0.051858      0.067467   
CO2_MASS (lb)    0.938826       1.000000      0.268795      0.176656   
AOER (lb/MW)    -0.051858       0.268795      1.000000      0.395988   
MOER (lb/MW)     0.067467       0.176656      0.395988      1.000000   
Lambda ($/MW)    0.539635       0.611317      0.297965      0.152912   
Flat ($)              NaN            NaN           NaN           NaN   
Day/night ($)    0.303886       0.258943     -0.096602      0.022648   
TOU ($)          0.398035       0.314127     -0.182178     -0.055342   

               Lambda ($/MW)  Flat ($)  Day/night ($)   TOU ($)  
GLOAD (MW)          0.539635       NaN       0.303886  0.398035  
CO2_MASS (lb)       0.611317       NaN       0.258943  0.314127  
AOER (lb/MW)        0.297965       NaN      -0.096602 -0.182178  
MOER (lb/MW)        0.152912       NaN       0.022648 -0.055342  
Lambda ($/MW)       1.000000       NaN       0.170130  0.182167  
Flat ($)                 NaN       NaN            NaN       NaN  
Day/night ($)       0.170130       NaN       1.000000  0.347738  
TOU ($)             0.182167       NaN       0.347738  1.000000  

In [72]:
# correlation between price and emissions
xname = 'Lambda ($/MW)'

for yname in ['AOER (lb/MW)', 'MOER (lb/MW)']:
    # basic plot
    plt.figure()
    plt.scatter(nydf[xname], nydf[yname], s=3, alpha=0.5, label='data')
    plt.xlabel(xname)
    plt.ylabel(yname)
    
    # fit to valid data
    valid_mask = nydf[yname].notnull() & (nydf[xname] < 200)
    pf = np.polyfit(nydf[valid_mask][xname], nydf[valid_mask][yname], 2)
    lr = stats.linregress(nydf[valid_mask][xname], nydf[valid_mask][yname])
    xs = np.linspace(np.percentile(nydf[xname], 0.5), np.percentile(nydf[xname], 99.5), 100)
    plt.plot(xs, pf[0]*xs**2 + pf[1]*xs + pf[2], c='r', label='quadratic')#, label='r=%.3f' % lr[2])
    plt.plot(xs, lr[0]*xs + lr[1], c='g', label='linear r=%.3f' % lr[2])
    plt.legend()