This notebook makes use of data created in the notebooks (nested levels indicate a chain of calculations):
In [1]:
%matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
import seaborn as sns
# import plotly.plotly as py
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import pandas as pd
import os
import numpy as np
# init_notebook_mode(connected=True)
import datetime as dt
Create some helper functions to add datetime and quarter columns
In [2]:
def add_datetime(df, year='year', month='month'):
df['datetime'] = pd.to_datetime(df[year].astype(str) + '-' + df[month].astype(str),
format='%Y-%m')
def add_quarter(df, year='year', month='month'):
add_datetime(df, year, month)
df['quarter'] = df['datetime'].dt.quarter
In [3]:
path = os.path.join('Facility gen fuels and CO2.csv')
eia_facility = pd.read_csv(path, parse_dates=['datetime'], low_memory=False)
Because EIA tracks all fuel consumption at facilities that might produce both electricity and useful thermal output (CHP), CO2 emissions can be from one 4 categories:
We are interested in Category 4. EPA reports total emissions (Category 1), which need to be adjusted. To do this, we calculate a ratio
$$CO_2 \ Ratio = \frac{Category \ 4}{Category \ 1}$$Will will apply the CO2 ratio factors to EPA data later in this notebook.
In [4]:
cols = ['all fuel fossil CO2 (kg)','elec fuel fossil CO2 (kg)',
'all fuel total CO2 (kg)','elec fuel total CO2 (kg)', 'generation (MWh)']
eia_facility_grouped = eia_facility.groupby(['year', 'month', 'plant id'])[cols].sum()
eia_facility_grouped.reset_index(inplace=True)
eia_facility_grouped['CO2 ratio'] = eia_facility_grouped['elec fuel fossil CO2 (kg)'] / eia_facility_grouped['all fuel total CO2 (kg)']
eia_facility_grouped['CO2 ratio'].fillna(0, inplace=True)
eia_facility_grouped.head()
Out[4]:
In [5]:
path = os.path.join('EIA country-wide gen fuel CO2.csv')
eia_total = pd.read_csv(path, parse_dates=['datetime'], low_memory=False)
In [6]:
eia_total['type'].unique()
Out[6]:
In [7]:
keep_types = [u'WWW', u'WND', u'WAS', u'SUN', 'DPV', u'NUC', u'NG',
u'PEL', u'PC', u'OTH', u'COW', u'OOG', u'HPS', u'HYC', u'GEO']
keep_cols = ['generation (MWh)', 'total fuel (mmbtu)', 'elec fuel (mmbtu)',
'all fuel CO2 (kg)', 'elec fuel CO2 (kg)']
eia_total_monthly = eia_total.loc[(eia_total['type'].isin(keep_types))].groupby(['type', 'year', 'month'])[keep_cols].sum()
In [8]:
eia_total_monthly.head()
Out[8]:
Pretty sure that I don't need to keep eia_total_annual
In [9]:
keep_types = [u'WWW', u'WND', u'WAS', u'TSN', u'NUC', u'NG',
u'PEL', u'PC', u'OTH', u'COW', u'OOG', u'HPS', u'HYC', u'GEO']
eia_total_annual = eia_total_monthly.reset_index().groupby('year').sum()
eia_total_annual['index (g/kWh)'] = eia_total_annual['elec fuel CO2 (kg)'] / eia_total_annual['generation (MWh)']
In [10]:
path = os.path.join('Monthly EPA emissions.csv')
epa = pd.read_csv(path)
In [11]:
add_quarter(epa, year='YEAR', month='MONTH')
epa.head()
Out[11]:
Fill nan's with 0
In [12]:
epa.loc[:,'CO2_MASS (kg)'].fillna(0, inplace=True)
Use an inner merge rather than left
Justification: a left merge will retain CO2 emissions from facilities that aren't included in 923. But the generation and emissions for those facilities are included in the state-level estimates.
In [13]:
eia_keep = ['month', 'year', 'all fuel total CO2 (kg)', 'CO2 ratio', 'plant id']
epa_adj = epa.merge(eia_facility_grouped[eia_keep], left_on=['ORISPL_CODE', 'YEAR', 'MONTH'],
right_on=['plant id', 'year', 'month'], how='inner') # how='left
epa_adj.drop(['month', 'year', 'plant id'], axis=1, inplace=True)
epa_adj['epa index'] = epa_adj.loc[:,'CO2_MASS (kg)'] / epa_adj.loc[:,'GLOAD (MW)']
epa_adj.head()
Out[13]:
In [14]:
sns.jointplot('CO2_MASS (kg)', 'all fuel total CO2 (kg)', epa_adj, marker='.')
Out[14]:
In [28]:
# Calaculated with an "inner" merge of the dataframes
for year in range(2001, 2017):
total_co2 = epa_adj.loc[epa_adj['YEAR']==year, 'CO2_MASS (kg)'].sum()
union_co2 = epa_adj.loc[(epa_adj['YEAR']==year) &
~(epa_adj['CO2 ratio'].isnull()), 'CO2_MASS (kg)'].sum()
missing = total_co2 - union_co2
print year, '{:.3%}'.format(union_co2/total_co2), 'accounted for', \
missing/1000, 'metric tons missing'
Look back at this to ensure that I'm correctly accounting for edge cases
Start by setting all adjusted CO2 (adj CO2 (kg)
) values to the reported CO2 value
In [15]:
epa_adj['adj CO2 (kg)'] = epa_adj.loc[:,'CO2_MASS (kg)']
If CEMS reported CO2 emissions are 0 but heat inputs are >0 and calculated CO2 emissions are >0, change the adjusted CO2 to NaN. These NaN values will be replaced by the calculated value later. Do the same for low index records (<300 g/kWh). If there is a valid CO2 ratio, multiply the adjusted CO2 column by the CO2 ratio.
In [16]:
epa_adj.loc[~(epa_adj['CO2_MASS (kg)']>0) &
(epa_adj['HEAT_INPUT (mmBtu)']>0) &
(epa_adj['all fuel total CO2 (kg)']>0), 'adj CO2 (kg)'] = np.nan
epa_adj.loc[(epa_adj['epa index']<300) &
(epa_adj['HEAT_INPUT (mmBtu)']>0) &
(epa_adj['all fuel total CO2 (kg)']>0), 'adj CO2 (kg)'] = np.nan
epa_adj.loc[epa_adj['CO2 ratio'].notnull(), 'adj CO2 (kg)'] *= epa_adj.loc[epa_adj['CO2 ratio'].notnull(), 'CO2 ratio']
In [17]:
for year in range(2001,2017):
num_missing = len(epa_adj.loc[(epa_adj['adj CO2 (kg)'].isnull()) &
(epa_adj['YEAR']==year), 'ORISPL_CODE'].unique())
total = len(epa_adj.loc[epa_adj['YEAR']==year, 'ORISPL_CODE'].unique())
print 'In', str(year) + ',', num_missing, 'plants missing some data out of', total
In [18]:
eia_facility['fuel'].unique()
Out[18]:
In [19]:
# OG and BFG are included in Other because I've included OOG in Other below
# Pet liquids and pet coke are included here because they line up with how the state-level
# EIA data are reported
facility_fuel_cats = {'COW' : ['SUB','BIT','LIG', 'WC','SC','RC','SGC'],
'NG' : ['NG'],
'PEL' : ['DFO', 'RFO', 'KER', 'JF', 'PG', 'WO', 'SGP'],
'PC' : ['PC'],
'HYC' : ['WAT'],
'HPS' : [],
'GEO' : ['GEO'],
'NUC' : ['NUC'],
'OOG' : ['BFG', 'OG', 'LFG'],
'OTH' : ['OTH', 'MSN', 'MSW', 'PUR', 'TDF', 'WH'],
'SUN' : ['SUN'],
'DPV' : [],
'WAS' : ['OBL', 'OBS', 'OBG', 'MSB', 'SLW'],
'WND' : ['WND'],
'WWW' : ['WDL', 'WDS', 'AB', 'BLQ']
}
Create a new df that groups the facility data into more general fuel types that match up with the EIA generation and fuel use totals.
In [20]:
eia_facility_fuel = eia_facility.copy()
for key in facility_fuel_cats.keys():
eia_facility_fuel.loc[eia_facility_fuel['fuel'].isin(facility_fuel_cats[key]),'type'] = key
eia_facility_fuel = eia_facility_fuel.groupby(['type', 'year', 'month']).sum()
# eia_facility_fuel.reset_index(inplace=True)
eia_facility_fuel.head()
Out[20]:
In [21]:
eia_total_monthly.head()
Out[21]:
In [22]:
iterables = [eia_total_monthly.index.levels[0], range(2001, 2017), range(1, 13)]
index = pd.MultiIndex.from_product(iterables=iterables, names=['type', 'year', 'month'])
eia_extra = pd.DataFrame(index=index, columns=['total fuel (mmbtu)', 'generation (MWh)',
'elec fuel (mmbtu)'])
In [23]:
idx = pd.IndexSlice
In [24]:
use_columns=['total fuel (mmbtu)', 'generation (MWh)',
'elec fuel (mmbtu)']
eia_extra = (eia_total_monthly.loc[idx[:,:,:], use_columns] -
eia_facility_fuel.loc[idx[:,:,:], use_columns])
# I have lumped hydro pumped storage in with conventional hydro in the facility data.
# Because of this, I need to add HPS rows so that the totals will add up correctly.
# Also need to add DPV because it won't show up otherwise
eia_extra.loc[idx[['HPS', 'DPV'],:,:], use_columns] = eia_total_monthly.loc[idx[['HPS', 'DPV'],:,:], use_columns]
# eia_extra = eia_extra.loc[idx[:, 2003:, :],:]
In [25]:
eia_extra.head()
Out[25]:
In [26]:
eia_extra.loc[idx['DPV',:,:]]
Out[26]:
In [27]:
path = os.path.join('Final emission factors.csv')
ef = pd.read_csv(path, index_col=0)
We need to approximate some of the emission factors because the state-level EIA data is only available in the bulk download at an aggregated level. Natural gas usually makes up the bulk of this extra electric generation/fuel use (consumption not reported by facilities, estimated by EIA), and it is still a single fuel here.
In [28]:
fuel_factors = {'NG' : ef.loc['NG', 'Fossil Factor'],
'PEL': ef.loc[['DFO', 'RFO'], 'Fossil Factor'].mean(),
'PC' : ef.loc['PC', 'Fossil Factor'],
'COW' : ef.loc[['BIT', 'SUB'], 'Fossil Factor'].mean(),
'OOG' : ef.loc['OG', 'Fossil Factor']}
In [29]:
# Start with 0 emissions in all rows
# For fuels where we have an emission factor, replace the 0 with the calculated value
eia_extra['all fuel CO2 (kg)'] = 0
eia_extra['elec fuel CO2 (kg)'] = 0
for fuel in fuel_factors.keys():
eia_extra.loc[idx[fuel,:,:],'all fuel CO2 (kg)'] = \
eia_extra.loc[idx[fuel,:,:],'total fuel (mmbtu)'] * fuel_factors[fuel]
eia_extra.loc[idx[fuel,:,:],'elec fuel CO2 (kg)'] = \
eia_extra.loc[idx[fuel,:,:],'elec fuel (mmbtu)'] * fuel_factors[fuel]
# eia_extra.reset_index(inplace=True)
# add_quarter(eia_extra)
In [30]:
eia_extra.loc[idx['NG',:,:],].tail()
Out[30]:
The dataframes start at a facility level. Extra EIA emissions for estimated state-level data are added after they are aggregated by year/month in the "Monthly Index" section below.
In [31]:
epa_cols = ['ORISPL_CODE', 'YEAR', 'MONTH', 'adj CO2 (kg)']
final_co2_gen = eia_facility_grouped.merge(epa_adj.loc[:,epa_cols], left_on=['plant id', 'year', 'month'],
right_on=['ORISPL_CODE', 'YEAR', 'MONTH'], how='left')
final_co2_gen.drop(['ORISPL_CODE', 'YEAR', 'MONTH'], axis=1, inplace=True)
final_co2_gen['final CO2 (kg)'] = final_co2_gen['adj CO2 (kg)']
final_co2_gen.loc[final_co2_gen['final CO2 (kg)'].isnull(), 'final CO2 (kg)'] = final_co2_gen.loc[final_co2_gen['final CO2 (kg)'].isnull(), 'elec fuel fossil CO2 (kg)']
add_quarter(final_co2_gen)
In [32]:
final_co2_gen.head()
Out[32]:
Start with some helper functions to convert units and calculate % change from 2005 annual value
In [33]:
def g2lb(df):
"""
Convert g/kWh to lb/MWh and add a column to the df
"""
kg2lb = 2.2046
df['index (lb/MWh)'] = df['index (g/kWh)'] * kg2lb
def change_since_2005(df):
"""
Calculate the % difference from 2005 and add as a column in the df
"""
# first calculate the index in 2005
index_2005 = ((df.loc[df['year']==2005,'index (g/kWh)'] *
df.loc[df['year']==2005,'generation (MWh)']) /
df.loc[df['year']==2005,'generation (MWh)'].sum()).sum()
# Calculated index value in 2005 is 599.8484560355034
# If the value above is different throw an error
if (index_2005 > 601) or (index_2005 < 599.5):
raise ValueError('Calculated 2005 index value', index_2005,
'is outside expected range. Expected value is 599.848')
if type(index_2005) != float:
raise TypeError('index_2005 is', type(index_2005), 'rather than a float.')
df['change since 2005'] = (df['index (g/kWh)'] - index_2005) / index_2005
In [34]:
monthly_index = final_co2_gen.groupby(['year', 'month'])['generation (MWh)', 'final CO2 (kg)'].sum()
monthly_index.reset_index(inplace=True)
# Add extra generation and emissions not captured by facility-level data
monthly_index.loc[:,'final CO2 (kg)'] += eia_extra.reset_index().groupby(['year', 'month'])['elec fuel CO2 (kg)'].sum().values
monthly_index.loc[:,'generation (MWh)'] += eia_extra.reset_index().groupby(['year', 'month'])['generation (MWh)'].sum().values
add_quarter(monthly_index)
monthly_index['index (g/kWh)'] = monthly_index.loc[:, 'final CO2 (kg)'] / monthly_index.loc[:, 'generation (MWh)']
change_since_2005(monthly_index)
g2lb(monthly_index)
monthly_index.dropna(inplace=True)
In [36]:
monthly_index.tail()
Out[36]:
In [205]:
path = os.path.join('Data for plots', 'Monthly index.csv')
monthly_index.to_csv(path, index=False)
In [35]:
quarterly_index = monthly_index.groupby(['year', 'quarter'])['generation (MWh)', 'final CO2 (kg)'].sum()
quarterly_index.reset_index(inplace=True)
quarterly_index['index (g/kWh)'] = quarterly_index.loc[:, 'final CO2 (kg)'] / quarterly_index.loc[:, 'generation (MWh)']
quarterly_index['year_quarter'] = quarterly_index['year'].astype(str) + ' Q' + quarterly_index['quarter'].astype(str)
change_since_2005(quarterly_index)
g2lb(quarterly_index)
In [37]:
quarterly_index.tail()
Out[37]:
In [209]:
path = os.path.join('Data for plots', 'Quarterly index.csv')
quarterly_index.to_csv(path, index=False)
In [38]:
annual_index = quarterly_index.groupby('year')['generation (MWh)', 'final CO2 (kg)'].sum()
annual_index.reset_index(inplace=True)
annual_index['index (g/kWh)'] = annual_index.loc[:, 'final CO2 (kg)'] / annual_index.loc[:, 'generation (MWh)']
change_since_2005(annual_index)
g2lb(annual_index)
annual_index.tail()
Out[38]:
In [211]:
path = os.path.join('Data for plots', 'Annual index.csv')
annual_index.to_csv(path, index=False)
In [41]:
'US POWER SECTOR CO2 EMISSIONS INTENSITY'.title()
Out[41]:
In [44]:
path = os.path.join('..', 'Calculated values', 'US Power Sector CO2 Emissions Intensity.xlsx')
writer = pd.ExcelWriter(path)
monthly_index.to_excel(writer, sheet_name='Monthly', index=False)
quarterly_index.to_excel(writer, sheet_name='Quarterly', index=False)
annual_index.to_excel(writer, sheet_name='Annual', index=False)
writer.save()
In [46]:
fuel_cats = {'Coal' : [u'COW'],
'Natural Gas' : [u'NG'],
'Nuclear' : ['NUC'],
'Renewables' : [u'GEO', u'HYC', u'SUN', 'DPV',
u'WAS', u'WND', u'WWW'],
'Other' : [u'OOG', u'PC', u'PEL', u'OTH', u'HPS']
}
keep_types = [u'WWW', u'WND', u'WAS', u'SUN', 'DPV', u'NUC', u'NG',
u'PEL', u'PC', u'OTH', u'COW', u'OOG', u'HPS', u'HYC', u'GEO']
In [94]:
eia_gen_monthly = eia_total.loc[eia_total['type'].isin(keep_types)].groupby(['type', 'year', 'month']).sum()
eia_gen_monthly.reset_index(inplace=True)
eia_gen_monthly.drop(['end', 'sector', 'start'], inplace=True, axis=1)
for key, values in fuel_cats.iteritems():
eia_gen_monthly.loc[eia_gen_monthly['type'].isin(values),'fuel category'] = key
eia_gen_monthly = eia_gen_monthly.groupby(['fuel category', 'year', 'month']).sum()
eia_gen_monthly.reset_index(inplace=True)
add_quarter(eia_gen_monthly)
eia_gen_quarterly = eia_gen_monthly.groupby(['fuel category', 'year', 'quarter']).sum()
eia_gen_quarterly.reset_index(inplace=True)
eia_gen_quarterly['year_quarter'] = (eia_gen_quarterly['year'].astype(str) +
' Q' + eia_gen_quarterly['quarter'].astype(str))
eia_gen_quarterly.drop('month', axis=1, inplace=True)
eia_gen_annual = eia_gen_monthly.groupby(['fuel category', 'year']).sum()
eia_gen_annual.reset_index(inplace=True)
eia_gen_annual.drop(['month', 'quarter'], axis=1, inplace=True)
In [95]:
def generation_index(gen_df, index_df, group_by='year'):
"""
Calculate the emissions intensity of each fuel in each time period. Use the
adjusted total emissions from the index dataframe to ensure that the weighted
sum of fuel emission intensities will equal the total index value.
"""
final_adj_co2 = index_df.loc[:,'final CO2 (kg)'].copy()
calc_total_co2 = gen_df.groupby(group_by)['elec fuel CO2 (kg)'].sum().values
# calc_total_co2 = calc_total_co2.reset_index()
for fuel in gen_df['fuel category'].unique():
gen_df.loc[gen_df['fuel category']==fuel, 'adjusted CO2 (kg)'] = (gen_df.loc[gen_df['fuel category']==fuel,
'elec fuel CO2 (kg)'] /
calc_total_co2 *
final_adj_co2.values)
gen_df['adjusted index (g/kWh)'] = gen_df['adjusted CO2 (kg)'] / gen_df['generation (MWh)']
gen_df['adjusted index (lb/MWh)'] = gen_df['adjusted index (g/kWh)'] * 2.2046
Apply the function above to each generation dataframe
In [96]:
generation_index(eia_gen_annual, annual_index, 'year')
In [97]:
generation_index(eia_gen_monthly, monthly_index, ['year', 'month'])
In [98]:
generation_index(eia_gen_quarterly, quarterly_index, 'year_quarter')
In [100]:
eia_gen_annual.head()
Out[100]:
In [ ]:
path = os.path.join('Data for plots', 'Monthly generation.csv')
eia_gen_monthly.to_csv(path, index=False)
In [219]:
path = os.path.join('Data for plots', 'Quarterly generation.csv')
eia_gen_quarterly.to_csv(path, index=False)
In [220]:
path = os.path.join('Data for plots', 'Annual generation.csv')
eia_gen_annual.to_csv(path, index=False)
In [75]:
path = os.path.join('..', 'Calculated values', 'US Generation By Fuel Type.xlsx')
writer = pd.ExcelWriter(path, engine='xlsxwriter')
eia_gen_monthly.to_excel(writer, sheet_name='Monthly', index=False)
eia_gen_quarterly.to_excel(writer, sheet_name='Quarterly', index=False)
eia_gen_annual.to_excel(writer, sheet_name='Annual', index=False)
writer.save()