This notebook creates dataframes with monthly facility generation and fuel use data, merges them, and exports the results. The code assumes that you have already downloaded an ELEC.txt
file from EIA's bulk download website.
In [1]:
import json
import pandas as pd
import os
from os.path import join
import numpy as np
from joblib import Parallel, delayed
import sys
cwd = os.getcwd()
data_path = join(cwd, '..', 'Data storage')
In [ ]:
file_date = '2018-03-06'
In [2]:
%load_ext watermark
In [3]:
%watermark -iv -v
In [4]:
# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
In [5]:
# add the 'src' directory as one where we can import modules
src_dir = join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
In [6]:
%aimport Data.data_extraction
from Data.data_extraction import facility_line_to_df
%aimport Analysis.index
from Analysis.index import add_datetime, add_quarter
ELEC.txt
fileDownload the most current file from EIA's bulk download site. Save it to \Data storage\Raw data
. I've tried to do this via the requests library, but the data file often gets corrupted.
In [7]:
path = join(data_path, 'Raw EIA bulk', '{} ELEC.txt'.format(file_date))
with open(path, 'r') as f:
raw_txt = f.readlines()
In [8]:
gen_rows = [row for row in raw_txt if 'ELEC.PLANT.GEN' in row
and 'series_id' in row
and 'ALL.M' in row
and 'ALL-' not in row]
total_fuel_rows = [row for row in raw_txt if 'ELEC.PLANT.CONS_TOT_BTU' in row
and 'series_id' in row
and 'ALL.M' in row
and 'ALL-' not in row]
eg_fuel_rows = [row for row in raw_txt if 'ELEC.PLANT.CONS_EG_BTU' in row
and 'series_id' in row
and 'ALL.M' in row
and 'ALL-' not in row]
In [9]:
if __name__ == '__main__':
exception_list = []
facility_gen = pd.concat(Parallel(n_jobs=-1)(delayed(facility_line_to_df)(json.loads(row)) for row in gen_rows))
facility_gen.reset_index(drop=True, inplace=True)
facility_gen.rename({'value':'generation (MWh)'}, axis=1, inplace=True)
In [10]:
facility_gen.loc[:,'lat'] = facility_gen.loc[:,'lat'].astype(float)
facility_gen.loc[:,'lon'] = facility_gen.loc[:,'lon'].astype(float)
facility_gen.loc[:, 'plant id'] = facility_gen.loc[:, 'plant id'].astype(int)
In [11]:
#drop
facility_gen.tail()
Out[11]:
In [12]:
if __name__ == '__main__':
exception_list = []
facility_all_fuel = pd.concat(Parallel(n_jobs=-1)(delayed(facility_line_to_df)(json.loads(row)) for row in total_fuel_rows))
facility_all_fuel.reset_index(drop=True, inplace=True)
facility_all_fuel.rename({'value':'total fuel (mmbtu)'}, axis=1, inplace=True)
In [13]:
facility_all_fuel.loc[:,'lat'] = facility_all_fuel.loc[:,'lat'].astype(float)
facility_all_fuel.loc[:,'lon'] = facility_all_fuel.loc[:,'lon'].astype(float)
facility_all_fuel.loc[:,'plant id'] = facility_all_fuel.loc[:,'plant id'].astype(int)
In [14]:
if __name__ == '__main__':
exception_list = []
facility_eg_fuel = pd.concat(Parallel(n_jobs=-1)(delayed(facility_line_to_df)(json.loads(row)) for row in eg_fuel_rows))
facility_eg_fuel.reset_index(drop=True, inplace=True)
facility_eg_fuel.rename({'value':'elec fuel (mmbtu)'}, axis=1, inplace=True)
In [15]:
facility_eg_fuel.loc[:,'lat'] = facility_eg_fuel.loc[:,'lat'].astype(float)
facility_eg_fuel.loc[:,'lon'] = facility_eg_fuel.loc[:,'lon'].astype(float)
facility_eg_fuel.loc[:,'plant id'] = facility_eg_fuel.loc[:,'plant id'].astype(int)
In [16]:
keep_cols = ['fuel', 'generation (MWh)', 'month', 'plant id', 'prime mover', 'year',
'geography', 'lat', 'lon', 'last_updated']
merge_cols = ['fuel', 'month', 'plant id', 'year']
gen_total_fuel = facility_all_fuel.merge(facility_gen.loc[:,keep_cols],
how='outer', on=merge_cols)
In [17]:
def fill_missing(df):
cols = [col[:-2] for col in df.columns if '_x' in col]
# Create new column from the _x version, fill missing values from the _y version
for col in cols:
df[col] = df.loc[:, col + '_x']
df.loc[df[col].isnull(), col] = df.loc[df[col].isnull(), col + '_y']
df.drop([col+'_x', col+'_y'], axis=1, inplace=True)
In [18]:
fill_missing(gen_total_fuel)
In [19]:
keep_cols = ['fuel', 'elec fuel (mmbtu)', 'month', 'plant id', 'prime mover', 'year',
'geography', 'lat', 'lon', 'last_updated']
all_facility_data = gen_total_fuel.merge(facility_eg_fuel.loc[:,keep_cols],
how='outer', on=merge_cols)
In [20]:
fill_missing(all_facility_data)
In [21]:
all_facility_data.drop(['units', 'series_id'], axis=1, inplace=True)
all_facility_data.head()
Out[21]:
In [22]:
add_quarter(all_facility_data)
These are mostly EIA emission factors
In [23]:
path = join(data_path, 'Final emission factors.csv')
ef = pd.read_csv(path, index_col=0)
In [24]:
fossil_factors = dict(zip(ef.index, ef['Fossil Factor']))
total_factors = dict(zip(ef.index, ef['Total Factor']))
fossil_factors, total_factors
Out[24]:
In [25]:
# Start with 0 emissions in all rows
# For fuels where we have an emission factor, replace the 0 with the calculated value
all_facility_data['all fuel fossil CO2 (kg)'] = 0
all_facility_data['elec fuel fossil CO2 (kg)'] = 0
all_facility_data['all fuel total CO2 (kg)'] = 0
all_facility_data['elec fuel total CO2 (kg)'] = 0
for fuel in total_factors.keys():
# All fuel CO2 emissions
all_facility_data.loc[all_facility_data['fuel']==fuel,'all fuel fossil CO2 (kg)'] = \
all_facility_data.loc[all_facility_data['fuel']==fuel,'total fuel (mmbtu)'] * fossil_factors[fuel]
all_facility_data.loc[all_facility_data['fuel']==fuel,'all fuel total CO2 (kg)'] = \
all_facility_data.loc[all_facility_data['fuel']==fuel,'total fuel (mmbtu)'] * total_factors[fuel]
# Electric fuel CO2 emissions
all_facility_data.loc[all_facility_data['fuel']==fuel,'elec fuel fossil CO2 (kg)'] = \
all_facility_data.loc[all_facility_data['fuel']==fuel,'elec fuel (mmbtu)'] * fossil_factors[fuel]
all_facility_data.loc[all_facility_data['fuel']==fuel,'elec fuel total CO2 (kg)'] = \
all_facility_data.loc[all_facility_data['fuel']==fuel,'elec fuel (mmbtu)'] * total_factors[fuel]
In [26]:
# Fossil CO2
all_facility_data.loc[~(all_facility_data['all fuel fossil CO2 (kg)']>=0),
'all fuel fossil CO2 (kg)'] = 0
all_facility_data.loc[~(all_facility_data['elec fuel fossil CO2 (kg)']>=0),
'elec fuel fossil CO2 (kg)'] = 0
# Total CO2
all_facility_data.loc[~(all_facility_data['all fuel total CO2 (kg)']>=0),
'all fuel total CO2 (kg)'] = 0
all_facility_data.loc[~(all_facility_data['elec fuel total CO2 (kg)']>=0),
'elec fuel total CO2 (kg)'] = 0
In [27]:
path = join(data_path, 'Derived data',
'Facility gen fuels and CO2 {}.csv'.format(file_date))
all_facility_data.to_csv(path, index=False)