This notebook generates lists of new plants and utilities that need to be assigned PUDL IDs. It helps with the process of integrating new data each fall when the agencies make their new annual release for the previous year. In addition, it has some functions that assist in the manual categorization of new freeform strings within the FERC Form 1 database, describing fuel types, fuel units, plant types, etc.
unmapped_utilities_ferc1.csv
: Respondent IDs and respondent names of utilities which appear in the FERC Form 1 DB, but which do not appear in the PUDL ID mapping spreadsheet.unmapped_plants_ferc1.csv
: Plant names, respondent names, and respondent IDs associated with plants that appear in the FERC Form 1 DB, but which do not appear in the PUDL ID Mapping spreadsheet.unmapped_utilities_eia.csv
: EIA Utility IDs and names of utilities which appear in the PUDL DB, but which do not appear in the PUDL ID mapping spreadsheet.unmapped_plants_eia.csv
: EIA Plant IDs and Plant Names of plants which appear in the PUDL DB, but which do not appear in the PUDL ID mapping spreadsheet. The Utility ID and Name for the primary plant operator, as well as the aggregate plant capacity and the state the plant is located in are also proved to aid in PUDL ID mapping.lost_utilities_eia.csv
: The Utility IDs and Names of utilities which appear in the PUDL ID mapping spreadsheet but which do not appear in the PUDL DB. Likely because EIA revised previous years of data, and removed those utilities, after we had mapped them.lost_plants_eia.csv
: The Plant IDs and Names of plants which appear in the PUDL ID mapping spreadsheet but which do not appear in the PUDL DB. Likely because EIA revised previous years of data, and removed those plants, after we had mapped them.
In [ ]:
%load_ext autoreload
%autoreload 2
import sqlalchemy as sa
import pandas as pd
import pudl
import re
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_settings
In [ ]:
ferc1_engine = sa.create_engine(pudl_settings["ferc1_db"])
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])
ferc1_years = pudl.constants.data_years["ferc1"]
print("Searching for new FERC 1 plants, utilities and strings in the following years:")
print(ferc1_years)
In [ ]:
unmapped_plants_ferc1 = pudl.glue.ferc1_eia.get_unmapped_plants_ferc1(pudl_settings, years=ferc1_years)
n_ferc1_unmapped_plants = len(unmapped_plants_ferc1)
print(f"{n_ferc1_unmapped_plants} unmapped FERC 1 plants found in {min(ferc1_years)}-{max(ferc1_years)}.")
if n_ferc1_unmapped_plants > 0:
unmapped_plants_ferc1_outfile = f"unmapped_plants_ferc1.csv"
print(f"Writing {n_ferc1_unmapped_plants} out to {unmapped_plants_ferc1_outfile}")
unmapped_plants_ferc1.to_csv(unmapped_plants_ferc1_outfile, index=False)
unmapped_plants_ferc1
In [ ]:
unmapped_utils_ferc1 = pudl.glue.ferc1_eia.get_unmapped_utils_ferc1(pudl_settings, years=ferc1_years)
n_ferc1_unmapped_utils = len(unmapped_utils_ferc1)
print(f"{n_ferc1_unmapped_utils} unmapped FERC 1 utilities found in {min(ferc1_years)}-{max(ferc1_years)}.")
if n_ferc1_unmapped_utils > 0:
unmapped_utils_ferc1_outfile= f"unmapped_utilities_ferc1.csv"
print(f"Writing {n_ferc1_unmapped_utils} out to {unmapped_utils_ferc1_outfile}")
unmapped_utils_ferc1.to_csv(unmapped_utils_ferc1_outfile, index=False)
unmapped_utils_ferc1
In [ ]:
unmapped_plants_eia = pudl.glue.ferc1_eia.get_unmapped_plants_eia(pudl_engine)
print(f"Found {len(unmapped_plants_eia)} unmapped EIA plants.")
if len(unmapped_plants_eia) > 0:
unmapped_plants_eia.to_csv(f"unmapped_plants_eia.csv")
unmapped_plants_eia.sample(max([10, len(unmapped_plants_eia)]))
In [ ]:
lost_plants_eia = pudl.glue.ferc1_eia.get_lost_plants_eia(pudl_engine)
print(f"Found {len(lost_plants_eia)} lost EIA plants.")
lost_plants_eia.to_csv(f"lost_plants_eia.csv")
lost_plants_eia.sample(10)
In [ ]:
unmapped_utils_eia = pudl.glue.ferc1_eia.get_unmapped_utils_eia(pudl_engine)
print(f"Found {len(unmapped_utils_eia)} unmapped EIA utilities.")
if len(unmapped_utils_eia) > 0:
unmapped_utils_eia.to_csv("all_unmapped_utilities_eia.csv")
miss_utils = pudl.glue.ferc1_eia.get_unmapped_utils_with_plants_eia(pudl_engine)
print(f"Found {len(miss_utils)} unmapped utilities with plants/ownership.")
miss_utils.to_csv("planted_unmapped_utilities_eia.csv")
unmapped_utils_eia.sample(10)
In [ ]:
pudl_raw = pudl.output.pudltabl.PudlTabl(pudl_engine, freq=None)
frc_eia923 = pudl_raw.frc_eia923()
gf_eia923 = pudl_raw.gf_eia923()
gen_eia923 = pudl_raw.gen_eia923()
bf_eia923 = pudl_raw.bf_eia923()
missing_frc = frc_eia923[frc_eia923.utility_id_pudl.isna()][["utility_id_eia", "utility_name_eia"]]
missing_gf = gf_eia923[gf_eia923.utility_id_pudl.isna()][["utility_id_eia", "utility_name_eia"]]
missing_bf = bf_eia923[bf_eia923.utility_id_pudl.isna()][["utility_id_eia", "utility_name_eia"]]
missing_gens = gen_eia923[gen_eia923.utility_id_pudl.isna()][["utility_id_eia", "utility_name_eia"]]
missing_utils = (
pd.concat([missing_frc, missing_bf, missing_gf, missing_gens])
.drop_duplicates(subset="utility_id_eia")
.set_index("utility_id_eia")
)
print(f"Found {len(missing_utils)} utilities with EIA 923 data but no PUDL Utility ID.")
if len(missing_utils) > 0:
missing_utils.to_csv("dataful_unmaped_utilities_eia.csv")
missing_utils.sample(min(len(missing_utils), 10))
In [ ]:
missing_utils.iloc[50:100]
In [ ]:
lost_utils_eia = pudl.glue.ferc1_eia.get_lost_utils_eia(pudl_engine)
print(f"Found {len(lost_utils_eia)} lost EIA utilities.")
if len(lost_utils_eia) > 0:
lost_utils_eia.to_csv("lost_utilities_eia.csv")
pudl.constants
, then re-run the cell with new search terms, until everything left is impossible to confidently categorize.
In [ ]:
def new_ferc1_strings(table, field, start_year, end_year, ferc1_engine, strdict):
all_strings = pd.read_sql(f"SELECT * FROM {table} WHERE report_year>={start_year} AND report_year<={end_year};", ferc1_engine)
all_strings = (
all_strings.
pipe(pudl.helpers.strip_lower, columns=[field])[field].
unique()
)
old_strings = []
for x in strdict:
old_strings = old_strings + strdict[x]
new_strings = [s for s in all_strings if s not in old_strings]
return new_strings
clean_me = {
"fuel": {
"table": "f1_fuel",
"field": "fuel",
"strdict": pudl.constants.ferc1_fuel_strings,
},
"fuel_unit": {
"table": "f1_fuel",
"field": "fuel_unit",
"strdict": pudl.constants.ferc1_fuel_unit_strings,
},
"plant_kind": {
"table": "f1_steam",
"field": "plant_kind",
"strdict": pudl.constants.ferc1_plant_kind_strings,
},
"type_const": {
"table": "f1_steam",
"field": "type_const",
"strdict": pudl.constants.ferc1_const_type_strings,
},
}
# This must be one of the keys in the above dictionary....
type_of_string = "fuel"
unmapped_strings = new_ferc1_strings(
start_year=min(ferc1_years),
end_year=max(ferc1_years),
ferc1_engine=ferc1_engine,
**clean_me[type_of_string])
n_unmapped_strings = len(unmapped_strings)
print(f"{n_unmapped_strings} unmapped {type_of_string} strings found.")
# These numbers represent how many unmapped strings remained as of 2019-10-03
if type_of_string == "fuel":
assert n_unmapped_strings == 0
elif type_of_string == "fuel_unit":
assert n_unmapped_strings <= 80
elif type_of_string == "plant_kind":
assert n_unmapped_strings <= 64
elif type_of_string == "type_const":
assert n_unmapped_strings <= 94
else:
assert False
# Choose your own colleciton of search strings here to match the
# category you're attempting to identify. For all except the "fuel"
# string type, there will be leftovers that can't be categorized
# because they're a mess. In the "fuel" type we use "other" so that
# we don't lose the fuel records altogether.
[
s for s in unmapped_strings
if re.match('.*', s)
#if re.match('.*semi.*', s)
#and not re.match('.*out.*', s)
#and not re.match('.*in.*', s)
#and not re.match('.*less.*', s)
#and not re.match('.*under.*', s)
#and not re.match('.*o.*[db].*', s)
]
f1_gnrt_plant
) table with some fields that would be useful for categorization.
In [ ]:
small_plants_ferc1 = (
pd.read_sql(
f"""SELECT f1_gnrt_plant.report_year,\
f1_gnrt_plant.respondent_id,\
f1_gnrt_plant.row_number,\
f1_gnrt_plant.spplmnt_num,\
f1_gnrt_plant.plant_name,\
f1_gnrt_plant.capacity_rating,\
f1_gnrt_plant.kind_of_fuel, \
f1_respondent_id.respondent_name\
FROM f1_gnrt_plant, f1_respondent_id \
WHERE report_year>={min(ferc1_years)}
AND report_year<={max(ferc1_years)}
AND f1_respondent_id.respondent_id=f1_gnrt_plant.respondent_id;""", ferc1_engine).
assign(record_number=lambda x: x["row_number"] + 46*x["spplmnt_num"]).
drop(["row_number", "spplmnt_num"], axis="columns").
pipe(pudl.helpers.strip_lower, columns=["plant_name", "kind_of_fuel", "respondent_name"]).
rename(columns={"capacity_rating": "capacity_mw"}).
loc[:,["report_year", "respondent_id", "respondent_name", "record_number", "plant_name", "capacity_mw", "kind_of_fuel"]].
sort_values(["report_year", "respondent_id", "record_number"])
)
n_small_plants_ferc1 = len(small_plants_ferc1)
small_plants_ferc1_outfile = "f1_gnrt_plant.csv"
print(f"Writing {n_small_plants_ferc1} small plant records out to {small_plants_ferc1_outfile}")
small_plants_ferc1.to_csv(small_plants_ferc1_outfile, index=False)
small_plants_ferc1