In [ ]:
%load_ext autoreload
%autoreload 2
In [ ]:
import sys
import pandas as pd
import sqlalchemy as sa
import pudl
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()
%matplotlib inline
In [ ]:
mpl.rcParams['figure.figsize'] = (10,4)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200
In [ ]:
pudl_settings = pudl.workspace.setup.get_defaults()
ferc1_engine = sa.create_engine(pudl_settings['ferc1_db'])
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_settings
In [ ]:
def get_row_literals(table_name, report_year, ferc1_engine):
row_literals = (
pd.read_sql("f1_row_lit_tbl", ferc1_engine)
.query(f"sched_table_name=='{table_name}'")
.query(f"report_year=={report_year}")
.sort_values("row_number")
)
return row_literals
def compare_row_literals(table_name, old_year, new_year, ferc1_engine):
idx_cols = ["row_number", "row_seq"]
old_df = get_row_literals(table_name, old_year, ferc1_engine).drop(columns=["row_status", "sched_table_name"])
new_df = get_row_literals(table_name, new_year, ferc1_engine).drop(columns=["row_status", "sched_table_name"])
merged_df = (
pd.merge(old_df, new_df, on=idx_cols, suffixes=["_old", "_new"], how="outer")
.set_index(idx_cols)
)
merged_df = (
merged_df.loc[:, merged_df.columns.sort_values()]
.assign(match=lambda x: x.row_literal_new == x.row_literal_old)
)
return merged_df
def check_all_row_years(table_name, ferc1_engine):
years = list(range(1994, 2018))
years.reverse()
for old_year in years:
dude = compare_row_literals(table_name, old_year, old_year+1, ferc1_engine)
if len(dude[~dude.match]) > 0:
logger.error(f"CHECK: {old_year+1} vs. {old_year}")
In [ ]:
target_tables = [
"f1_income_stmnt", # Mapped
"f1_plant_in_srvce", # Mapped
"f1_dacs_epda", # Mapped
"f1_elctrc_oper_rev", # Binz 2
"f1_elc_op_mnt_expn", # Binz 2
"f1_sales_by_sched", # Binz 2 (Messy except for Total, use eia861)
"f1_incm_stmnt_2", # Do this later
"f1_edcfu_epda", # Do this later
"f1_acb_epda", # Do this later
"f1_elc_oper_rev_nb", # Trivial one-line table. Why?
"f1_nbr_elc_deptemp", # Old table that no longer gets updated
]
target_dfs = {t: pd.read_sql(t, ferc1_engine) for t in target_tables}
In [ ]:
check_all_row_years("f1_elc_op_mnt_expn", ferc1_engine)
In [ ]:
compare_row_literals("f1_elc_op_mnt_expn", 2005, 2006, ferc1_engine)