In [ ]:
%load_ext autoreload
%autoreload 2

In [ ]:
import sys
import pandas as pd
import sqlalchemy as sa
import pudl

import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()
%matplotlib inline

In [ ]:
mpl.rcParams['figure.figsize'] = (10,4)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200

In [ ]:
pudl_settings = pudl.workspace.setup.get_defaults()
ferc1_engine = sa.create_engine(pudl_settings['ferc1_db'])
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_settings

Generating new Row Maps


In [ ]:
def get_row_literals(table_name, report_year, ferc1_engine):
    row_literals = (
        pd.read_sql("f1_row_lit_tbl", ferc1_engine)
        .query(f"sched_table_name=='{table_name}'")
        .query(f"report_year=={report_year}")
        .sort_values("row_number")
    )
    return row_literals

def compare_row_literals(table_name, old_year, new_year, ferc1_engine):
    idx_cols = ["row_number", "row_seq"]
    old_df = get_row_literals(table_name, old_year, ferc1_engine).drop(columns=["row_status", "sched_table_name"])
    new_df = get_row_literals(table_name, new_year, ferc1_engine).drop(columns=["row_status", "sched_table_name"])
    merged_df = (
        pd.merge(old_df, new_df, on=idx_cols, suffixes=["_old", "_new"], how="outer")
        .set_index(idx_cols)
    )
    merged_df = (
        merged_df.loc[:, merged_df.columns.sort_values()]
        .assign(match=lambda x: x.row_literal_new == x.row_literal_old)
    )
    return merged_df 

def check_all_row_years(table_name, ferc1_engine):
    years = list(range(1994, 2018))
    years.reverse()
    for old_year in years:
        dude = compare_row_literals(table_name, old_year, old_year+1, ferc1_engine)
        if len(dude[~dude.match]) > 0:
            logger.error(f"CHECK: {old_year+1} vs. {old_year}")

In [ ]:
target_tables = [
    "f1_income_stmnt",    # Mapped
    "f1_plant_in_srvce",  # Mapped
    "f1_dacs_epda",       # Mapped
    
    "f1_elctrc_oper_rev", # Binz 2
    "f1_elc_op_mnt_expn", # Binz 2
    "f1_sales_by_sched",  # Binz 2 (Messy except for Total, use eia861)
    
    "f1_incm_stmnt_2",    # Do this later
    "f1_edcfu_epda",      # Do this later
    "f1_acb_epda",        # Do this later
    "f1_elc_oper_rev_nb", # Trivial one-line table. Why?
    "f1_nbr_elc_deptemp", # Old table that no longer gets updated
]
target_dfs = {t: pd.read_sql(t, ferc1_engine) for t in target_tables}

In [ ]:
check_all_row_years("f1_elc_op_mnt_expn", ferc1_engine)

In [ ]:
compare_row_literals("f1_elc_op_mnt_expn", 2005, 2006, ferc1_engine)