In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..','..','..')))
from pudl import pudl, ferc1, eia923, settings, constants
from pudl import models, models_ferc1, models_eia923
from pudl import clean_eia923, clean_ferc1, clean_pudl
from pudl import analysis
import numpy as np
import pandas as pd
import sqlalchemy as sa
import matplotlib.pyplot as plt
import matplotlib as mpl
import itertools
import re
from IPython.display import HTML
pudl_engine = pudl.connect_db()
plt.style.use('ggplot')
mpl.rc('figure', figsize=(6,6))
mpl.rc('figure', dpi=150)
mpl.rc('animation', writer='ffmpeg')

Create a pair of test datasets


In [2]:
test_noise=[0.5, 0.25, 0.5, 1.0, 0.5, 0.1, 0.3]
eia_df, ferc_df = analysis.zippertestdata(gens=100, max_group_size=5, noise=test_noise, samples=13)

Aggregate test datasets as if they were FERC/EIA data for plotting


In [3]:
eia_pudl_df = eia_df.groupby(['pudl_plant_id','year']).agg(sum)
eia_pudl_df.columns = eia_pudl_df.columns.str.replace('series','eia')
ferc_pudl_df = ferc_df.groupby(['pudl_plant_id','year']).agg(sum)
ferc_pudl_df.columns = ferc_pudl_df.columns.str.replace('series','ferc')
both_pudl_df = ferc_pudl_df.merge(eia_pudl_df, left_index=True, right_index=True)

Plot the aggregated test data so we can compare it with FERC/EIA

See the ferc_to_eia_plant_correlations notebook for what the real data looks like.


In [4]:
for N in np.arange(0,len(test_noise)):
    ferc_series = 'ferc{}'.format(N)
    eia_series = 'eia{}'.format(N)
    noise_val = test_noise[N]
    
    plt.scatter(both_pudl_df[ferc_series], both_pudl_df[eia_series], s=5, alpha=0.2)
    R_2 = np.corrcoef(both_pudl_df[ferc_series],both_pudl_df[eia_series])[0,1]
    plt.title("Test Data: $\sigma={}, R^2={:0.3f}$".format(noise_val, R_2))
    plt.xlim((1e2,1e10))
    plt.ylim((1e2,1e10))
    plt.loglog()
    plt.show()



In [5]:
def plant_corr(pudl_df, x1, x2):
    gb = pudl_df.groupby('pudl_plant_id')
    corr = gb[[x1,x2]].corr().reset_index()
    corr = corr.drop(x2, axis=1)
    corr = corr[corr['level_1']==x2]
    corr = corr[['pudl_plant_id',x1]]
    corr = corr.rename(columns={x1:'corr'})
    corr = corr.dropna()
    return(corr)

def plot_plant_corr(corrs=[], titles=[]):
    """Given a list of DataFrames containing per-plant correlations of some variable,
    and a set of associated figure titles, create a figure displaying the distributions
    of those correlations."""
    
    nrows = len(corrs)
    assert nrows == len(titles)
    fig, axes = plt.subplots(ncols=1, nrows=nrows)
    fig.set_figwidth(10)
    fig.set_figheight(4*nrows)
    fig.set_dpi(150)
    for (ax, title, corr) in zip(axes, titles, corrs):
        ax.set_xlim(0.0,1.1)
        ax.set_title(title)
        ax.set_ylabel("# of Plants")
        ax.set_xlabel("$R^2$")
        ax.grid(b=True)
        ax.hist(corr['corr']**2, bins=200, range=(0,1));
    fig.tight_layout()
    plt.show()

Look at the distribtuion of per-plant variable correlations


In [6]:
corrs = []
titles = []
for N in range(len(test_noise)):
    eiaN = 'eia{}'.format(N)
    fercN = 'ferc{}'.format(N)
    titleN = 'Series {}'.format(N)
    corrs = corrs + [plant_corr(both_pudl_df, eiaN, fercN),]
    titles = titles + [titleN,]
    
plot_plant_corr(corrs=corrs, titles=titles)


Aggregate and correlate test datasets


In [7]:
agg_df = analysis.aggregate_by_pudl_plant(eia_df, ferc_df)

N_series = len(test_noise)
eia_cols = [ 'series{}_eia'.format(N) for N in range(N_series) ]
ferc_cols = [ 'series{}_ferc'.format(N) for N in range(N_series) ]
corr_cols = [ 'series{}_corr'.format(N) for N in range(N_series) ]

corr_df = analysis.correlate_by_generators(agg_df, eia_cols, ferc_cols, corr_cols)

Score candidate generator groupings


In [8]:
winners = analysis.score_all(corr_df, corr_cols)

In [9]:
winners


Out[9]:
pudl_plant_id candidate_id mean_corr eia_gen_subgroup ferc_plant_id success
0 0 0 0.873553 aa AA True
1 1 0 0.984465 ab AB True
2 2 0 0.951502 ac AC True
3 3 0 0.920110 ad_ae AD_AE True
4 4 0 0.895189 af_ag_ah AF_AG_AH True
5 5 0 0.989580 ai AI True
6 6 0 0.925274 aj_ak AJ_AK True
7 7 0 0.992993 al AL True
8 8 14 0.900869 am_an_ao_ap AM_AN_AO_AP True
9 8 14 0.900869 aq AQ True
10 9 0 0.864614 ar_as_at AR_AS_AT True
11 10 6 0.951167 au_av_aw AU_AV_AW True
12 10 6 0.951167 ax AX True
13 11 0 0.878565 ay AY True
14 12 0 0.952186 az AZ True
15 13 0 0.927993 ba BA True
18 13 0 0.927993 bb BB True
21 13 0 0.927993 bc_bd BC_BD True
22 14 0 0.846257 be_bf_bg_bh_bi BE_BF_BG_BH_BI True
23 15 6 0.836623 bj_bk_bl BJ_BK_BL True
24 15 6 0.836623 bm BM True
25 16 14 0.927517 bn_bo_bp_bq BN_BO_BP_BQ True
26 16 14 0.927517 br BR True
27 17 0 0.836694 bs BS True
28 18 2 0.923195 bt_bu BT_BU True
29 18 2 0.923195 bv_bw BV_BW True
30 19 2 0.934106 bx_by BX_BY True
31 19 2 0.934106 bz BZ True
32 20 0 0.961485 ca_cb CA_CB True
33 21 0 0.915325 cc CC True
34 22 2 0.864886 cd_ce CD_CE True
35 22 2 0.864886 cf_cg CF_CG True
36 23 0 0.846102 ch_ci_cj_ck CH_CI_CJ_CK True
37 24 0 0.864755 cl_cm_cn_co_cp CL_CM_CN_CO_CP True
38 25 0 0.963236 cq CQ True
39 25 0 0.963236 cr_cs CR_CS True
40 26 0 0.747370 ct_cu_cv_cw_cx CT_CU_CV_CW_CX True
41 27 2 0.933560 cy_cz CY_CZ True
42 27 2 0.933560 da_db_dc DA_DB_DC True
43 28 0 0.927973 dd DD True
44 28 0 0.927973 de_df_dg DE_DF_DG True
45 29 0 0.795558 dh_di_dj DH_DI_DJ True
46 30 0 0.946279 dk_dl DK_DL True
47 31 0 0.906287 dm DM True
48 32 96 0.972943 dn_do_dp DN_DO_DP True
49 32 96 0.972943 dq DQ True
56 32 96 0.972943 dr DR True
63 33 0 0.973579 ds DS True
64 34 0 0.944016 dt DT True
65 34 0 0.944016 du DU True
66 34 0 0.944016 dv DV True

In [10]:
len(winners.success==True)/len(winners)


Out[10]:
1.0