In [1]:

    
%matplotlib inline
%load_ext autoreload
%autoreload 2
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..','..','..')))
from pudl import pudl, ferc1, eia923, settings, constants
from pudl import models, models_ferc1, models_eia923
from pudl import clean_eia923, clean_ferc1, clean_pudl
from pudl import analysis
import numpy as np
import pandas as pd
import sqlalchemy as sa
import matplotlib.pyplot as plt
import matplotlib as mpl
import itertools
import re
from IPython.display import HTML
pudl_engine = pudl.connect_db()
plt.style.use('ggplot')
mpl.rc('figure', figsize=(6,6))
mpl.rc('figure', dpi=150)
mpl.rc('animation', writer='ffmpeg')

Create a pair of test datasets



In [2]:

    
test_noise=[0.5, 0.25, 0.5, 1.0, 0.5, 0.1, 0.3]
eia_df, ferc_df = analysis.zippertestdata(gens=100, max_group_size=5, noise=test_noise, samples=13)

Aggregate test datasets as if they were FERC/EIA data for plotting



In [3]:

    
eia_pudl_df = eia_df.groupby(['pudl_plant_id','year']).agg(sum)
eia_pudl_df.columns = eia_pudl_df.columns.str.replace('series','eia')
ferc_pudl_df = ferc_df.groupby(['pudl_plant_id','year']).agg(sum)
ferc_pudl_df.columns = ferc_pudl_df.columns.str.replace('series','ferc')
both_pudl_df = ferc_pudl_df.merge(eia_pudl_df, left_index=True, right_index=True)

Plot the aggregated test data so we can compare it with FERC/EIA

See the ferc_to_eia_plant_correlations notebook for what the real data looks like.



In [4]:

    
for N in np.arange(0,len(test_noise)):
    ferc_series = 'ferc{}'.format(N)
    eia_series = 'eia{}'.format(N)
    noise_val = test_noise[N]
    
    plt.scatter(both_pudl_df[ferc_series], both_pudl_df[eia_series], s=5, alpha=0.2)
    R_2 = np.corrcoef(both_pudl_df[ferc_series],both_pudl_df[eia_series])[0,1]
    plt.title("Test Data: $\sigma={}, R^2={:0.3f}$".format(noise_val, R_2))
    plt.xlim((1e2,1e10))
    plt.ylim((1e2,1e10))
    plt.loglog()
    plt.show()



In [5]:

    
def plant_corr(pudl_df, x1, x2):
    gb = pudl_df.groupby('pudl_plant_id')
    corr = gb[[x1,x2]].corr().reset_index()
    corr = corr.drop(x2, axis=1)
    corr = corr[corr['level_1']==x2]
    corr = corr[['pudl_plant_id',x1]]
    corr = corr.rename(columns={x1:'corr'})
    corr = corr.dropna()
    return(corr)

def plot_plant_corr(corrs=[], titles=[]):
    """Given a list of DataFrames containing per-plant correlations of some variable,
    and a set of associated figure titles, create a figure displaying the distributions
    of those correlations."""
    
    nrows = len(corrs)
    assert nrows == len(titles)
    fig, axes = plt.subplots(ncols=1, nrows=nrows)
    fig.set_figwidth(10)
    fig.set_figheight(4*nrows)
    fig.set_dpi(150)
    for (ax, title, corr) in zip(axes, titles, corrs):
        ax.set_xlim(0.0,1.1)
        ax.set_title(title)
        ax.set_ylabel("# of Plants")
        ax.set_xlabel("$R^2$")
        ax.grid(b=True)
        ax.hist(corr['corr']**2, bins=200, range=(0,1));
    fig.tight_layout()
    plt.show()

Look at the distribtuion of per-plant variable correlations



In [6]:

    
corrs = []
titles = []
for N in range(len(test_noise)):
    eiaN = 'eia{}'.format(N)
    fercN = 'ferc{}'.format(N)
    titleN = 'Series {}'.format(N)
    corrs = corrs + [plant_corr(both_pudl_df, eiaN, fercN),]
    titles = titles + [titleN,]
    
plot_plant_corr(corrs=corrs, titles=titles)

Aggregate and correlate test datasets



In [7]:

    
agg_df = analysis.aggregate_by_pudl_plant(eia_df, ferc_df)

N_series = len(test_noise)
eia_cols = [ 'series{}_eia'.format(N) for N in range(N_series) ]
ferc_cols = [ 'series{}_ferc'.format(N) for N in range(N_series) ]
corr_cols = [ 'series{}_corr'.format(N) for N in range(N_series) ]

corr_df = analysis.correlate_by_generators(agg_df, eia_cols, ferc_cols, corr_cols)

Score candidate generator groupings



In [8]:

    
winners = analysis.score_all(corr_df, corr_cols)



In [9]:

    
winners









    Out[9]:







  
    
      
      pudl_plant_id
      candidate_id
      mean_corr
      eia_gen_subgroup
      ferc_plant_id
      success
    
  
  
    
      0
      0
      0
      0.873553
      aa
      AA
      True
    
    
      1
      1
      0
      0.984465
      ab
      AB
      True
    
    
      2
      2
      0
      0.951502
      ac
      AC
      True
    
    
      3
      3
      0
      0.920110
      ad_ae
      AD_AE
      True
    
    
      4
      4
      0
      0.895189
      af_ag_ah
      AF_AG_AH
      True
    
    
      5
      5
      0
      0.989580
      ai
      AI
      True
    
    
      6
      6
      0
      0.925274
      aj_ak
      AJ_AK
      True
    
    
      7
      7
      0
      0.992993
      al
      AL
      True
    
    
      8
      8
      14
      0.900869
      am_an_ao_ap
      AM_AN_AO_AP
      True
    
    
      9
      8
      14
      0.900869
      aq
      AQ
      True
    
    
      10
      9
      0
      0.864614
      ar_as_at
      AR_AS_AT
      True
    
    
      11
      10
      6
      0.951167
      au_av_aw
      AU_AV_AW
      True
    
    
      12
      10
      6
      0.951167
      ax
      AX
      True
    
    
      13
      11
      0
      0.878565
      ay
      AY
      True
    
    
      14
      12
      0
      0.952186
      az
      AZ
      True
    
    
      15
      13
      0
      0.927993
      ba
      BA
      True
    
    
      18
      13
      0
      0.927993
      bb
      BB
      True
    
    
      21
      13
      0
      0.927993
      bc_bd
      BC_BD
      True
    
    
      22
      14
      0
      0.846257
      be_bf_bg_bh_bi
      BE_BF_BG_BH_BI
      True
    
    
      23
      15
      6
      0.836623
      bj_bk_bl
      BJ_BK_BL
      True
    
    
      24
      15
      6
      0.836623
      bm
      BM
      True
    
    
      25
      16
      14
      0.927517
      bn_bo_bp_bq
      BN_BO_BP_BQ
      True
    
    
      26
      16
      14
      0.927517
      br
      BR
      True
    
    
      27
      17
      0
      0.836694
      bs
      BS
      True
    
    
      28
      18
      2
      0.923195
      bt_bu
      BT_BU
      True
    
    
      29
      18
      2
      0.923195
      bv_bw
      BV_BW
      True
    
    
      30
      19
      2
      0.934106
      bx_by
      BX_BY
      True
    
    
      31
      19
      2
      0.934106
      bz
      BZ
      True
    
    
      32
      20
      0
      0.961485
      ca_cb
      CA_CB
      True
    
    
      33
      21
      0
      0.915325
      cc
      CC
      True
    
    
      34
      22
      2
      0.864886
      cd_ce
      CD_CE
      True
    
    
      35
      22
      2
      0.864886
      cf_cg
      CF_CG
      True
    
    
      36
      23
      0
      0.846102
      ch_ci_cj_ck
      CH_CI_CJ_CK
      True
    
    
      37
      24
      0
      0.864755
      cl_cm_cn_co_cp
      CL_CM_CN_CO_CP
      True
    
    
      38
      25
      0
      0.963236
      cq
      CQ
      True
    
    
      39
      25
      0
      0.963236
      cr_cs
      CR_CS
      True
    
    
      40
      26
      0
      0.747370
      ct_cu_cv_cw_cx
      CT_CU_CV_CW_CX
      True
    
    
      41
      27
      2
      0.933560
      cy_cz
      CY_CZ
      True
    
    
      42
      27
      2
      0.933560
      da_db_dc
      DA_DB_DC
      True
    
    
      43
      28
      0
      0.927973
      dd
      DD
      True
    
    
      44
      28
      0
      0.927973
      de_df_dg
      DE_DF_DG
      True
    
    
      45
      29
      0
      0.795558
      dh_di_dj
      DH_DI_DJ
      True
    
    
      46
      30
      0
      0.946279
      dk_dl
      DK_DL
      True
    
    
      47
      31
      0
      0.906287
      dm
      DM
      True
    
    
      48
      32
      96
      0.972943
      dn_do_dp
      DN_DO_DP
      True
    
    
      49
      32
      96
      0.972943
      dq
      DQ
      True
    
    
      56
      32
      96
      0.972943
      dr
      DR
      True
    
    
      63
      33
      0
      0.973579
      ds
      DS
      True
    
    
      64
      34
      0
      0.944016
      dt
      DT
      True
    
    
      65
      34
      0
      0.944016
      du
      DU
      True
    
    
      66
      34
      0
      0.944016
      dv
      DV
      True



In [10]:

    
len(winners.success==True)/len(winners)









    Out[10]:





1.0

	pudl_plant_id	candidate_id	mean_corr	eia_gen_subgroup	ferc_plant_id	success
0	0	0	0.873553	aa	AA	True
1	1	0	0.984465	ab	AB	True
2	2	0	0.951502	ac	AC	True
3	3	0	0.920110	ad_ae	AD_AE	True
4	4	0	0.895189	af_ag_ah	AF_AG_AH	True
5	5	0	0.989580	ai	AI	True
6	6	0	0.925274	aj_ak	AJ_AK	True
7	7	0	0.992993	al	AL	True
8	8	14	0.900869	am_an_ao_ap	AM_AN_AO_AP	True
9	8	14	0.900869	aq	AQ	True
10	9	0	0.864614	ar_as_at	AR_AS_AT	True
11	10	6	0.951167	au_av_aw	AU_AV_AW	True
12	10	6	0.951167	ax	AX	True
13	11	0	0.878565	ay	AY	True
14	12	0	0.952186	az	AZ	True
15	13	0	0.927993	ba	BA	True
18	13	0	0.927993	bb	BB	True
21	13	0	0.927993	bc_bd	BC_BD	True
22	14	0	0.846257	be_bf_bg_bh_bi	BE_BF_BG_BH_BI	True
23	15	6	0.836623	bj_bk_bl	BJ_BK_BL	True
24	15	6	0.836623	bm	BM	True
25	16	14	0.927517	bn_bo_bp_bq	BN_BO_BP_BQ	True
26	16	14	0.927517	br	BR	True
27	17	0	0.836694	bs	BS	True
28	18	2	0.923195	bt_bu	BT_BU	True
29	18	2	0.923195	bv_bw	BV_BW	True
30	19	2	0.934106	bx_by	BX_BY	True
31	19	2	0.934106	bz	BZ	True
32	20	0	0.961485	ca_cb	CA_CB	True
33	21	0	0.915325	cc	CC	True
34	22	2	0.864886	cd_ce	CD_CE	True
35	22	2	0.864886	cf_cg	CF_CG	True
36	23	0	0.846102	ch_ci_cj_ck	CH_CI_CJ_CK	True
37	24	0	0.864755	cl_cm_cn_co_cp	CL_CM_CN_CO_CP	True
38	25	0	0.963236	cq	CQ	True
39	25	0	0.963236	cr_cs	CR_CS	True
40	26	0	0.747370	ct_cu_cv_cw_cx	CT_CU_CV_CW_CX	True
41	27	2	0.933560	cy_cz	CY_CZ	True
42	27	2	0.933560	da_db_dc	DA_DB_DC	True
43	28	0	0.927973	dd	DD	True
44	28	0	0.927973	de_df_dg	DE_DF_DG	True
45	29	0	0.795558	dh_di_dj	DH_DI_DJ	True
46	30	0	0.946279	dk_dl	DK_DL	True
47	31	0	0.906287	dm	DM	True
48	32	96	0.972943	dn_do_dp	DN_DO_DP	True
49	32	96	0.972943	dq	DQ	True
56	32	96	0.972943	dr	DR	True
63	33	0	0.973579	ds	DS	True
64	34	0	0.944016	dt	DT	True
65	34	0	0.944016	du	DU	True
66	34	0	0.944016	dv	DV	True