Download and get info from all EIA-923 Excel files

This setup downloads all the zip files, extracts the contents, and identifies the correct header row in the correct file. I'm only getting 2 columns of data (plant id and NERC region), but it can be modified for other data.


In [14]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib import urlretrieve
import zipfile
import fnmatch

In [3]:
url = 'https://www.eia.gov/electricity/data/eia923'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

table = soup.find_all('table', attrs={'class': 'simpletable'})[0]

In [5]:
fns = []
links = []
for row in table.find_all('td', attrs={'align': 'center'}):
    href = row.a.get('href')
    fns.append(href.split('/')[-1])
    links.append(url + '/' + href)

In [6]:
fns


Out[6]:
['f923_2017.zip',
 'f923_2016.zip',
 'f923_2015.zip',
 'f923_2014.zip',
 'f923_2013.zip',
 'f923_2012.zip',
 'f923_2011.zip',
 'f923_2010.zip',
 'f923_2009.zip',
 'f923_2008.zip',
 'f906920_2007.zip',
 'f906920_2006.zip',
 'f906920_2005.zip',
 'f906920_2004.zip',
 'f906920_2003.zip',
 'f906920_2002.zip',
 'f906920_2001.zip']

In [7]:
path = os.path.join('Data storage', '923 raw data')
os.mkdir(path)

In [9]:
base_path = os.path.join('Data storage', '923 raw data')
for fn, link in zip(fns, links):
    path = os.path.join(base_path, fn)
    urlretrieve(link, filename=path)


Out[9]:
('Data storage/923 raw data/f923_2017.zip',
 <httplib.HTTPMessage instance at 0x10ecbb9e0>)
Out[9]:
('Data storage/923 raw data/f923_2016.zip',
 <httplib.HTTPMessage instance at 0x10ecc07e8>)
Out[9]:
('Data storage/923 raw data/f923_2015.zip',
 <httplib.HTTPMessage instance at 0x10ecc0b00>)
Out[9]:
('Data storage/923 raw data/f923_2014.zip',
 <httplib.HTTPMessage instance at 0x10ecc0e18>)
Out[9]:
('Data storage/923 raw data/f923_2013.zip',
 <httplib.HTTPMessage instance at 0x10ecbb998>)
Out[9]:
('Data storage/923 raw data/f923_2012.zip',
 <httplib.HTTPMessage instance at 0x10ecc0950>)
Out[9]:
('Data storage/923 raw data/f923_2011.zip',
 <httplib.HTTPMessage instance at 0x10ecc0c68>)
Out[9]:
('Data storage/923 raw data/f923_2010.zip',
 <httplib.HTTPMessage instance at 0x10ecc0ab8>)
Out[9]:
('Data storage/923 raw data/f923_2009.zip',
 <httplib.HTTPMessage instance at 0x10ecc0dd0>)
Out[9]:
('Data storage/923 raw data/f923_2008.zip',
 <httplib.HTTPMessage instance at 0x10ecbb6c8>)
Out[9]:
('Data storage/923 raw data/f906920_2007.zip',
 <httplib.HTTPMessage instance at 0x10ecc08c0>)
Out[9]:
('Data storage/923 raw data/f906920_2006.zip',
 <httplib.HTTPMessage instance at 0x10ecc09e0>)
Out[9]:
('Data storage/923 raw data/f906920_2005.zip',
 <httplib.HTTPMessage instance at 0x10ecc3248>)
Out[9]:
('Data storage/923 raw data/f906920_2004.zip',
 <httplib.HTTPMessage instance at 0x10ecc3368>)
Out[9]:
('Data storage/923 raw data/f906920_2003.zip',
 <httplib.HTTPMessage instance at 0x10ecc3488>)
Out[9]:
('Data storage/923 raw data/f906920_2002.zip',
 <httplib.HTTPMessage instance at 0x10ecc3a28>)
Out[9]:
('Data storage/923 raw data/f906920_2001.zip',
 <httplib.HTTPMessage instance at 0x10ecc3b48>)

In [13]:
base_path = os.path.join('Data storage', '923 raw data')
for fn in fns:
    zip_path = os.path.join(base_path, fn)
    target_folder = os.path.join(base_path, fn.split('.')[0])
    
    with zipfile.ZipFile(zip_path,"r") as zip_ref:
        zip_ref.extractall(target_folder)

In [70]:
matches = []
for root, dirnames, filenames in os.walk(base_path):
    for filename in fnmatch.filter(filenames, '*2_3*'):
        matches.append(os.path.join(root, filename))
    for filename in fnmatch.filter(filenames, 'eia923*'):
        matches.append(os.path.join(root, filename))
    for filename in fnmatch.filter(filenames, '*906920*.xls'):
        matches.append(os.path.join(root, filename))

In [71]:
matches


Out[71]:
['Data storage/923 raw data/f906920_2001/f906920y2001.xls',
 'Data storage/923 raw data/f906920_2002/f906920y2002.xls',
 'Data storage/923 raw data/f906920_2003/f906920_2003.xls',
 'Data storage/923 raw data/f906920_2004/f906920_2004.xls',
 'Data storage/923 raw data/f906920_2005/f906920_2005.xls',
 'Data storage/923 raw data/f906920_2006/f906920_2006.xls',
 'Data storage/923 raw data/f906920_2007/f906920_2007.xls',
 'Data storage/923 raw data/f923_2008/eia923December2008.xls',
 'Data storage/923 raw data/f923_2009/EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS',
 'Data storage/923 raw data/f923_2010/EIA923 SCHEDULES 2_3_4_5 Final 2010.xls',
 'Data storage/923 raw data/f923_2011/EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx',
 'Data storage/923 raw data/f923_2012/EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx',
 'Data storage/923 raw data/f923_2013/EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx',
 'Data storage/923 raw data/f923_2014/EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx',
 'Data storage/923 raw data/f923_2015/EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx',
 'Data storage/923 raw data/f923_2016/EIA923_Schedules_2_3_4_5_M_12_2016.xlsx',
 'Data storage/923 raw data/f923_2017/EIA923_Schedules_2_3_4_5_M_04_2017_22JUN2017.xlsx']

In [84]:
def clip_at_header(df, year):
    """Find the appropriate header row, only keep Plant Id and NERC Region columns,
     and add a column with the year"""
    header = df.loc[df.iloc[:, 8].str.contains('NERC').replace(np.nan, False)].index[0]
#     print header
    # Drop rows above header
    df = df.loc[header + 1:, :]
    # Only keep columns 0 (plant id) and 8 (NERC Region)
    df = df.iloc[:, [0, 8]]
    df.columns = ['Plant Id', 'NERC Region']
    df.reset_index(inplace=True, drop=True)
    df.dropna(inplace=True)
    df['Plant Id'] = pd.to_numeric(df['Plant Id'])
    df['Year'] = year
    return df

In [85]:
df_list = []
for fn in matches:
    year = int(fn.split('/')[-2].split('_')[-1])
    df = pd.read_excel(fn)
    df_list.append(clip_at_header(df, year))

In [186]:
nerc_assignment = pd.concat(df_list)
nerc_assignment.reset_index(inplace=True, drop=True)
nerc_assignment.drop_duplicates(inplace=True)

In [187]:
nerc_assignment['Year'] = pd.to_numeric(nerc_assignment['Year'])

In [188]:
nerc_region = nerc_assignment['NERC Region']
nerc_year = nerc_assignment['Year']

In [189]:
for region in nerc_assignment['NERC Region'].unique():
    years = nerc_assignment.loc[nerc_region == region, 'Year'].unique()
    print (region, list(years))


(u'SERC', [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'ECAR', [2001, 2002, 2003, 2004, 2005])
(u'MAPP', [2001, 2002, 2003, 2004])
(u'NPCC', [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'SPP', [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'ERCOT', [2001, 2002, 2003, 2004, 2005, 2006])
(u'FRCC', [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'MAIN', [2001, 2002, 2003, 2004, 2005])
(u'WECC', [2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'ASCC', [2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'MAAC', [2002, 2003, 2004, 2005])
(u'HICC', [2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'WSCC', [2002])
(u'MRO', [2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'RFC', [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
(u'TRE', [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])

Export original data


In [190]:
path = os.path.join('Data storage', 'Plant NERC regions.csv')
nerc_assignment.to_csv(path, index=False)

Assign NERC region to pre-2005/6 facilities based on where they ended up

Somehow I'm having trouble doing this


In [183]:
region_dict = dict(nerc_assignment.loc[nerc_assignment['Year'] == 2006,
                                       ['Plant Id', 'NERC Region']].values)

In [184]:
regions = ['ECAR', 'MAPP', 'MAIN', 'MAAC']
years = range(2001, 2006)
nerc_assignment.loc[(nerc_region.isin(regions)) &
                    (nerc_assignment['Year'].isin(years)), 
                    'Corrected Region'] = nerc_assignment.loc[(nerc_region.isin(regions)) &
                                                      (nerc_assignment['Year'].isin(years)),
                                                      'Plant Id'].map(region_dict)

In [185]:
nerc_assignment.head()


Out[185]:
Plant Id NERC Region Year Corrected Region
0 2 SERC 2001 NaN
1 3 SERC 2001 NaN
4 4 SERC 2001 NaN
5 5 SERC 2001 NaN
7 7 SERC 2001 NaN

In [162]:
nerc_assignment.loc[(nerc_assignment['Year'] == 2006) &
                                 (nerc_assignment['Plant Id'] == 3), 'NERC Region'].values[0]


Out[162]:
u'SERC'

In [167]:
nerc_assignment.loc[(nerc_assignment['Plant Id'] == 3) & 
                            (nerc_assignment['Year'].isin(years)), 'Corrected Region'] = 'SERC'

In [168]:
nerc_assignment.loc[(nerc_assignment['Plant Id'] == 3) & 
                            (nerc_assignment['Year'].isin(years)), 'Corrected Region']


Out[168]:
1        SERC
1583     SERC
9921     SERC
19264    SERC
28525    SERC
Name: Corrected Region, dtype: object

In [170]:
nerc_assignment.loc[nerc_assignment['Year'] == 2002].head()


Out[170]:
Plant Id NERC Region Year Corrected Region
1582 2 SERC 2002 NaN
1583 3 SERC 2002 SERC
1586 4 SERC 2002 NaN
1587 7 SERC 2002 NaN
1590 8 SERC 2002 NaN

In [115]:
nerc_assignment.index = pd.MultiIndex.from_arrays([nerc_assignment['Year'],
                                                   nerc_assignment['Plant Id']])

In [165]:
nerc_assignment.head()


Out[165]:
Plant Id NERC Region Year Corrected Region
0 2 SERC 2001 NaN
1 3 SERC 2001 NaN
4 4 SERC 2001 NaN
5 5 SERC 2001 NaN
7 7 SERC 2001 NaN

In [136]:
idx = pd.IndexSlice

In [137]:
regions_2006 = nerc_assignment.loc[idx[2006, :], 'NERC Region'].copy()

In [145]:
regions_2006 = nerc_assignment.xs(2006, level='Year')['NERC Region']

In [146]:
regions_2006


Out[146]:
Plant Id
2        SERC
3        SERC
4        SERC
7        SERC
8        SERC
9        WECC
10       SERC
11       SERC
12       SERC
13       SERC
14       SERC
15       SERC
16       SERC
17       SERC
18       SERC
19       SERC
20       SERC
21       SERC
26       SERC
30        MRO
34       WECC
38       SERC
46       SERC
47       SERC
48       SERC
49       SERC
50       SERC
51        SPP
53       SERC
54       SERC
         ... 
56544     MRO
56545    SERC
56554    WECC
56557     SPP
56558     SPP
56559     SPP
56561     SPP
56562     SPP
56570    WECC
56571     RFC
56577     MRO
56578     MRO
56579     MRO
56580     MRO
56581     MRO
56582     MRO
56583     MRO
56584     MRO
56585     MRO
56586     MRO
56587     MRO
56588     MRO
56589     MRO
56590     MRO
56591    WECC
56595     MRO
56597    SERC
56598    SERC
56599    SERC
56600    SERC
Name: NERC Region, dtype: object

In [151]:
for year in range(2001, 2006):
    nerc_assignment.xs(year, level='Year')['Corrected NERC'] = regions_2006

In [152]:
nerc_assignment


Out[152]:
Plant Id NERC Region Year Corrected NERC
Year Plant Id
2001 2 2 SERC 2001 NaN
3 3 SERC 2001 NaN
4 4 SERC 2001 NaN
5 5 SERC 2001 NaN
7 7 SERC 2001 NaN
8 8 SERC 2001 NaN
10 10 SERC 2001 NaN
11 11 SERC 2001 NaN
12 12 SERC 2001 NaN
13 13 SERC 2001 NaN
14 14 SERC 2001 NaN
15 15 SERC 2001 NaN
16 16 SERC 2001 NaN
17 17 SERC 2001 NaN
18 18 SERC 2001 NaN
19 19 SERC 2001 NaN
20 20 SERC 2001 NaN
21 21 SERC 2001 NaN
26 26 SERC 2001 NaN
38 38 SERC 2001 NaN
46 46 SERC 2001 NaN
47 47 SERC 2001 NaN
48 48 SERC 2001 NaN
49 49 SERC 2001 NaN
50 50 SERC 2001 NaN
53 53 SERC 2001 NaN
54 54 ECAR 2001 NaN
55 55 SERC 2001 NaN
56 56 SERC 2001 NaN
60 60 MAPP 2001 NaN
... ... ... ... ... ...
2017 60145 60145 WECC 2017 NaN
60149 60149 SERC 2017 NaN
60152 60152 WECC 2017 NaN
60185 60185 WECC 2017 NaN
60186 60186 WECC 2017 NaN
60210 60210 TRE 2017 NaN
60218 60218 SPP 2017 NaN
60256 60256 SPP 2017 NaN
60258 60258 WECC 2017 NaN
60263 60263 WECC 2017 NaN
60278 60278 NPCC 2017 NaN
60304 60304 WECC 2017 NaN
60307 60307 WECC 2017 NaN
60308 60308 WECC 2017 NaN
60329 60329 RFC 2017 NaN
60349 60349 WECC 2017 NaN
60352 60352 WECC 2017 NaN
60354 60354 MRO 2017 NaN
60366 60366 TRE 2017 NaN
60372 60372 TRE 2017 NaN
60383 60383 SERC 2017 NaN
60390 60390 RFC 2017 NaN
60414 60414 SPP 2017 NaN
60432 60432 WECC 2017 NaN
60436 60436 TRE 2017 NaN
60441 60441 WECC 2017 NaN
60467 60467 WECC 2017 NaN
60470 60470 RFC 2017 NaN
60502 60502 TRE 2017 NaN
60506 60506 TRE 2017 NaN

85768 rows × 4 columns


In [ ]: