In [14]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib import urlretrieve
import zipfile
import fnmatch
In [3]:
url = 'https://www.eia.gov/electricity/data/eia923'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
table = soup.find_all('table', attrs={'class': 'simpletable'})[0]
In [5]:
fns = []
links = []
for row in table.find_all('td', attrs={'align': 'center'}):
href = row.a.get('href')
fns.append(href.split('/')[-1])
links.append(url + '/' + href)
In [6]:
fns
Out[6]:
In [7]:
path = os.path.join('Data storage', '923 raw data')
os.mkdir(path)
In [9]:
base_path = os.path.join('Data storage', '923 raw data')
for fn, link in zip(fns, links):
path = os.path.join(base_path, fn)
urlretrieve(link, filename=path)
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
Out[9]:
In [13]:
base_path = os.path.join('Data storage', '923 raw data')
for fn in fns:
zip_path = os.path.join(base_path, fn)
target_folder = os.path.join(base_path, fn.split('.')[0])
with zipfile.ZipFile(zip_path,"r") as zip_ref:
zip_ref.extractall(target_folder)
In [70]:
matches = []
for root, dirnames, filenames in os.walk(base_path):
for filename in fnmatch.filter(filenames, '*2_3*'):
matches.append(os.path.join(root, filename))
for filename in fnmatch.filter(filenames, 'eia923*'):
matches.append(os.path.join(root, filename))
for filename in fnmatch.filter(filenames, '*906920*.xls'):
matches.append(os.path.join(root, filename))
In [71]:
matches
Out[71]:
In [84]:
def clip_at_header(df, year):
"""Find the appropriate header row, only keep Plant Id and NERC Region columns,
and add a column with the year"""
header = df.loc[df.iloc[:, 8].str.contains('NERC').replace(np.nan, False)].index[0]
# print header
# Drop rows above header
df = df.loc[header + 1:, :]
# Only keep columns 0 (plant id) and 8 (NERC Region)
df = df.iloc[:, [0, 8]]
df.columns = ['Plant Id', 'NERC Region']
df.reset_index(inplace=True, drop=True)
df.dropna(inplace=True)
df['Plant Id'] = pd.to_numeric(df['Plant Id'])
df['Year'] = year
return df
In [85]:
df_list = []
for fn in matches:
year = int(fn.split('/')[-2].split('_')[-1])
df = pd.read_excel(fn)
df_list.append(clip_at_header(df, year))
In [186]:
nerc_assignment = pd.concat(df_list)
nerc_assignment.reset_index(inplace=True, drop=True)
nerc_assignment.drop_duplicates(inplace=True)
In [187]:
nerc_assignment['Year'] = pd.to_numeric(nerc_assignment['Year'])
In [188]:
nerc_region = nerc_assignment['NERC Region']
nerc_year = nerc_assignment['Year']
In [189]:
for region in nerc_assignment['NERC Region'].unique():
years = nerc_assignment.loc[nerc_region == region, 'Year'].unique()
print (region, list(years))
In [190]:
path = os.path.join('Data storage', 'Plant NERC regions.csv')
nerc_assignment.to_csv(path, index=False)
In [183]:
region_dict = dict(nerc_assignment.loc[nerc_assignment['Year'] == 2006,
['Plant Id', 'NERC Region']].values)
In [184]:
regions = ['ECAR', 'MAPP', 'MAIN', 'MAAC']
years = range(2001, 2006)
nerc_assignment.loc[(nerc_region.isin(regions)) &
(nerc_assignment['Year'].isin(years)),
'Corrected Region'] = nerc_assignment.loc[(nerc_region.isin(regions)) &
(nerc_assignment['Year'].isin(years)),
'Plant Id'].map(region_dict)
In [185]:
nerc_assignment.head()
Out[185]:
In [162]:
nerc_assignment.loc[(nerc_assignment['Year'] == 2006) &
(nerc_assignment['Plant Id'] == 3), 'NERC Region'].values[0]
Out[162]:
In [167]:
nerc_assignment.loc[(nerc_assignment['Plant Id'] == 3) &
(nerc_assignment['Year'].isin(years)), 'Corrected Region'] = 'SERC'
In [168]:
nerc_assignment.loc[(nerc_assignment['Plant Id'] == 3) &
(nerc_assignment['Year'].isin(years)), 'Corrected Region']
Out[168]:
In [170]:
nerc_assignment.loc[nerc_assignment['Year'] == 2002].head()
Out[170]:
In [115]:
nerc_assignment.index = pd.MultiIndex.from_arrays([nerc_assignment['Year'],
nerc_assignment['Plant Id']])
In [165]:
nerc_assignment.head()
Out[165]:
In [136]:
idx = pd.IndexSlice
In [137]:
regions_2006 = nerc_assignment.loc[idx[2006, :], 'NERC Region'].copy()
In [145]:
regions_2006 = nerc_assignment.xs(2006, level='Year')['NERC Region']
In [146]:
regions_2006
Out[146]:
In [151]:
for year in range(2001, 2006):
nerc_assignment.xs(year, level='Year')['Corrected NERC'] = regions_2006
In [152]:
nerc_assignment
Out[152]:
In [ ]: