Verify that the merge result is as expected. When I checked the number of rows in the EPA data for a 6-month period, it was larger than the number of rows in the corresponding post-merge 6-month period. This suggests that some facilities listed in the EPA data are missing from the EIA data, but I haven't verified this yet.
NaN values also need to be replaced with 0's.
Numbers are being displayed in scientific notation.
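A rough sketch of how these checks and fixes might look. It uses epaDict, aggEIADict, and fullData, which are built further down in this notebook, so a cell like this would have to run after them; the year and period keys are just examples.
In [ ]:
# Plants that appear in the EPA hourly data but not in the EIA 923 data for the same year;
# a non-empty set would explain why the merged data has fewer facilities than the EPA data
epa_ids = set(epaDict['2015 July-Dec']['PLANT_ID'])
eia_ids = set(aggEIADict[2015]['PLANT_ID'])
print len(epa_ids - eia_ids), 'EPA facilities are missing from EIA 923'

# Replace NaN values with 0's in the merged data
fullData = {key: df.fillna(0) for key, df in fullData.iteritems()}

# Display floats in fixed-point notation instead of scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)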
In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import re
import cPickle as pickle
import gzip
import seaborn as sns
In [5]:
#Iterate through the directory to find all the files to import
#Modified so that it also works on macs
path = os.path.join('EIA Data', '923-No_Header')
full_path = os.path.join(path, '*.*')
eiaNames = os.listdir(path)
#Rename the keys for easier merging later
fileNameMap = {'EIA923 SCHEDULES 2_3_4_5 Final 2010.xls':2010,
'EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS':2009,
'eia923December2008.xls':2008,
'EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx':2011,
'EIA923_Schedules_2_3_4_5_2012_Final_Release_12.04.2013.xlsx':2012,
'EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx':2013,
'EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx':2014,
'EIA923_Schedules_2_3_4_5_M_12_2015_Final.xlsx':2015,
'f906920_2007.xls':2007}
#Load the files into data frames, one df per file
eiaDict = {fileNameMap[fn]:pd.read_excel(os.path.join(path, fn)) for fn in eiaNames}
eiaDict = {key:val[val["NERC Region"] == "TRE"] for key, val in eiaDict.iteritems()}
The Excel files use different column names, so we need to standardize them across all years.
In [6]:
#Dict of values to replace to standardize column names across all dataframes
monthDict = {"JANUARY":"JAN",
"FEBRUARY":"FEB",
"MARCH":"MAR",
"APRIL":"APR",
"MAY":"MAY",
"JUNE":"JUN",
"JULY":"JUL",
"AUGUST":"AUG",
"SEPTEMBER":"SEP",
"OCTOBER":"OCT",
"NOVEMBER":"NOV",
"DECEMBER":"DEC"}
replaceDict = {"ELECTRIC":"ELEC",
"&":"AND",
"I.D.":"ID",
"MMBTUPER":"MMBTU_PER"}
#Add "MMBTUMON" : "MMBTU_MON" to be replaced
for month in monthDict.values():
replaceDict["MMBTU"+month] = "MMBTU_" + month
#Replace the column name
def rename(col):
for old, new in monthDict.iteritems():
col = col.replace(old, new)
for old, new in replaceDict.iteritems():
col = col.replace(old, new)
col = col.replace("MMBTUS", "MMBTU")
return col
#Iterate through each column name of each dataframe to standardize
for key, df in eiaDict.iteritems():
colNames = [name.replace("\n", "_").replace(" ", "_").strip().upper() for name in df.columns]
colNames = [rename(col) for col in colNames]
eiaDict[key].columns = colNames
Define which columns we need to sum, and which columns we don't need to sum but still need to keep.
Note: If we don't care about the monthly data we can delete the second block of code.
In [7]:
#Define the columns that are necessary but are not summable
allCols = eiaDict[fileNameMap.values()[0]].columns
nonSumCols = ["PLANT_ID", "PLANT_NAME", "YEAR"]
#Define the columns that contain the year's totals (Used to calc fuel type %)
yearCols = ["TOTAL_FUEL_CONSUMPTION_QUANTITY", "ELEC_FUEL_CONSUMPTION_QUANTITY",
"TOTAL_FUEL_CONSUMPTION_MMBTU", "ELEC_FUEL_CONSUMPTION_MMBTU",
"NET_GENERATION_(MEGAWATTHOURS)"]
#Define the columns that are necessary and summable
sumCols = []
sumCols.extend(yearCols)
# regex = re.compile(r"^ELEC_QUANTITY_.*")
# sumCols.extend([col for col in allCols if regex.search(col)])
regex = re.compile(r"^MMBTU_PER_UNIT_.*")
sumCols.extend([col for col in allCols if regex.search(col)])
regex = re.compile(r"^TOT_MMBTU_.*")
sumCols.extend([col for col in allCols if regex.search(col)])
regex = re.compile(r"^ELEC_MMBTUS_.*")
sumCols.extend([col for col in allCols if regex.search(col)])
regex = re.compile(r"^NETGEN_.*")
sumCols.extend([col for col in allCols if regex.search(col)])
Get a list of all the different fuel type codes. If we don't care about all of them, we can just hardcode the list.
In [8]:
fuelTypes = []
fuelTypes.extend([fuelType for df in eiaDict.values() for fuelType in df["REPORTED_FUEL_TYPE_CODE"].tolist()])
fuelTypes = set(fuelTypes)
In [9]:
fuelTypes
Out[9]:
Three steps to aggregate by facility and calculate the percentage of each fuel type. This will take a few minutes to run; a vectorized alternative is sketched after the cell below.
The end result is aggEIADict.
In [10]:
#Actually calculate the % type for each facility grouping
def calcPerc(group, aggGroup, fuelType, col):
#Check to see if the facility has a record for the fuel type, and if the total column > 0
if len(group[group["REPORTED_FUEL_TYPE_CODE"] == fuelType]) > 0 and aggGroup[col] > 0:
#sum because a facility may have multiple rows with the same fuel type
return float((group[group["REPORTED_FUEL_TYPE_CODE"] == fuelType][col]).sum())/aggGroup[col]
else:
return 0
#Perform the aggregation on facility level
def aggAndCalcPerc(group):
aggGroup = group.iloc[0][nonSumCols] #Get the non-agg columns
aggGroup = aggGroup.append(group[sumCols].sum()) #Aggregate the agg columns and append to non-agg
percCols = {col + " %" + fuelType:calcPerc(group, aggGroup, fuelType, col) for col in yearCols for fuelType in fuelTypes}
aggGroup = aggGroup.append(pd.Series(percCols))
return aggGroup
#Iterate through each dataframe to perform aggregation by facility
aggEIADict = dict()
for key, df in eiaDict.iteritems():
gb = df.groupby(by="PLANT_ID")
#aggGroup will be a list of pandas Series, each one representing a facility
aggGroup = [aggAndCalcPerc(gb.get_group(group)) for group in gb.groups]
aggEIADict[key] = pd.DataFrame(aggGroup)
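Because the loop above iterates over facilities in Python, it is slow. A vectorized alternative using groupby/unstack is sketched below; it is only a sketch (it assumes the eiaDict, sumCols, yearCols, and fuelTypes defined above) and is not used in the rest of the notebook - aggEIADict from the cell above remains the result that everything else relies on.
In [ ]:
# Sketch of a vectorized equivalent of aggAndCalcPerc (not run below)
def agg_facility_fast(df):
    totals = df.groupby('PLANT_ID')[sumCols].sum()
    names = df.groupby('PLANT_ID')[['PLANT_NAME', 'YEAR']].first()
    by_fuel = df.groupby(['PLANT_ID', 'REPORTED_FUEL_TYPE_CODE'])[yearCols].sum()
    for col in yearCols:
        perc = by_fuel[col].unstack(fill_value=0)
        perc = perc.reindex(columns=sorted(fuelTypes), fill_value=0)
        # divide by the facility total; totals of 0 give NaN and are then set to 0,
        # matching the behaviour of calcPerc above
        perc = perc.div(totals[col].where(totals[col] > 0), axis=0).fillna(0)
        perc.columns = [col + ' %' + fuelType for fuelType in perc.columns]
        totals = totals.join(perc)
    return names.join(totals).reset_index()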
In [ ]:
aggEIADict[2007].head()
In [ ]:
aggEIADict[2015].head()
In [ ]:
filename = 'EIA 923.pkl'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
pickle.dump(aggEIADict, open(fullpath, 'wb'))
In [11]:
all923 = pd.concat(aggEIADict)
In [12]:
all923.head()
Out[12]:
In [13]:
all923.reset_index(drop=True, inplace=True)
In [14]:
# Check column numbers to use in the function below
all923.iloc[1,1:27]
Out[14]:
In [14]:
def top_fuel(row):
#Return the code of the fuel with the largest share of electric heat input ([29:] strips the 'ELEC_FUEL_CONSUMPTION_MMBTU %' prefix from the column name)
try:
fuel = row.iloc[1:27].idxmax()[29:]
except:
return None
return fuel
In [15]:
all923['FUEL'] = all923.apply(top_fuel, axis=1)
In [16]:
all923.head()
Out[16]:
In [17]:
fossil923 = all923.loc[all923['FUEL'].isin(['DFO', 'LIG', 'NG', 'PC', 'SUB'])]
In [19]:
filename = 'Fossil EIA 923.csv'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
fossil923.to_csv(fullpath)
In [18]:
#Read the EPA files into a dataframe
path2 = os.path.join('EPA air markets')
epaNames = os.listdir(path2)
filePaths = {dn:os.path.join(path2, dn, "*.txt") for dn in epaNames}
filePaths = {dn:glob.glob(val) for dn, val in filePaths.iteritems()}
epaDict = {key:pd.read_csv(fp, index_col = False) for key, val in filePaths.iteritems() for fp in val}
First rename the columns so we can merge on PLANT_ID later, then convert the DATE column to a datetime object and add a DATETIME column that combines the date and hour.
In [19]:
#Upper-case and strip the column names, and rename the facility ID column to PLANT_ID for merging
for key, df in epaDict.iteritems():
colNames = [name.upper().strip() for name in df.columns]
colNames[colNames.index("FACILITY ID (ORISPL)")] = "PLANT_ID"
epaDict[key].columns = colNames
#Convert DATE to datetime object
#Add new column DATETIME with both date and hour
for key, df in epaDict.iteritems():
epaDict[key]["DATE"] = pd.to_datetime(df["DATE"])
epaDict[key]['DATETIME'] = df['DATE'] + pd.to_timedelta(df['HOUR'], unit='h')
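A quick spot check that the renamed columns and the new DATETIME column look right:
In [ ]:
epaDict['2015 July-Dec'][['PLANT_ID', 'DATE', 'HOUR', 'DATETIME']].head()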
The DataFrames in epaDict contain all power plants in Texas. We can filter on NERC REGION so that it only includes ERCOT.
In [20]:
set(epaDict['2015 July-Dec'].loc[:,'NERC REGION'])
Out[20]:
In [21]:
#Boolean filter to only keep ERCOT plants
for key, df in epaDict.iteritems():
epaDict[key] = df[df["NERC REGION"] == "ERCOT"].reset_index(drop = True)
In [22]:
set(epaDict['2015 July-Dec'].loc[:,'NERC REGION'])
Out[22]:
In [23]:
epaDict['2015 July-Dec'].head()
Out[23]:
In [26]:
# pickle with gzip, from http://stackoverflow.com/questions/18474791/decreasing-the-size-of-cpickle-objects
def save_zipped_pickle(obj, filename, protocol=-1):
with gzip.open(filename, 'wb') as f:
pickle.dump(obj, f, protocol)
In [27]:
filename = 'EPA hourly dictionary.pgz'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
save_zipped_pickle(epaDict, fullpath)
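To read the compressed pickle back later, a matching loader along the lines of the same Stack Overflow answer:
In [ ]:
# Load an object saved with save_zipped_pickle
def load_zipped_pickle(filename):
    with gzip.open(filename, 'rb') as f:
        return pickle.load(f)

# e.g. epaDict = load_zipped_pickle(fullpath)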
In [28]:
df = epaDict['2015 July-Dec']
In [29]:
df.head()
Out[29]:
In [30]:
set(df['PLANT_ID'])
Out[30]:
In [31]:
df_temp = df[df['PLANT_ID'].isin([127, 298, 3439])].fillna(0)
In [32]:
df_temp.head()
Out[32]:
In [ ]:
g = sns.FacetGrid(df_temp, col='PLANT_ID')
g.map(plt.plot, 'DATETIME', 'GROSS LOAD (MW)')
g.set_xticklabels(rotation=30)
path = os.path.join('..', 'Exploratory visualization', 'Midterm figures', 'Sample hourly load.svg')
plt.savefig(path)
Switch to an inner join?
No need to join. We can keep them as separate datasets, since one is hourly and the other is annual/monthly. Instead, create a clustering dataframe indexed by all plant IDs (from the EPA hourly data) and add columns for the clustering variables. Calculate the inputs in separate dataframes - for example, calculate ramp rate values from the EPA hourly data, then put the results in the clustering dataframe. A sketch of that skeleton is shown below.
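This is only a sketch of the skeleton described above; the actual clustering dataframes (clusterDict) are built further down, and the variable columns here are placeholders that get filled in later.
In [ ]:
# Skeleton clustering dataframe: one row per plant ID seen in the EPA hourly data,
# with empty columns for the variables calculated later in the notebook
plant_ids = sorted(set.union(*[set(df['PLANT_ID']) for df in epaDict.values()]))
cluster_skeleton = pd.DataFrame({'plant_id': plant_ids})
for col in ['capacity', 'capacity_factor', 'efficiency', 'ramp_rate', 'fuel_type']:
    cluster_skeleton[col] = None
cluster_skeleton.head()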
In [24]:
#Join the two data sources on PLANT_ID
fullData = {key:df.merge(aggEIADict[df["YEAR"][0]], on="PLANT_ID") for key, df in epaDict.iteritems()}
In [25]:
fullData[fullData.keys()[0]].head()
Out[25]:
BIT, SUB, LIG, NG, DFO, RFO
In [36]:
[x for x in fullData[fullData.keys()[0]].columns]
Out[36]:
In [26]:
# Iterate through the directory to find all the files to import
path = os.path.join('EIA Data', '860-No_Header')
full_path = os.path.join(path, '*.*')
eia860Names = os.listdir(path)
# Rename the keys for easier merging later
fileName860Map = { 'GenY07.xls':2007,
'GenY08.xls':2008,
'GeneratorY09.xls':2009,
'GeneratorsY2010.xls':2010,
'GeneratorY2011.xlsx':2011,
'GeneratorY2012.xlsx':2012,
'3_1_Generator_Y2013.xlsx':2013,
'3_1_Generator_Y2014.xlsx':2014,
'3_1_Generator_Y2015.xlsx':2015}
#Load the files into data frames, one df per file
eia860Dict = {fileName860Map[fn]:pd.read_excel(os.path.join(path, fn)) for fn in eia860Names}
In [27]:
#Dict of values to replace to standardize column names across all dataframes
renameDict = { "PLNTCODE":"PLANT_ID",
"PLANT_CODE":"PLANT_ID",
"Plant Code":"PLANT_ID",
"NAMEPLATE":"NAMEPLATE_CAPACITY(MW)",
"Nameplate Capacity (MW)":"NAMEPLATE_CAPACITY(MW)"}
#Replace the column name
def rename860(col):
for old, new in renameDict.iteritems():
col = col.replace(old, new)
return col
#Iterate through each column name of each dataframe to standardize and select columns 'PLANT_ID', 'NAMEPLATE_CAPACITY(MW)'
for key, df in eia860Dict.iteritems():
colNames = [rename860(col) for col in df.columns]
eia860Dict[key].columns = colNames
eia860Dict[key] = eia860Dict[key][["PLANT_ID", "NAMEPLATE_CAPACITY(MW)"]]
In [28]:
# verify the tables
for key, df in eia860Dict.iteritems():
print key, df.columns, len(df)
In [29]:
# Iterate through each dataframe to perform aggregation by PLANT_ID
for key, df in eia860Dict.iteritems():
gb = df.groupby(by='PLANT_ID').apply(lambda x: x['NAMEPLATE_CAPACITY(MW)'].sum())
eia860Dict[key]['NAMEPLATE_CAPACITY(MW)'] = eia860Dict[key].PLANT_ID.apply(gb.get_value)
eia860Dict[key] = eia860Dict[key].drop_duplicates(subset=['PLANT_ID', 'NAMEPLATE_CAPACITY(MW)'])
eia860Dict[key] = eia860Dict[key].sort_values(by='PLANT_ID').reset_index(drop=True)
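The same aggregation can be written more directly as a single groupby sum; a sketch that should give the same one-row-per-plant result with total nameplate capacity:
In [ ]:
# Equivalent, more direct version of the aggregation above
eia860Dict = {key: df.groupby('PLANT_ID', as_index=False)['NAMEPLATE_CAPACITY(MW)'].sum()
              for key, df in eia860Dict.iteritems()}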
In [41]:
filename = 'EIA 860.pkl'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
pickle.dump(eia860Dict, open(fullpath, 'wb'))
In [31]:
clusterDict = dict()
for key, df in eia860Dict.iteritems():
clusterDict[key] = pd.merge(aggEIADict[key], eia860Dict[key], how='left', on='PLANT_ID')[['PLANT_ID', 'NAMEPLATE_CAPACITY(MW)']]
clusterDict[key].rename(columns={'NAMEPLATE_CAPACITY(MW)': 'capacity', 'PLANT_ID': 'plant_id'}, inplace=True)
In [32]:
# verify for no loss of data
for key, df in eia860Dict.iteritems():
print key, len(clusterDict[key]), len(aggEIADict[key])
In [33]:
clusterDict[2015].head()
Out[33]:
Function to get fuel type
In [34]:
fuel_cols = [col for col in aggEIADict[2008].columns if 'ELEC_FUEL_CONSUMPTION_MMBTU %' in col]
def top_fuel(row):
#Return the code of the fuel with the largest share of electric heat input; [29:] strips the 29-character 'ELEC_FUEL_CONSUMPTION_MMBTU %' prefix from the column name
try:
fuel = row.idxmax()[29:]
except:
return None
return fuel
# clusterDict[2008]['fuel'] = aggEIADict[2008][fuel_cols].apply(top_fuel, axis=1)
Calculate the capacity factor, efficiency, and fuel type for each plant. A small worked example of the formulas follows the cell below.
In [35]:
for key, df in clusterDict.iteritems():
clusterDict[key]['year'] = key
clusterDict[key]['capacity_factor'] = aggEIADict[key]['NET_GENERATION_(MEGAWATTHOURS)'] / (8760*clusterDict[key]['capacity'])
clusterDict[key]['efficiency'] = (aggEIADict[key]['NET_GENERATION_(MEGAWATTHOURS)']*3.412)/(1.0*aggEIADict[key]['ELEC_FUEL_CONSUMPTION_MMBTU'])
clusterDict[key]['fuel_type'] = aggEIADict[key][fuel_cols].apply(top_fuel, axis=1)
clusterDict[key] = clusterDict[key][clusterDict[key]['fuel_type'].isin(['SUB',
'LIG',
'DFO',
'NG',
'PC'])]
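As a quick sanity check on the formulas above, here is a worked example with made-up numbers (not taken from the data): 8760 is the number of hours in a year, and 3.412 MMBtu per MWh converts generation into heat units.
In [ ]:
# Hypothetical 500 MW plant generating 2,190,000 MWh from 22,000,000 MMBtu of fuel in a year
capacity_mw = 500.0
net_gen_mwh = 2190000.0
fuel_mmbtu = 22000000.0
capacity_factor = net_gen_mwh / (8760 * capacity_mw)   # = 0.5
efficiency = net_gen_mwh * 3.412 / fuel_mmbtu           # ~ 0.34
print capacity_factor, efficiency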
Merge all the EPA files into one dataframe.
In [36]:
columns = ['PLANT_ID', 'YEAR', 'DATE', 'HOUR', 'GROSS LOAD (MW)']
counter = 0
for key, df in epaDict.iteritems():
if counter == 0:
result = epaDict[key][columns]
counter = 1
else:
result = result.append(epaDict[key][columns], ignore_index=True)
# Change nan to 0
result.fillna(0, inplace=True)
In [37]:
result.describe()
Out[37]:
Function to calculate the ramp rate for every hour
In [38]:
def plant_gen_delta(df):
"""
For every plant in the input df, calculate the change in gross load (MW)
from the previous hour.
input:
df: dataframe of EPA clean air markets data
return:
df: concatenated dataframe of all plants, with the added 'Gen Change' column
"""
df_list = []
for plant in df['PLANT_ID'].unique():
temp = df.loc[df['PLANT_ID'] == plant,:]
gen_change = temp.loc[:,'GROSS LOAD (MW)'].values - temp.loc[:,'GROSS LOAD (MW)'].shift(1).values
temp.loc[:,'Gen Change'] = gen_change
df_list.append(temp)
return pd.concat(df_list)
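The per-plant loop above can also be done with a grouped diff; a sketch (it additionally sorts by plant, date, and hour, which the loop relies on the input already being in order for):
In [ ]:
# Vectorized alternative: sort so the hours are consecutive within each plant, then take
# the hour-to-hour change in gross load within each plant
alt = result.sort_values(['PLANT_ID', 'DATE', 'HOUR']).copy()
alt['Gen Change'] = alt.groupby('PLANT_ID')['GROSS LOAD (MW)'].diff()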
In [39]:
ramp_df = plant_gen_delta(result)
In [40]:
ramp_df.describe()
Out[40]:
Get the ramp rate for every plant for each year, using the 95th percentile of the hourly changes as a robust stand-in for the maximum.
In [41]:
cols = ['PLANT_ID', 'YEAR', 'Gen Change']
ramp_rate_list = []
for year in ramp_df['YEAR'].unique():
for plant in ramp_df.loc[ramp_df['YEAR']==year,'PLANT_ID'].unique():
# 95th percentile ramp rate per plant per year
ramp_95 = ramp_df.loc[(ramp_df['PLANT_ID']== plant) &
(ramp_df['YEAR']==year),'Gen Change'].quantile(0.95, interpolation='nearest')
ramp_rate_list.append([plant, year, ramp_95])
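The same per-plant, per-year percentile can also be computed with a grouped quantile; a sketch (it uses the default linear interpolation, so values can differ slightly from the interpolation='nearest' used in the loop above):
In [ ]:
# 95th percentile ramp rate per plant per year, without the nested loops
ramp_rate_alt = (ramp_df.groupby(['PLANT_ID', 'YEAR'])['Gen Change']
                 .quantile(0.95)
                 .reset_index())
ramp_rate_alt.columns = ['plant_id', 'year', 'ramp_rate']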
In [42]:
ramp_rate_df = pd.DataFrame(ramp_rate_list, columns=['plant_id', 'year', 'ramp_rate'])
In [43]:
ramp_rate_df.describe()
Out[43]:
In [44]:
for key, df in clusterDict.iteritems():
clusterDict[key] = pd.merge(clusterDict[key], ramp_rate_df, how='left', on=['plant_id', 'year'])
In [45]:
clusterDict[2010].head()
Out[45]:
In [46]:
# Check plants of 25 MW or larger, which is the lower reporting limit for the EPA data
clusterDict[2010][clusterDict[2010].capacity >=25].describe()
Out[46]:
In [47]:
for key in clusterDict.keys():
print key, clusterDict[key].plant_id.count(), clusterDict[key].ramp_rate.count()
Save the clustering dict to a CSV file.
In [48]:
# re-arrange column order
columns = ['year', 'plant_id', 'capacity', 'capacity_factor', 'efficiency', 'ramp_rate', 'fuel_type']
filename = 'Cluster_Data_2.csv'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
counter = 0
for key, df in clusterDict.iteritems():
# create the csv file
if counter == 0:
df[columns].sort_values(by='plant_id').to_csv(fullpath, sep=',', index = False)
counter += 1
# append to existing csv file
else:
df[columns].sort_values(by='plant_id').to_csv(fullpath, sep=',', index = False, header=False, mode = 'a')
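Alternatively, the whole dict can be concatenated and written in one go (a sketch; the rows come out ordered by year and plant rather than by the dict iteration order above):
In [ ]:
# Concatenate all years and write a single CSV
all_years = pd.concat(clusterDict.values(), ignore_index=True)
all_years[columns].sort_values(by=['year', 'plant_id']).to_csv(fullpath, sep=',', index=False)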
In [ ]: