TODO:

  1. Verify that the merge result is as expected. I noticed that when I checked the length of the EPA for a 6-month period, the length was longer than the post-merge 6-month period. This implies that the EIA data doesn't have some facilities that are listed in EPA. I haven't verified this though.

  2. NaN values also need to be replaced with 0's.

  3. Numbers are being stored in scientific notation


In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import re
import cPickle as pickle
import gzip
import seaborn as sns

Loading the EIA Data, the path may need to be updated...

This will take a few minutes to run.


In [5]:
#Iterate through the directory to find all the files to import
#Modified so that it also works on macs
path = os.path.join('EIA Data', '923-No_Header')
full_path = os.path.join(path, '*.*')


eiaNames = os.listdir(path)

#Rename the keys for easier merging later
fileNameMap = {'EIA923 SCHEDULES 2_3_4_5 Final 2010.xls':2010,
                'EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS':2009,
                'eia923December2008.xls':2008,
                'EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx':2011,
                'EIA923_Schedules_2_3_4_5_2012_Final_Release_12.04.2013.xlsx':2012,
                'EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx':2013,
                'EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx':2014,
                'EIA923_Schedules_2_3_4_5_M_12_2015_Final.xlsx':2015,
                'f906920_2007.xls':2007}

#Load the files into data frames, one df per file
eiaDict = {fileNameMap[fn]:pd.read_excel(os.path.join(path, fn)) for fn in eiaNames}
eiaDict = {key:val[val["NERC Region"] == "TRE"] for key, val in eiaDict.iteritems()}

The excel documents have different column names so we need to standardize them all


In [6]:
#Dict of values to replace to standardize column names across all dataframes
monthDict = {"JANUARY":"JAN",
           "FEBRUARY":"FEB",
           "MARCH":"MAR",
           "APRIL":"APR",
           "MAY":"MAY",
           "JUNE":"JUN",
           "JULY":"JUL",
           "AUGUST":"AUG",
           "SEPTEMBER":"SEP",
           "OCTOBER":"OCT",
           "NOVEMBER":"NOV",
           "DECEMBER":"DEC"}
           
replaceDict = {"ELECTRIC":"ELEC",
               "&":"AND",
               "I.D.":"ID",
               "MMBTUPER":"MMBTU_PER"}
               
#Add "MMBTUMON" : "MMBTU_MON" to be replaced
for month in monthDict.values():
    replaceDict["MMBTU"+month] = "MMBTU_" + month

#Replace the column name
def rename(col):
    for old, new in monthDict.iteritems():
        col = col.replace(old, new)
        
    for old, new in replaceDict.iteritems():
        col = col.replace(old, new)
        
    col = col.replace("MMBTUS", "MMBTU")
    return col
    
#Iterate through each column name of each dataframe to standardize
for key, df in eiaDict.iteritems():
    colNames = [name.replace("\n", "_").replace(" ", "_").strip().upper() for name in df.columns]
    colNames = [rename(col) for col in colNames]
    eiaDict[key].columns = colNames

Define which columns we need to sum, and which columns don't need to be summed, but we still need to keep.

Note: If we don't care about monthly stuff we can delete the second block of code.


In [7]:
#Define the columns that are necessary but are not summable
allCols = eiaDict[fileNameMap.values()[0]].columns
nonSumCols = ["PLANT_ID", "PLANT_NAME", "YEAR"]

#Define the columns that contain the year's totals (Used to calc fuel type %)
yearCols = ["TOTAL_FUEL_CONSUMPTION_QUANTITY", "ELEC_FUEL_CONSUMPTION_QUANTITY",
            "TOTAL_FUEL_CONSUMPTION_MMBTU", "ELEC_FUEL_CONSUMPTION_MMBTU",
            "NET_GENERATION_(MEGAWATTHOURS)"]


#Define the columns that are necessary and summable
sumCols = []
sumCols.extend(yearCols)
# regex = re.compile(r"^ELEC_QUANTITY_.*")
# sumCols.extend([col for col in allCols if regex.search(col)])
regex = re.compile(r"^MMBTU_PER_UNIT_.*")
sumCols.extend([col for col in allCols if regex.search(col)])
regex = re.compile(r"^TOT_MMBTU_.*")
sumCols.extend([col for col in allCols if regex.search(col)])
regex = re.compile(r"^ELEC_MMBTUS_.*")
sumCols.extend([col for col in allCols if regex.search(col)])
regex = re.compile(r"^NETGEN_.*")
sumCols.extend([col for col in allCols if regex.search(col)])

Get a list of all the different fuel type codes. If we don't care about all of them, then just hardcode the list


In [8]:
fuelTypes = []
fuelTypes.extend([fuelType for df in eiaDict.values() for fuelType in df["REPORTED_FUEL_TYPE_CODE"].tolist()])
fuelTypes = set(fuelTypes)

In [9]:
fuelTypes


Out[9]:
{u'AB',
 u'BIT',
 u'BLQ',
 u'DFO',
 u'JF',
 u'LFG',
 u'LIG',
 u'MWH',
 u'NG',
 u'NUC',
 u'OBG',
 u'OBL',
 u'OBS',
 u'OG',
 u'OTH',
 u'PC',
 u'PUR',
 u'RFO',
 u'SC',
 u'SUB',
 u'SUN',
 u'WAT',
 u'WDS',
 u'WH',
 u'WND',
 u'WO'}

3 parts to aggregate by facility, and to calculate the % of each type of fuel. This will take a few minutes to run.

The end result is aggEIADict.


In [10]:
#Actually calculate the % type for each facility grouping
def calcPerc(group, aggGroup, fuelType, col):
    #Check to see if the facility has a record for the fuel type, and if the total column > 0
    if len(group[group["REPORTED_FUEL_TYPE_CODE"] == fuelType]) > 0 and aggGroup[col] > 0:
        #summing fuel type because a facility may have multiple plants with the same fuel type        
        return float((group[group["REPORTED_FUEL_TYPE_CODE"] == fuelType][col]).sum())/aggGroup[col] 
    else:
        return 0

#Perform the aggregation on facility level
def aggAndCalcPerc(group):
    aggGroup = group.iloc[0][nonSumCols] #Get the non-agg columns
    aggGroup = aggGroup.append(group[sumCols].sum())   #Aggregate the agg columns and append to non-agg
    percCols = {col + " %" + fuelType:calcPerc(group, aggGroup, fuelType, col) for col in yearCols for fuelType in fuelTypes}
    aggGroup = aggGroup.append(pd.Series(percCols))
    return aggGroup    

#Iterate through each dataframe to perform aggregation by facility
aggEIADict = dict()
for key, df in eiaDict.iteritems():
    gb = df.groupby(by="PLANT_ID")
    #aggGroup will be a list of panda series, each series representing a facility
    aggGroup = [aggAndCalcPerc(gb.get_group(group)) for group in gb.groups]
    aggEIADict[key] = pd.DataFrame(aggGroup)

Column order doesn't match in all years


In [ ]:
aggEIADict[2007].head()

In [ ]:
aggEIADict[2015].head()

Export the EIA 923 data as pickle

Just sending the dictionary to a pickle file for now. At least doing this will save several min of time loading and processing the data in the future.


In [ ]:
filename = 'EIA 923.pkl'
path = '../Clean Data'
fullpath = os.path.join(path, filename)

pickle.dump(aggEIADict, open(fullpath, 'wb'))

Combine all df's from the dict into one df

Concat all dataframes, reset the index, determine the primary fuel type for each facility, filter to only include fossil power plants, and export as a csv


In [11]:
all923 = pd.concat(aggEIADict)

In [12]:
all923.head()


Out[12]:
ELEC_FUEL_CONSUMPTION_MMBTU ELEC_FUEL_CONSUMPTION_MMBTU %AB ELEC_FUEL_CONSUMPTION_MMBTU %BIT ELEC_FUEL_CONSUMPTION_MMBTU %BLQ ELEC_FUEL_CONSUMPTION_MMBTU %DFO ELEC_FUEL_CONSUMPTION_MMBTU %JF ELEC_FUEL_CONSUMPTION_MMBTU %LFG ELEC_FUEL_CONSUMPTION_MMBTU %LIG ELEC_FUEL_CONSUMPTION_MMBTU %MWH ELEC_FUEL_CONSUMPTION_MMBTU %NG ... TOT_MMBTU_FEB TOT_MMBTU_JAN TOT_MMBTU_JUL TOT_MMBTU_JUN TOT_MMBTU_MAR TOT_MMBTU_MAY TOT_MMBTU_NOV TOT_MMBTU_OCT TOT_MMBTU_SEP YEAR
2007 0 109110.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.000000 0.0 0.0 ... 10344 9710 25442 14517 1220 7128 2034 2392 15284 2007
1 118993.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.000000 0.0 0.0 ... 11281 10590 27746 15832 1331 7774 2219 2608 16668 2007
2 142735.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.000000 0.0 0.0 ... 13532 12703 33282 18991 1596 9325 2661 3129 19994 2007
3 171279117.0 0.0 0.0 0.0 0.00082 0.0 0.0 0.403995 0.0 0.0 ... 1.37476e+07 1.39678e+07 1.50848e+07 1.40674e+07 1.47215e+07 1.3538e+07 1.45648e+07 1.49483e+07 1.39296e+07 2007
4 197597659.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.000000 0.0 0.0 ... 1.51598e+07 1.83731e+07 1.80729e+07 1.75828e+07 9.18252e+06 1.82552e+07 1.77292e+07 1.81813e+07 1.74853e+07 2007

5 rows × 174 columns


In [13]:
all923.reset_index(drop=True, inplace=True)

In [14]:
# Check column numbers to use in the function below
all923.iloc[1,1:27]


Out[14]:
ELEC_FUEL_CONSUMPTION_MMBTU %AB     0
ELEC_FUEL_CONSUMPTION_MMBTU %BIT    0
ELEC_FUEL_CONSUMPTION_MMBTU %BLQ    0
ELEC_FUEL_CONSUMPTION_MMBTU %DFO    0
ELEC_FUEL_CONSUMPTION_MMBTU %JF     0
ELEC_FUEL_CONSUMPTION_MMBTU %LFG    0
ELEC_FUEL_CONSUMPTION_MMBTU %LIG    0
ELEC_FUEL_CONSUMPTION_MMBTU %MWH    0
ELEC_FUEL_CONSUMPTION_MMBTU %NG     0
ELEC_FUEL_CONSUMPTION_MMBTU %NUC    0
ELEC_FUEL_CONSUMPTION_MMBTU %OBG    0
ELEC_FUEL_CONSUMPTION_MMBTU %OBL    0
ELEC_FUEL_CONSUMPTION_MMBTU %OBS    0
ELEC_FUEL_CONSUMPTION_MMBTU %OG     0
ELEC_FUEL_CONSUMPTION_MMBTU %OTH    0
ELEC_FUEL_CONSUMPTION_MMBTU %PC     0
ELEC_FUEL_CONSUMPTION_MMBTU %PUR    0
ELEC_FUEL_CONSUMPTION_MMBTU %RFO    0
ELEC_FUEL_CONSUMPTION_MMBTU %SC     0
ELEC_FUEL_CONSUMPTION_MMBTU %SUB    0
ELEC_FUEL_CONSUMPTION_MMBTU %SUN    0
ELEC_FUEL_CONSUMPTION_MMBTU %WAT    1
ELEC_FUEL_CONSUMPTION_MMBTU %WDS    0
ELEC_FUEL_CONSUMPTION_MMBTU %WH     0
ELEC_FUEL_CONSUMPTION_MMBTU %WND    0
ELEC_FUEL_CONSUMPTION_MMBTU %WO     0
Name: 1, dtype: object

In [14]:
def top_fuel(row):
    #Fraction of largest fuel for electric heat input 
    try:
        fuel = row.iloc[1:27].idxmax()[29:]
    except:
        return None
    return fuel

In [15]:
all923['FUEL'] = all923.apply(top_fuel, axis=1)

In [16]:
all923.head()


Out[16]:
ELEC_FUEL_CONSUMPTION_MMBTU ELEC_FUEL_CONSUMPTION_MMBTU %AB ELEC_FUEL_CONSUMPTION_MMBTU %BIT ELEC_FUEL_CONSUMPTION_MMBTU %BLQ ELEC_FUEL_CONSUMPTION_MMBTU %DFO ELEC_FUEL_CONSUMPTION_MMBTU %JF ELEC_FUEL_CONSUMPTION_MMBTU %LFG ELEC_FUEL_CONSUMPTION_MMBTU %LIG ELEC_FUEL_CONSUMPTION_MMBTU %MWH ELEC_FUEL_CONSUMPTION_MMBTU %NG ... TOT_MMBTU_JAN TOT_MMBTU_JUL TOT_MMBTU_JUN TOT_MMBTU_MAR TOT_MMBTU_MAY TOT_MMBTU_NOV TOT_MMBTU_OCT TOT_MMBTU_SEP YEAR FUEL
0 109110.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.000000 0.0 0.0 ... 9710 25442 14517 1220 7128 2034 2392 15284 2007 WAT
1 118993.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.000000 0.0 0.0 ... 10590 27746 15832 1331 7774 2219 2608 16668 2007 WAT
2 142735.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.000000 0.0 0.0 ... 12703 33282 18991 1596 9325 2661 3129 19994 2007 WAT
3 171279117.0 0.0 0.0 0.0 0.00082 0.0 0.0 0.403995 0.0 0.0 ... 1.39678e+07 1.50848e+07 1.40674e+07 1.47215e+07 1.3538e+07 1.45648e+07 1.49483e+07 1.39296e+07 2007 SUB
4 197597659.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.000000 0.0 0.0 ... 1.83731e+07 1.80729e+07 1.75828e+07 9.18252e+06 1.82552e+07 1.77292e+07 1.81813e+07 1.74853e+07 2007 NUC

5 rows × 175 columns


In [17]:
fossil923 = all923.loc[all923['FUEL'].isin(['DFO', 'LIG', 'NG', 'PC', 'SUB'])]

Export the EIA 923 data dataframe as csv

Export the dataframe with primary fuel and filtered to only include fossil plants


In [19]:
filename = 'Fossil EIA 923.csv'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
fossil923.to_csv(fullpath)

Loading the EPA Data, the path may need to be updated...


In [18]:
#Read the EPA files into a dataframe
path2 = os.path.join('EPA air markets')
epaNames = os.listdir(path2)
filePaths = {dn:os.path.join(path2, dn, "*.txt") for dn in epaNames}
filePaths = {dn:glob.glob(val) for dn, val in filePaths.iteritems()}
epaDict = {key:pd.read_csv(fp, index_col = False) for key, val in filePaths.iteritems() for fp in val}

First rename the column name so we can merge on that column, then change the datatype of date to a datetime object


In [19]:
#Rename the column names to remove the leading space.
for key, df in epaDict.iteritems():
    colNames = [name.upper().strip() for name in df.columns]
    colNames[colNames.index("FACILITY ID (ORISPL)")] = "PLANT_ID"
    epaDict[key].columns = colNames
    
#Convert DATE to datetime object
#Add new column DATETIME with both date and hour
for key, df in epaDict.iteritems():
    epaDict[key]["DATE"] = pd.to_datetime(df["DATE"])
    epaDict[key]['DATETIME'] = df['DATE'] + pd.to_timedelta(df['HOUR'], unit='h')

The DataFrames in epaDict contain all power plants in Texas. We can filter on NERC REGION so that it only includes ERCOT.


In [20]:
set(epaDict['2015 July-Dec'].loc[:,'NERC REGION'])


Out[20]:
{nan, 'ERCOT', 'SERC', 'SPP', 'WECC'}

In [21]:
#Boolean filter to only keep ERCOT plants
for key, df in epaDict.iteritems():
    epaDict[key] = df[df["NERC REGION"] == "ERCOT"].reset_index(drop = True)

In [22]:
set(epaDict['2015 July-Dec'].loc[:,'NERC REGION'])


Out[22]:
{'ERCOT'}

In [23]:
epaDict['2015 July-Dec'].head()


Out[23]:
STATE FACILITY NAME PLANT_ID YEAR DATE HOUR GROSS LOAD (MW) STEAM LOAD (1000LB/HR) SO2 (POUNDS) NOX (POUNDS) CO2 (SHORT TONS) EPA REGION NERC REGION COUNTY HEAT INPUT (MMBTU) FACILITY LATITUDE FACILITY LONGITUDE DATETIME
0 TX Barney M. Davis 4939 2015 2015-07-01 0 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 00:00:00
1 TX Barney M. Davis 4939 2015 2015-07-01 1 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 01:00:00
2 TX Barney M. Davis 4939 2015 2015-07-01 2 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 02:00:00
3 TX Barney M. Davis 4939 2015 2015-07-01 3 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 03:00:00
4 TX Barney M. Davis 4939 2015 2015-07-01 4 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 04:00:00

Export EPA data as a series of dataframes

The whole dictionary is too big as a pickle file


In [26]:
# pickle with gzip, from http://stackoverflow.com/questions/18474791/decreasing-the-size-of-cpickle-objects
def save_zipped_pickle(obj, filename, protocol=-1):
    with gzip.open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol)

In [27]:
filename = 'EPA hourly dictionary.pgz'
path = '../Clean Data'
fullpath = os.path.join(path, filename)

save_zipped_pickle(epaDict, fullpath)

In [28]:
df = epaDict['2015 July-Dec']

In [29]:
df.head()


Out[29]:
STATE FACILITY NAME PLANT_ID YEAR DATE HOUR GROSS LOAD (MW) STEAM LOAD (1000LB/HR) SO2 (POUNDS) NOX (POUNDS) CO2 (SHORT TONS) EPA REGION NERC REGION COUNTY HEAT INPUT (MMBTU) FACILITY LATITUDE FACILITY LONGITUDE DATETIME
0 TX Barney M. Davis 4939 2015 2015-07-01 0 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 00:00:00
1 TX Barney M. Davis 4939 2015 2015-07-01 1 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 01:00:00
2 TX Barney M. Davis 4939 2015 2015-07-01 2 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 02:00:00
3 TX Barney M. Davis 4939 2015 2015-07-01 3 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 03:00:00
4 TX Barney M. Davis 4939 2015 2015-07-01 4 NaN NaN NaN NaN NaN 6 ERCOT Nueces NaN 27.6067 -97.3119 2015-07-01 04:00:00

In [30]:
set(df['PLANT_ID'])


Out[30]:
{127,
 298,
 3439,
 3441,
 3443,
 3452,
 3453,
 3460,
 3464,
 3468,
 3469,
 3470,
 3476,
 3477,
 3478,
 3490,
 3491,
 3492,
 3494,
 3497,
 3504,
 3507,
 3548,
 3559,
 3576,
 3601,
 3611,
 3612,
 3628,
 3631,
 4266,
 4937,
 4939,
 6136,
 6139,
 6147,
 6178,
 6179,
 6180,
 6181,
 6183,
 6243,
 6648,
 7030,
 7097,
 7325,
 7900,
 8063,
 52176,
 55015,
 55091,
 55139,
 55153,
 55215,
 55223,
 56350,
 56674}

In [31]:
df_temp = df[df['PLANT_ID'].isin([127, 298, 3439])].fillna(0)

In [32]:
df_temp.head()


Out[32]:
STATE FACILITY NAME PLANT_ID YEAR DATE HOUR GROSS LOAD (MW) STEAM LOAD (1000LB/HR) SO2 (POUNDS) NOX (POUNDS) CO2 (SHORT TONS) EPA REGION NERC REGION COUNTY HEAT INPUT (MMBTU) FACILITY LATITUDE FACILITY LONGITUDE DATETIME
79488 TX Laredo 3439 2015 2015-07-01 0 0.0 0.0 0.0 0.0 0.0 6 ERCOT Webb 0.0 27.5667 -99.5083 2015-07-01 00:00:00
79489 TX Laredo 3439 2015 2015-07-01 1 0.0 0.0 0.0 0.0 0.0 6 ERCOT Webb 0.0 27.5667 -99.5083 2015-07-01 01:00:00
79490 TX Laredo 3439 2015 2015-07-01 2 0.0 0.0 0.0 0.0 0.0 6 ERCOT Webb 0.0 27.5667 -99.5083 2015-07-01 02:00:00
79491 TX Laredo 3439 2015 2015-07-01 3 0.0 0.0 0.0 0.0 0.0 6 ERCOT Webb 0.0 27.5667 -99.5083 2015-07-01 03:00:00
79492 TX Laredo 3439 2015 2015-07-01 4 0.0 0.0 0.0 0.0 0.0 6 ERCOT Webb 0.0 27.5667 -99.5083 2015-07-01 04:00:00

In [ ]:
g = sns.FacetGrid(df_temp, col='PLANT_ID')
g.map(plt.plot, 'datetime', 'GROSS LOAD (MW)')
g.set_xticklabels(rotation=30)

path = os.path.join('..', 'Exploratory visualization', 'Midterm figures', 'Sample hourly load.svg')
plt.savefig(path)

Finally join the two data sources

Switch to an inner join?

No need to join. Can keep them as separate databases, since one is hourly data and the other is annual/monthly Create a clustering dataframe with index of all plant IDs (from the EPA hourly data), add columns with variables. Calculate the inputs in separate dataframes - example is to calculate ramp rate values in the EPA hourly data, then put the results in the clustering dataframe.


In [24]:
#Join the two data sources on PLANT_ID
fullData = {key:df.merge(aggEIADict[df["YEAR"][0]], on="PLANT_ID") for key, df in epaDict.iteritems()}

In [25]:
fullData[fullData.keys()[0]].head()


Out[25]:
STATE FACILITY NAME PLANT_ID YEAR_x DATE HOUR GROSS LOAD (MW) STEAM LOAD (1000LB/HR) SO2 (POUNDS) NOX (POUNDS) ... TOT_MMBTU_FEB TOT_MMBTU_JAN TOT_MMBTU_JUL TOT_MMBTU_JUN TOT_MMBTU_MAR TOT_MMBTU_MAY TOT_MMBTU_NOV TOT_MMBTU_OCT TOT_MMBTU_SEP YEAR_y
0 TX Barney M. Davis 4939 2012 2012-01-01 0 NaN NaN NaN NaN ... 1.49482e+06 1.28956e+06 1.98845e+06 2.02212e+06 1.93844e+06 2.06786e+06 905868 1.51242e+06 1.88822e+06 2012
1 TX Barney M. Davis 4939 2012 2012-01-01 1 NaN NaN NaN NaN ... 1.49482e+06 1.28956e+06 1.98845e+06 2.02212e+06 1.93844e+06 2.06786e+06 905868 1.51242e+06 1.88822e+06 2012
2 TX Barney M. Davis 4939 2012 2012-01-01 2 NaN NaN NaN NaN ... 1.49482e+06 1.28956e+06 1.98845e+06 2.02212e+06 1.93844e+06 2.06786e+06 905868 1.51242e+06 1.88822e+06 2012
3 TX Barney M. Davis 4939 2012 2012-01-01 3 NaN NaN NaN NaN ... 1.49482e+06 1.28956e+06 1.98845e+06 2.02212e+06 1.93844e+06 2.06786e+06 905868 1.51242e+06 1.88822e+06 2012
4 TX Barney M. Davis 4939 2012 2012-01-01 4 NaN NaN NaN NaN ... 1.49482e+06 1.28956e+06 1.98845e+06 2.02212e+06 1.93844e+06 2.06786e+06 905868 1.51242e+06 1.88822e+06 2012

5 rows × 191 columns

BIT, SUB, LIG, NG, DFO, RFO


In [36]:
[x for x in fullData[fullData.keys()[0]].columns]


Out[36]:
['STATE',
 'FACILITY NAME',
 'PLANT_ID',
 'YEAR_x',
 'DATE',
 'HOUR',
 'GROSS LOAD (MW)',
 'STEAM LOAD (1000LB/HR)',
 'SO2 (POUNDS)',
 'NOX (POUNDS)',
 'CO2 (SHORT TONS)',
 'HEAT INPUT (MMBTU)',
 'EPA REGION',
 'NERC REGION',
 'COUNTY',
 'FACILITY LATITUDE',
 'FACILITY LONGITUDE',
 'DATETIME',
 u'ELEC_FUEL_CONSUMPTION_MMBTU',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %AB',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %BIT',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %BLQ',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %DFO',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %JF',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %LFG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %LIG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %MWH',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %NG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %NUC',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OBG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OBL',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OBS',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OTH',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %PC',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %PUR',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %RFO',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %SC',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %SUB',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %SUN',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %WAT',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %WDS',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %WH',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %WND',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %WO',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %AB',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %BIT',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %BLQ',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %DFO',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %JF',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %LFG',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %LIG',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %MWH',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %NG',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %NUC',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %OBG',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %OBL',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %OBS',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %OG',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %OTH',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %PC',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %PUR',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %RFO',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %SC',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %SUB',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %SUN',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %WAT',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %WDS',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %WH',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %WND',
 u'ELEC_FUEL_CONSUMPTION_QUANTITY %WO',
 u'MMBTU_PER_UNIT_APR',
 u'MMBTU_PER_UNIT_AUG',
 u'MMBTU_PER_UNIT_DEC',
 u'MMBTU_PER_UNIT_FEB',
 u'MMBTU_PER_UNIT_JAN',
 u'MMBTU_PER_UNIT_JUL',
 u'MMBTU_PER_UNIT_JUN',
 u'MMBTU_PER_UNIT_MAR',
 u'MMBTU_PER_UNIT_MAY',
 u'MMBTU_PER_UNIT_NOV',
 u'MMBTU_PER_UNIT_OCT',
 u'MMBTU_PER_UNIT_SEP',
 u'NETGEN_APR',
 u'NETGEN_AUG',
 u'NETGEN_DEC',
 u'NETGEN_FEB',
 u'NETGEN_JAN',
 u'NETGEN_JUL',
 u'NETGEN_JUN',
 u'NETGEN_MAR',
 u'NETGEN_MAY',
 u'NETGEN_NOV',
 u'NETGEN_OCT',
 u'NETGEN_SEP',
 u'NET_GENERATION_(MEGAWATTHOURS)',
 u'NET_GENERATION_(MEGAWATTHOURS) %AB',
 u'NET_GENERATION_(MEGAWATTHOURS) %BIT',
 u'NET_GENERATION_(MEGAWATTHOURS) %BLQ',
 u'NET_GENERATION_(MEGAWATTHOURS) %DFO',
 u'NET_GENERATION_(MEGAWATTHOURS) %JF',
 u'NET_GENERATION_(MEGAWATTHOURS) %LFG',
 u'NET_GENERATION_(MEGAWATTHOURS) %LIG',
 u'NET_GENERATION_(MEGAWATTHOURS) %MWH',
 u'NET_GENERATION_(MEGAWATTHOURS) %NG',
 u'NET_GENERATION_(MEGAWATTHOURS) %NUC',
 u'NET_GENERATION_(MEGAWATTHOURS) %OBG',
 u'NET_GENERATION_(MEGAWATTHOURS) %OBL',
 u'NET_GENERATION_(MEGAWATTHOURS) %OBS',
 u'NET_GENERATION_(MEGAWATTHOURS) %OG',
 u'NET_GENERATION_(MEGAWATTHOURS) %OTH',
 u'NET_GENERATION_(MEGAWATTHOURS) %PC',
 u'NET_GENERATION_(MEGAWATTHOURS) %PUR',
 u'NET_GENERATION_(MEGAWATTHOURS) %RFO',
 u'NET_GENERATION_(MEGAWATTHOURS) %SC',
 u'NET_GENERATION_(MEGAWATTHOURS) %SUB',
 u'NET_GENERATION_(MEGAWATTHOURS) %SUN',
 u'NET_GENERATION_(MEGAWATTHOURS) %WAT',
 u'NET_GENERATION_(MEGAWATTHOURS) %WDS',
 u'NET_GENERATION_(MEGAWATTHOURS) %WH',
 u'NET_GENERATION_(MEGAWATTHOURS) %WND',
 u'NET_GENERATION_(MEGAWATTHOURS) %WO',
 'PLANT_NAME',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %AB',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %BIT',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %BLQ',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %DFO',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %JF',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %LFG',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %LIG',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %MWH',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %NG',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %NUC',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %OBG',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %OBL',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %OBS',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %OG',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %OTH',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %PC',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %PUR',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %RFO',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %SC',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %SUB',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %SUN',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %WAT',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %WDS',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %WH',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %WND',
 u'TOTAL_FUEL_CONSUMPTION_MMBTU %WO',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %AB',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %BIT',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %BLQ',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %DFO',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %JF',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %LFG',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %LIG',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %MWH',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %NG',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %NUC',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %OBG',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %OBL',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %OBS',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %OG',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %OTH',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %PC',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %PUR',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %RFO',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %SC',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %SUB',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %SUN',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %WAT',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %WDS',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %WH',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %WND',
 u'TOTAL_FUEL_CONSUMPTION_QUANTITY %WO',
 u'TOT_MMBTU_APR',
 u'TOT_MMBTU_AUG',
 u'TOT_MMBTU_DEC',
 u'TOT_MMBTU_FEB',
 u'TOT_MMBTU_JAN',
 u'TOT_MMBTU_JUL',
 u'TOT_MMBTU_JUN',
 u'TOT_MMBTU_MAR',
 u'TOT_MMBTU_MAY',
 u'TOT_MMBTU_NOV',
 u'TOT_MMBTU_OCT',
 u'TOT_MMBTU_SEP',
 'YEAR_y']

Loading EIA 860 Data


In [26]:
# Iterate through the directory to find all the files to import
path = os.path.join('EIA Data', '860-No_Header')
full_path = os.path.join(path, '*.*')

eia860Names = os.listdir(path)

# Rename the keys for easier merging later
fileName860Map = {  'GenY07.xls':2007,
                    'GenY08.xls':2008,
                    'GeneratorY09.xls':2009,
                    'GeneratorsY2010.xls':2010,
                    'GeneratorY2011.xlsx':2011,
                    'GeneratorY2012.xlsx':2012,
                    '3_1_Generator_Y2013.xlsx':2013,
                    '3_1_Generator_Y2014.xlsx':2014,
                    '3_1_Generator_Y2015.xlsx':2015}

#Load the files into data frames, one df per file
eia860Dict = {fileName860Map[fn]:pd.read_excel(os.path.join(path, fn)) for fn in eia860Names}

In [27]:
#Dict of values to replace to standardize column names across all dataframes
renameDict = {  "PLNTCODE":"PLANT_ID",
                "PLANT_CODE":"PLANT_ID",
                "Plant Code":"PLANT_ID",
                "NAMEPLATE":"NAMEPLATE_CAPACITY(MW)",
                "Nameplate Capacity (MW)":"NAMEPLATE_CAPACITY(MW)"}

#Replace the column name
def rename860(col):
    for old, new in renameDict.iteritems():
        col = col.replace(old, new)
    return col

#Iterate through each column name of each dataframe to standardize and select columns 'PLANT_ID', 'NAMEPLATE_CAPACITY(MW)'
for key, df in eia860Dict.iteritems():
    colNames = [rename860(col) for col in df.columns]
    eia860Dict[key].columns = colNames
    eia860Dict[key] = eia860Dict[key][["PLANT_ID", "NAMEPLATE_CAPACITY(MW)"]]

In [28]:
# verify the tables
for key, df in eia860Dict.iteritems():
    print key, df.columns, len(df)


2007 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 19060
2008 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 19558
2009 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 17876
2010 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 18150
2011 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 18530
2012 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 19023
2013 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 19243
2014 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 19745
2015 Index([u'PLANT_ID', u'NAMEPLATE_CAPACITY(MW)'], dtype='object') 20068

In [29]:
# Iterate through each dataframe to perform aggregation by PLANT_ID
for key, df in eia860Dict.iteritems():
    gb = df.groupby(by='PLANT_ID').apply(lambda x: x['NAMEPLATE_CAPACITY(MW)'].sum())
    eia860Dict[key]['NAMEPLATE_CAPACITY(MW)'] = eia860Dict[key].PLANT_ID.apply(gb.get_value)
    eia860Dict[key] = eia860Dict[key].drop_duplicates(subset=['PLANT_ID', 'NAMEPLATE_CAPACITY(MW)'])
    eia860Dict[key] = eia860Dict[key].sort_values(by='PLANT_ID').reset_index(drop=True)

Export EIA 860 data


In [41]:
filename = 'EIA 860.pkl'
path = '../Clean Data'
fullpath = os.path.join(path, filename)

pickle.dump(eia860Dict, open(fullpath, 'wb'))

Creating Final DataFrame for Clustering Algorithm:

clusterDict {year : cluster_DF}

  1. For each PLANT_ID in aggEIADict, fetch the corresponding aggregated NAMEPLATE_CAPACITY(MW)

In [31]:
clusterDict = dict()
for key, df in eia860Dict.iteritems():
    clusterDict[key] = pd.merge(aggEIADict[key], eia860Dict[key], how='left', on='PLANT_ID')[['PLANT_ID', 'NAMEPLATE_CAPACITY(MW)']]
    clusterDict[key].rename(columns={'NAMEPLATE_CAPACITY(MW)': 'capacity', 'PLANT_ID': 'plant_id'}, inplace=True)

In [32]:
# verify for no loss of data
for key, df in eia860Dict.iteritems():
    print key, len(clusterDict[key]), len(aggEIADict[key])


2007 216 216
2008 234 234
2009 245 245
2010 253 253
2011 257 257
2012 274 274
2013 282 282
2014 291 291
2015 312 312

In [33]:
clusterDict[2015].head()


Out[33]:
plant_id capacity
0 3584 2.4
1 3585 2.4
2 3586 2.4
3 6147 1980.0
4 58372 135.4

Function to get fuel type


In [34]:
fuel_cols = [col for col in aggEIADict[2008].columns if 'ELEC_FUEL_CONSUMPTION_MMBTU %' in col]

def top_fuel(row):
    #Fraction of largest fuel for electric heat input 
    try:
        fuel = row.idxmax()[29:]
    except:
        return None
    return fuel

# clusterDict[2008]['fuel'] = aggEIADict[2008][fuel_cols].apply(top_fuel, axis=1)

Calculate Capacity factor, Efficiency, Fuel type


In [35]:
for key, df in clusterDict.iteritems():
    clusterDict[key]['year'] = key
    clusterDict[key]['capacity_factor'] = aggEIADict[key]['NET_GENERATION_(MEGAWATTHOURS)'] / (8670*clusterDict[key]['capacity'])
    clusterDict[key]['efficiency'] = (aggEIADict[key]['NET_GENERATION_(MEGAWATTHOURS)']*3.412)/(1.0*aggEIADict[key]['ELEC_FUEL_CONSUMPTION_MMBTU'])
    clusterDict[key]['fuel_type'] = aggEIADict[key][fuel_cols].apply(top_fuel, axis=1)
    clusterDict[key] = clusterDict[key][clusterDict[key]['fuel_type'].isin(['SUB', 
                                                                            'LIG', 
                                                                            'DFO',
                                                                            'NG', 
                                                                            'PC'])]

Merge all epa files in one df


In [36]:
columns = ['PLANT_ID', 'YEAR', 'DATE', 'HOUR', 'GROSS LOAD (MW)']
counter = 0
for key, df in epaDict.iteritems():
    if counter == 0:
        result = epaDict[key][columns]
        counter = 1
    else:
        result = result.append(epaDict[key][columns], ignore_index=True)
        
# Change nan to 0
result.fillna(0, inplace=True)

In [37]:
result.describe()


Out[37]:
PLANT_ID YEAR HOUR GROSS LOAD (MW)
count 4.511880e+06 4.511880e+06 4.511880e+06 4.511880e+06
mean 1.006244e+04 2.010944e+03 1.150000e+01 3.000693e+02
std 1.616199e+04 2.613026e+00 6.922187e+00 5.057269e+02
min 1.270000e+02 2.007000e+03 0.000000e+00 0.000000e+00
25% 3.477000e+03 2.009000e+03 5.750000e+00 0.000000e+00
50% 3.611000e+03 2.011000e+03 1.150000e+01 4.100000e+01
75% 6.183000e+03 2.013000e+03 1.725000e+01 4.180000e+02
max 5.667400e+04 2.015000e+03 2.300000e+01 3.871000e+03

Function to calculate the ramp rate for every hour


In [38]:
def plant_gen_delta(df):
    """
    For every plant in the input df, calculate the change in gross load (MW)
    from the previous hour.
    
    input:
        df: dataframe of EPA clean air markets data
    return:
        df: concatanated list of dataframes
    """
    df_list = []
    for plant in df['PLANT_ID'].unique():
        temp = df.loc[df['PLANT_ID'] == plant,:]
        gen_change = temp.loc[:,'GROSS LOAD (MW)'].values - temp.loc[:,'GROSS LOAD (MW)'].shift(1).values
        temp.loc[:,'Gen Change'] = gen_change
        df_list.append(temp)
    return pd.concat(df_list)

In [39]:
ramp_df = plant_gen_delta(result)


/Users/Home/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:296: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
/Users/Home/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:476: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s

In [40]:
ramp_df.describe()


Out[40]:
PLANT_ID YEAR HOUR GROSS LOAD (MW) Gen Change
count 4.511880e+06 4.511880e+06 4.511880e+06 4.511880e+06 4.511811e+06
mean 1.006244e+04 2.010944e+03 1.150000e+01 3.000693e+02 8.391309e-04
std 1.616199e+04 2.613026e+00 6.922187e+00 5.057269e+02 5.223558e+01
min 1.270000e+02 2.007000e+03 0.000000e+00 0.000000e+00 -1.996000e+03
25% 3.477000e+03 2.009000e+03 5.750000e+00 0.000000e+00 0.000000e+00
50% 3.611000e+03 2.011000e+03 1.150000e+01 4.100000e+01 0.000000e+00
75% 6.183000e+03 2.013000e+03 1.725000e+01 4.180000e+02 0.000000e+00
max 5.667400e+04 2.015000e+03 2.300000e+01 3.871000e+03 2.014000e+03

Get the max ramp rate for every plant for each year


In [41]:
cols = ['PLANT_ID', 'YEAR', 'Gen Change']

ramp_rate_list = []
for year in ramp_df['YEAR'].unique():
    for plant in ramp_df.loc[ramp_df['YEAR']==year,'PLANT_ID'].unique():
        # 95th percentile ramp rate per plant per year
        ramp_95 = ramp_df.loc[(ramp_df['PLANT_ID']== plant) & 
                              (ramp_df['YEAR']==year),'Gen Change'].quantile(0.95, interpolation='nearest')
        ramp_rate_list.append([plant, year, ramp_95])

In [42]:
ramp_rate_df = pd.DataFrame(ramp_rate_list, columns=['plant_id', 'year', 'ramp_rate'])

In [43]:
ramp_rate_df.describe()


Out[43]:
plant_id year ramp_rate
count 521.000000 521.000000 521.000000
mean 10039.737044 2010.934741 52.230326
std 16157.049504 2.615700 57.270081
min 127.000000 2007.000000 0.000000
25% 3477.000000 2009.000000 6.000000
50% 3611.000000 2011.000000 34.000000
75% 6181.000000 2013.000000 83.000000
max 56674.000000 2015.000000 373.000000

In [44]:
for key, df in clusterDict.iteritems():
    clusterDict[key] = pd.merge(clusterDict[key], ramp_rate_df, how='left', on=['plant_id', 'year'])

In [45]:
clusterDict[2010].head()


Out[45]:
plant_id capacity year capacity_factor efficiency fuel_type ramp_rate
0 6147 1980.0 2010 0.783340 0.306713 SUB 184.0
1 3601 639.0 2010 0.158669 0.294749 NG 67.0
2 10243 44.2 2010 0.570003 0.658194 NG NaN
3 10261 25.7 2010 0.843285 0.721433 NG NaN
4 55320 746.0 2010 0.772931 0.598151 NG NaN

In [46]:
# Check plants larger than 25MW, which is the lower limit for EPA
clusterDict[2010][clusterDict[2010].capacity >=25].describe()


Out[46]:
plant_id capacity year capacity_factor efficiency ramp_rate
count 123.000000 123.000000 123.0 123.000000 123.000000 50.000000
mean 28813.487805 668.199187 2010.0 0.370436 0.406640 60.500000
std 24432.017024 560.823054 0.0 0.269981 0.147277 59.216397
min 127.000000 25.700000 2010.0 0.000542 0.165060 0.000000
25% 3630.500000 242.650000 2010.0 0.088122 0.294307 16.500000
50% 10692.000000 590.600000 2010.0 0.394855 0.359618 52.500000
75% 55127.500000 922.900000 2010.0 0.582822 0.462692 93.000000
max 57504.000000 4008.400000 2010.0 0.922401 0.810491 328.000000

In [47]:
for key in clusterDict.keys():
    print key, clusterDict[key].plant_id.count(), clusterDict[key].ramp_rate.count()


2007 143 52
2008 139 52
2009 138 49
2010 139 50
2011 135 50
2012 139 50
2013 141 50
2014 140 51
2015 139 52

Save dict to csv


In [48]:
# re-arrange column order
columns = ['year', 'plant_id', 'capacity', 'capacity_factor', 'efficiency', 'ramp_rate', 'fuel_type']

filename = 'Cluster_Data_2.csv'
path = '../Clean Data'
fullpath = os.path.join(path, filename)

counter = 0
for key, df in clusterDict.iteritems():
    # create the csv file
    if counter == 0:
        df[columns].sort_values(by='plant_id').to_csv(fullpath, sep=',',  index = False)
        counter += 1
    # append to existing csv file
    else:
        df[columns].sort_values(by='plant_id').to_csv(fullpath, sep=',',  index = False, header=False, mode = 'a')

In [ ]:

Assumptions

  1. Plant capacity changes at the start of the year and is constant for the entire year
  2. Same for ramp rate - no changes over the course of the year