In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import geopandas as gpd
# from shapely.geometry import Point
# from geopandas import GeoDataFrame
import os
from os.path import join
import pandas as pd
import fiona
# from cartopy import crs as ccrs
# from urllib.request import urlopen
# from zipfile import ZipFile
# from io import BytesIO
# from pathlib import Path
sns.set(style='white')
cwd = os.getcwd()
data_path = join(cwd, '..', '..', 'Data storage')
figure_path = join(cwd, '..', '..', 'Figures')
file_date = '2018-03-06'
In [3]:
%load_ext watermark
%watermark -v -p pandas,geopandas,shapely
Link to the shapefile for local download
In [3]:
# EIA NERC region shapefile, which has an "Indeterminate" region
# path = os.path.join(data_path, 'NERC_Regions_EIA', 'NercRegions_201610.shp')
# regions = gpd.read_file(path)
# regions.crs
Out[3]:
In [149]:
path = os.path.join(data_path, 'nercregions', 'NERCregions.shp')
regions_nerc = gpd.read_file(path)
regions_nerc['nerc'] = regions_nerc['NERCregion']
In [150]:
regions_nerc
Out[150]:
In [151]:
regions_nerc.boundary.plot()
Out[151]:
Now using NERC region shapefiles created by DHS
In [133]:
path = os.path.join(data_path, 'NERC_Regions', 'NERC_Regions.shp')
regions = gpd.read_file(path)
regions.columns = regions.columns.str.lower()
regions['nerc'] = regions['name'].str.split().str[-1].str.strip('()')
# regions = regions.loc[:, ['nerc', 'geometry']]
regions.head()
Out[133]:
In [136]:
regions.loc[regions['nerc']=='WECC'].plot(cmap='tab20', alpha=0.5)
Out[136]:
In [ ]:
regions.plot()
In [132]:
regions.to_crs(epsg=2163).plot(cmap='tab20', alpha=0.5,
linewidth=0.5, edgecolor='0.1')
Out[132]:
In [6]:
# This is slow!
regions_nerc = regions.dissolve(by='nerc', as_index=False)
In [8]:
regions_nerc.to_crs(epsg=2163).plot()
Out[8]:
In [ ]:
path = join(data_path, 'NERC_Regions', 'Aggregated NERC Regions.geojson')
regions_nerc.to_file(path, driver='GeoJSON')
In [ ]:
fiona.open()
In [5]:
path = join(data_path, 'NERC_Regions', 'Aggregated NERC Regions.geojson')
regions_nerc = gpd.read_file(path, driver='GeoJSON')
In [4]:
path = os.path.join(data_path, 'cb_2016_us_state_20m', 'cb_2016_us_state_20m.shp')
states = gpd.read_file(path)
states.crs
Out[4]:
In [6]:
drop_states = ['Alaska', 'Hawaii', 'Puerto Rico']
states = states.loc[~states['NAME'].isin(drop_states)]
In [7]:
states.to_crs(epsg=2163).plot()
Out[7]:
In [8]:
path = join(data_path, 'final NERC data',
'Summary table {}.csv'.format(file_date))
index = pd.read_csv(path)
index
Out[8]:
In [152]:
for nerc in regions_nerc['nerc'].unique():
try:
val_2017 = index.loc[index['nerc']==nerc, '2017'].values[0]
val_2001 = index.loc[index['nerc']==nerc, '2001'].values[0]
reduce = index.loc[index['nerc']==nerc, 'Percent Reduction'].values[0]
# print(val)
regions_nerc.loc[regions_nerc['nerc']==nerc, 2017] = val_2017
regions_nerc.loc[regions_nerc['nerc']==nerc, 2001] = val_2001
regions_nerc.loc[regions_nerc['nerc']==nerc, 'reduction'] = '{:.0%}'.format(reduce)
regions_nerc.loc[regions_nerc['nerc']==nerc, 'reduction value'] = reduce
except:
pass
In [153]:
regions_nerc
Out[153]:
In [154]:
regions_albers = regions_nerc.to_crs(epsg=2163)
states_albers = states.to_crs(epsg=2163)
In [155]:
regions_albers.plot(column=2001, cmap='cividis_r', edgecolor='0.1', linewidth=1)
Out[155]:
In [169]:
def plot_nerc_annual(regions_proj, states_proj, data_col, text_col,
cmap='cividis_r', vmin=None, vmax=None, title=None,
cbar_title=None, **kwargs):
states_ec = kwargs.get('states_ec', '0.6')
regions_ec = kwargs.get('regions_ec', '0.2')
regions_lw = kwargs.get('regions_lw', 0.75)
font_size = kwargs.get('font_size', 9)
bbox_alpha = kwargs.get('bbox_alpha', 0.7)
FRCC_x = kwargs.get('FRCC_x', 4.75)
SERC_x = kwargs.get('SERC_x', 2)
SERC_y = kwargs.get('SERC_y', -2)
SPP_y = kwargs.get('SPP_y', 1.75)
RFC_y = kwargs.get('RFC_y', -0.5)
fig, ax = plt.subplots(figsize=(8,3.5))
# set aspect to equal. This is done automatically
# when using *geopandas* plot on it's own, but not when
# working with pyplot directly.
ax.set_aspect('equal')
regions_proj.plot(ax=ax, column=data_col, cmap=cmap, legend=True,
vmin=vmin, vmax=vmax)
states_proj.plot(ax=ax, color='none', edgecolor=states_ec)
regions_proj.plot(ax=ax, color='none', edgecolor=regions_ec,
linewidth=regions_lw)
# plt.text(x=1.1, y=1.01, s=cbar_title, transform=ax.transAxes,
# ha='center', va='bottom', fontdict={'size':font_size})
plt.title(title)
for point, nerc in zip(regions_proj.centroid, regions_proj['nerc'].values):
text = regions_proj.loc[regions_proj['nerc']==nerc, text_col].values[0]
# text = '{}'.format(nerc, reduce)
x = point.x
y = point.y
if nerc == 'FRCC':
x = x + conv_lon(4.75)#-79
y = y - conv_lat(1)#28
rot = -67
plt.text(x, y, text, ha='center', va='center',
fontdict={'size':font_size})
elif nerc == 'NPCC':
x = x - conv_lon(1.5)
y = y + conv_lat(2.1)
plt.text(x, y, text, ha='center',
fontdict={'size':font_size})
elif nerc == 'SERC':
x = x + conv_lon(SERC_x)
y = y + conv_lat(SERC_y)
plt.text(x, y, text, ha='center', va='center',
bbox=dict(facecolor='white',
alpha=bbox_alpha,
boxstyle="square"),
fontdict={'size':font_size})
elif nerc == 'RFC':
# x = x + conv_lon(RFC_x)
y = y + conv_lat(RFC_y)
plt.text(x, y, text, ha='center', va='center',
bbox=dict(facecolor='white',
alpha=bbox_alpha,
boxstyle="square"),
fontdict={'size':font_size})
elif nerc == 'SPP':
# x = x + 2
y = y + conv_lat(SPP_y)
plt.text(x, y, text, ha='center', va='center',
bbox=dict(facecolor='white',
alpha=bbox_alpha,
boxstyle="square"),
fontdict={'size':font_size})
else:
plt.text(x, y, text, ha='center', va='center',
bbox=dict(facecolor='white',
alpha=bbox_alpha,
boxstyle="square"),
fontdict={'size':font_size})
sns.despine(left=True, bottom=True)
ax.set_yticklabels(labels=[])
ax.set_xticklabels(labels=[])
cax = fig.get_axes()[-1]
cax.set_title(cbar_title, fontdict={'size':font_size})
In [16]:
# https://gist.github.com/springmeyer/871897
def conv_lon(x):
newx = x * 20037508.34 / 180
return newx
def conv_lat(y):
newy = np.log(np.tan((90 + y) * np.pi / 360)) / (np.pi / 180)
newy *= 20037508.34 / 180
return newy
In [170]:
title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
kwargs = dict(
regions_lw = 1,
regions_ec = '0.1',
SERC_y = -1.5,
SPP_y = 2.25
)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col=2001,
text_col='nerc', vmin=vmin, vmax=vmax, title=title,
cbar_title='g CO$_2$/kWh', **kwargs)
path = join(figure_path, 'NERC map_cividis_2001.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
In [171]:
title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
kwargs = dict(
regions_lw = 1,
regions_ec = '0.1',
SERC_y = -1.5,
SPP_y = 2.25
)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col=2017,
text_col='arrow reduction', vmin=vmin, vmax=vmax, title=title,
cbar_title='g CO$_2$/kWh', **kwargs)
path = join(figure_path, 'NERC map_cividis_2017_change.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
In [443]:
title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col=2001,
text_col='NERC', vmin=vmin, vmax=vmax, cmap='Reds',
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_reds_2001.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
In [444]:
title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col=2017,
text_col='arrow reduction', vmin=vmin, vmax=vmax, cmap='Reds',
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_reds_2017_change.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
In [448]:
title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
vmax = max(abs(regions_albers['2017 diff'].min()), regions_albers['2017 diff'].max())
vmin = -vmax
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col='2001 diff',
text_col='NERC', cmap='PRGn_r', vmin=vmin, vmax=vmax,
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_2001_2017_bounds.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
In [446]:
title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmax = max(abs(regions_albers['2017 diff'].min()), regions_albers['2017 diff'].max())
vmin = -vmax
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col='2017 diff',
text_col='arrow reduction', cmap='PRGn_r', vmin=vmin, vmax=vmax,
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_2017.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
In [447]:
title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmax = max(abs(regions_albers['2017 diff'].min()), regions_albers['2017 diff'].max())
vmin = -vmax
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col='2017 diff',
text_col='arrow reduction', cmap='BrBG_r', vmin=vmin, vmax=vmax,
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_BrBG_2017.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
In [ ]:
In [12]:
path = os.path.join(data_path, 'Facility gen fuels and CO2 2017-08-31.zip')
facility_df = pd.read_csv(path)
In [7]:
print(len(facility_df['plant id'].unique()), 'total plants')
print(len(facility_df.loc[facility_df['lat'].isnull(), 'plant id'].unique()),
'plants without lat/lon')
In [8]:
years = facility_df.loc[facility_df['lat'].isnull(), 'year'].unique()
for year in years:
total_gen = facility_df.loc[facility_df['year'] == year, 'generation (MWh)'].sum()
# Plant ids with no 'lat' in year
no_loc_plants = facility_df.loc[(facility_df['lat'].isnull()) &
(facility_df['year'] == year), 'plant id'].unique()
no_loc_gen = facility_df.loc[(facility_df['year'] == year) &
(facility_df['plant id'].isin(no_loc_plants)),
'generation (MWh)'].sum()
percent_dropped = no_loc_gen / total_gen * 100
print('In {}, {:.3f}% of generation is from plants without lat/lon'.format(year, percent_dropped))
In [9]:
facility_df.dropna(inplace=True, subset=['lat', 'lon'])
In [10]:
facility_df.columns
Out[10]:
Because I have monthly data for every facility from 2001-2017, there are lots of duplicate rows. No need to do a spatial join on every row. Just keep one instance of each facility in each year.
In [11]:
cols = ['lat', 'lon', 'plant id', 'year']
small_facility = facility_df.loc[:, cols].drop_duplicates()
Use Point
from Shapely to create the geometry
list of facility locations. crs
is the coordinate reference system that translates lat/lon into a specific map projection.
In [12]:
geometry = [Point(xy) for xy in zip(small_facility.lon, small_facility.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(small_facility, crs=crs, geometry=geometry)
In [13]:
geo_df.head()
Out[13]:
In [14]:
len(geo_df)
Out[14]:
Joining the 9 regions (NERC) with 100,810 records takes much longer with facilities as the left dataframe in the join (2 min vs 12 seconds). Not quite sure why this is. The faster method leaves me with polygons rather than points tho. There might be a better way to rectify this, but I just create a new geodataframe with the geometry set as the lat/lon points.
EDIT
Although the GeoPandas documentation says that the op
parameter doesn't matter for point-in-polygon operations, using 'within' made the operation much faster (12 seconds). The first method below is probably preferable since it keeps the point geometry.
Method 1 (slow unless you use op='within'
)
In [15]:
facility_nerc = gpd.sjoin(geo_df, regions, how='inner', op='within')
In [16]:
facility_nerc.head()
Out[16]:
Method 2 (faster when using the default operation)
In [10]:
facility_nerc = gpd.sjoin(df, geo_df, how="inner")
In [11]:
facility_nerc.head()
Out[11]:
Make the new geometry of facility locations
In [12]:
geometry = [Point(xy) for xy in zip(facility_nerc.lon, facility_nerc.lat)]
Create new dataframe with the data I want to keep and the new geometry
In [13]:
crs = {'init': 'epsg:4326'}
keep_cols = ['NERC_Label', 'plant id', 'year']
facility_nerc = GeoDataFrame(facility_nerc[keep_cols], crs=crs, geometry=geometry)
In [14]:
facility_nerc.head()
Out[14]:
In [31]:
df_test = df.to_crs({'init': 'epsg:3395'})
In [32]:
df.plot()
Out[32]:
In [33]:
df_test.plot()
Out[33]:
If you need to load the shapefile, download it here.
In [ ]:
# If loading the shapefile
shape_path = ''
regions = gpd.read_file(shape_path)
In [3]:
path = join(data_path, 'Facility labels', 'Facility locations.csv')
location = pd.read_csv(path)
In [20]:
len(location)
Out[20]:
The nerc
column is labels from a spatial join using this same shapefile and lat/lon data in QGIS.
In [21]:
location.head()
Out[21]:
In [22]:
location.loc[location['lat'].isnull()]
Out[22]:
In [4]:
geometry = [Point(xy) for xy in zip(location.lon, location.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(location, crs=crs, geometry=geometry)
In [5]:
geo_df.head()
Out[5]:
Every plant id is unique
In [24]:
len(geo_df)
Out[24]:
In [25]:
len(geo_df['plant id'].unique())
Out[25]:
In [26]:
df1 = gpd.sjoin(regions, geo_df)
In [27]:
df1.head()
Out[27]:
We can already see that there are more rows in this dataframe than there were power plants.
In [28]:
len(df1)
Out[28]:
In [6]:
df2 = gpd.sjoin(geo_df, regions, op='within')
In [31]:
df2.head()
Out[31]:
Same size dataframe as from Method 1. Yet again, there appear to be extra results.
In [26]:
len(df2)
Out[26]:
In [39]:
df3 = gpd.sjoin(regions, geo_df, op='contains')
In [40]:
df3.head()
Out[40]:
Still too many results
In [41]:
len(df3)
Out[41]:
In [32]:
regions
Out[32]:
In [23]:
frcc = regions.loc[1, 'geometry']
serc = regions.loc[5, 'geometry']
I figured out that plant id 641 is one that shows up in both FRCC and SERC
In [27]:
plant_641 = geo_df.loc[geo_df['plant id'] == 641, 'geometry'].values[0]
In [28]:
frcc.contains(plant_641)
Out[28]:
In [29]:
serc.contains(plant_641)
Out[29]:
So 641 definitely is in both NERC regions. And apparently the regions overlap? But they aren't supposed to. And I didn't have any plants show up in multiple regions when doing the spatial join in QGIS.
In [37]:
frcc.intersects(serc)
Out[37]:
There are 36 plants with duplicate NERC regions.
In [38]:
len(df2.loc[df2['plant id'].duplicated()].sort_values('plant id'))
Out[38]:
In [10]:
pd.set_option('display.max_rows', 200)
In [11]:
df2.loc[df2['plant id'].duplicated(keep=False)].sort_values('plant id')
Out[11]:
In [17]:
facility_df.head()
Out[17]:
In [49]:
plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon']]
plants.drop_duplicates(inplace=True)
In [24]:
plants.groupby(['plant id']).max().hist()
Out[24]:
In [116]:
path = join(data_path, 'EIA downloads', 'EIA-860 2015', '2___Plant_Y2015.xlsx')
nercs2015 = pd.read_excel(path, skiprows=0, parse_cols='C,L')
nercs2015.columns = ['plant id', 'nerc']
path = join(data_path, 'EIA downloads', 'eia8602012', 'PlantY2012.xlsx')
nercs2012 = pd.read_excel(path, skiprows=0, parse_cols='B,J')
nercs2012.columns = ['plant id', 'nerc']
In [117]:
nercs2012.head()
Out[117]:
In [118]:
nercs = pd.concat([nercs2012, nercs2015]).drop_duplicates()
In [119]:
len(nercs), len(nercs2012), len(nercs2015)
Out[119]:
In [120]:
df = pd.merge(plants, nercs, on=['plant id'], how='left')
In [121]:
plants.tail()
Out[121]:
In [122]:
nercs.tail()
Out[122]:
In [123]:
nercs.nerc.unique()
Out[123]:
In [67]:
df.tail()
Out[67]:
In [63]:
len(df.loc[df.isnull().any(axis=1), 'plant id'].unique())
Out[63]:
In [148]:
from sklearn import neighbors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
In [124]:
nercs = ['SERC', 'RFC', 'SPP', 'NPCC', 'WECC', 'MRO', 'TRE', 'FRCC']
In [125]:
df_slim = df.loc[df.nerc.isin(nercs), ['plant id', 'lat', 'lon', 'nerc']].dropna(subset=['lon']).drop_duplicates()
In [126]:
unknown = df_slim.loc[df_slim.nerc.isnull()]
In [69]:
n_neighbors = 10
In [155]:
X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']
X_scale = StandardScaler().fit_transform(X)
In [164]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
In [128]:
y.unique()
Out[128]:
In [165]:
knn = neighbors.KNeighborsClassifier()
In [ ]:
params = {'weights': ['uniform', 'distance'],
'n_neighbors': [10, 15, 20, 30],
'leaf_size': [3, 5, 10, 30],
'p': [1, 2]}
clf = GridSearchCV(knn, params, n_jobs=-1)
In [160]:
clf.fit(X_train, y_train)
Out[160]:
In [161]:
clf.best_estimator_, clf.best_params_, clf.best_score_
Out[161]:
No difference in score when X is preprocessed with StandardScaler
In [163]:
clf.score(X_scale, y)
Out[163]:
In [ ]:
In [102]:
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
clf.fit(X_train, y_train)
Out[102]:
In [103]:
clf.score(X_test, y_test)
Out[103]:
In [104]:
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf.fit(X_train, y_train)
Out[104]:
In [105]:
clf.score(X_test, y_test)
Out[105]:
In [95]:
h = .02 # step size in the mesh
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
# we create an instance of Neighbours Classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(X, y)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = %i, weights = '%s')"
% (n_neighbors, weights))
In [ ]: