This script downloads and extracts the original data of renewable power plant lists from the data sources, then processes and merges them. It subsequently adds the geolocation for each power plant. Finally, it saves the DataFrames as pickle files. Make sure you run the download and process notebook before the validation and output notebook.
In [ ]:
version = '2020-05-20'
In [ ]:
import logging
import os
import posixpath
import urllib.parse
import urllib.request
import re
import zipfile
import pickle
import urllib
import shutil
import datetime
import numpy as np
import pandas as pd
import utm # for transforming geoinformation in the utm format
import requests
import fake_useragent
from string import Template
from IPython.display import display
import xlrd
import bs4
import bng_to_latlon
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# for visualizing locations on maps
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.io import shapereader
import geopandas
import shapely
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%d %b %Y %H:%M:%S'
)
logger = logging.getLogger()
# Create input, intermediate and output folders if they don't exist.
# If the paths are relative, the corresponding folders will be created
# inside the current working directory.
input_directory_path = os.path.join('input', 'original_data')
intermediate_directory_path = 'intermediate'
output_directory_path = os.path.join('output', 'renewable_power_plants')
os.makedirs(input_directory_path, exist_ok=True)
os.makedirs(intermediate_directory_path, exist_ok=True)
os.makedirs(output_directory_path, exist_ok=True)
# Create the folder to which the Eurostat files with data at the level of the whole EU/Europe
# are going to be downloaded
eurostat_eu_directory_path = os.path.join('input', 'eurostat_eu')
os.makedirs(eurostat_eu_directory_path, exist_ok=True)
# Define the path of the file with the list of sources.
source_list_filepath = os.path.join('input', 'sources.csv')
# Import the utility functions and classes from the util package
import util.helper
from util.visualizer import visualize_points
The original data can either be downloaded from the original data sources as specified below or from the OPSD server. The default option is to download from the original sources, as the aim of the project is to stay as close to the original sources as possible. However, if problems with the downloads occur, e.g. due to changed URLs, you can still run the script with the original data from the OPSD server.
In [ ]:
download_from = 'original_sources'
#download_from = 'opsd_server'
The download link for the UK is updated at the end of each quarter by the source provider, BEIS. We keep up with those changes by extracting the download link automatically from the web page it is on. That way, the link does not have to be updated manually.
Note: you must be connected to the Internet if you want to execute this step.
In [ ]:
source_df = pd.read_csv(source_list_filepath)
uk_main_page = 'https://www.gov.uk/government/publications/renewable-energy-planning-database-monthly-extract'
current_link = util.helper.get_beis_link(uk_main_page)
current_filename = current_link.split('/')[-1]
source_df.loc[(source_df['country'] == 'UK') & (source_df['source'] == 'BEIS'), 'url'] = current_link
source_df.loc[(source_df['country'] == 'UK') & (source_df['source'] == 'BEIS'), 'filename'] = current_filename
source_df.to_csv(source_list_filepath, index=False, header=True)
#pd.set_option('display.max_colwidth', -1)
source_df.fillna('')
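The extraction itself is handled by util.helper.get_beis_link. For illustration only, a minimal sketch of how such a link could be located with requests and BeautifulSoup is shown below; the assumption that the first .csv link on the page is the wanted extract is ours, and the real helper may additionally use a custom user agent (see the fake_useragent import above).
In [ ]:
# Illustrative sketch (not the actual util.helper implementation):
# find the first link to a .csv file on the BEIS page.
def sketch_get_beis_link(page_url):
    response = requests.get(page_url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    for anchor in soup.find_all('a', href=True):
        if anchor['href'].lower().endswith('.csv'):
            # Return an absolute URL even if the page uses a relative link
            return urllib.parse.urljoin(page_url, anchor['href'])
    return None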
Note that, as of May 5, 2020, the following sources are available only from the OPSD server, and their data will be downloaded from it even if download_from is set to 'original_sources'. The original links of these sources are marked as inactive in the column active of the above dataframe.
In [ ]:
import util.downloader
from util.downloader import Downloader
downloader = Downloader(version, input_directory_path, source_list_filepath, download_from)
The NUTSConverter class in the util package uses the information on each facility's postcode, municipality name, municipality code, longitude, and latitude to assign it the correct NUTS 2016 level 1, 2, and 3 codes.
Here, we instantiate the converter so that we can use it later.
In [ ]:
from util.nuts_converter import NUTSConverter
nuts_converter = NUTSConverter(downloader, eurostat_eu_directory_path)
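For orientation, the converter is applied later in each country section roughly as sketched below; the exact how strategies and extra keyword arguments vary by country and are shown there.
In [ ]:
# Sketch of the usage pattern (the real calls appear in the country sections below):
# df = nuts_converter.add_nuts_information(df, 'DE', postcode2nuts_filepath,
#                                          how=['postcode', 'municipality_code', 'municipality', 'latlon'])
# Afterwards, df contains the columns nuts_1_region, nuts_2_region and nuts_3_region.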
Column and value names of the original data sources will be translated to English and standardized across different sources. Standardized column names, e.g. "electrical_capacity" are required to merge data in one DataFrame.
The column and the value translation lists are provided in the input folder of the Data Package.
In [ ]:
# Get column translation list
columnnames = pd.read_csv(os.path.join('input', 'column_translation_list.csv'))
columnnames.head(2)
In [ ]:
# Get value translation list
valuenames = pd.read_csv(os.path.join('input', 'value_translation_list.csv'))
valuenames.head(2)
For one country after another, the original data is downloaded, read, processed, translated, georeferenced where applicable, and saved. If the respective files are already present in the local folder, they will be used instead of being downloaded again.
The data is processed using pandas DataFrames.
The data which will be processed below is provided by the following data sources:
Netztransparenz.de - Official grid transparency platform from the German Transmission System Operators (TSOs): 50Hertz, Amprion, TenneT and TransnetBW.
Bundesnetzagentur (BNetzA) - German Federal Network Agency for Electricity, Gas, Telecommunications, Posts and Railway (In separate files for data for roof-mounted PV power plants and for all other renewable energy power plants.)
The data URL for BNetzA gets updated every few months. To be sure, always check whether the links (url_bnetza; url_bnetza_pv) are up to date.
In [ ]:
# Define the lists of source names
downloader = Downloader(version, input_directory_path, source_list_filepath, download_from)
tsos = ['50Hertz', 'Amprion', 'TenneT', 'TransnetBW']
datasets = ['50Hertz', 'Amprion', 'TenneT', 'TransnetBW','bnetza','bnetza_pv','bnetza_pv_historic']
# Download the files and get the local file paths indexed by source names
filepaths = downloader.download_data_for_country('DE')
# Remove the Eurostat NUTS file as it's a geoinformation source
DE_postcode2nuts_filepath = filepaths.pop('Eurostat')
In [ ]:
# Open all data sets before processing.
filenames = {}
for source in filepaths:
filepath = filepaths[source]
print(source, filepath)
if os.path.splitext(filepath)[1] != '.xlsx' and zipfile.is_zipfile(filepath):
filenames[source] = zipfile.ZipFile(filepath)
else:
filenames[source] = filepath
In [ ]:
# Read TSO data from the zip files
dfs = {}
basenames_by_tso = {
'50Hertz': '50Hertz Transmission GmbH EEG-Zahlungen Stammdaten 2018',
'Amprion': 'Netztransparenz Anlagenstammdaten 2018 Amprion GmbH',
'TenneT': 'TenneT TSO GmbH Anlagenstammdaten 2018',
'TransnetBW': 'TransnetBW_Anlagenstammdaten_2018',
}
for tso in tsos:
filename = basenames_by_tso[tso]+'.csv'
print('Reading', filename)
print(filenames[tso].namelist())
dfs[tso] = pd.read_csv(
filenames[tso].open(filename),
sep=';',
thousands='.',
decimal=',',
# Headers have to have the same order for all TSOs, therefore we just define them here.
# Remove the following lines if, for the next version, the headers should be read from
# the files first and then checked for identical order everywhere.
names=['EEG-Anlagenschlüssel','Netzbetreiber Betriebsnummer','Netzbetreiber Name',
'Strasse_flurstueck','PLZ','Ort / Gemarkung','Gemeindeschlüssel','Bundesland',
'Installierte Leistung','Energieträger','Spannungsebene','Leistungsmessung','Regelbarkeit',
'Inbetriebnahme','Außerbetriebnahme','Netzzugang','Netzabgang'],
header=None,
skiprows=1,
parse_dates=[13, 14, 15, 16], #[11, 12, 13, 14]
#infer_datetime_format=True,
date_parser = lambda x: pd.to_datetime(x, errors='coerce', format='%d.%m.%Y'),
encoding='iso-8859-1',
dayfirst=True,
low_memory=False
)
print('Done reading ' + filename)
for filename in filenames.values():
if(isinstance(filename, zipfile.ZipFile)):
#print(filename)
filename.close()
In [ ]:
# define the date parser
def date_parser(x):
    if type(x) == str:
        return datetime.datetime.strptime(x, '%d.%m.%Y')
    elif type(x) == float and pd.isnull(x):
        return pd.NaT

def inspect(x):
    # Return True if x cannot be parsed as a date in the format DD.MM.YYYY
    try:
        datetime.datetime.strptime(x, '%d.%m.%Y')
        return False
    except (ValueError, TypeError):
        return True
In [ ]:
# Read BNetzA register
print('Reading bnetza: '+filenames['bnetza'])
dfs['bnetza'] = pd.read_excel(filenames['bnetza'],
sheet_name='Gesamtübersicht',
header=0,
converters={'4.9 Postleit-zahl': str, 'Gemeinde-Schlüssel': str}
)
skiprows = {'bnetza_pv_historic': 10, 'bnetza_pv': 9}
for dataset in ['bnetza_pv', 'bnetza_pv_historic']:
print(dataset)
print('Reading ' + dataset + ': ' + filenames[dataset])
xls_handle = pd.ExcelFile(filenames[dataset])
print('Concatenating all '+dataset+' sheets into one dataframe')
dfs[dataset] = pd.concat(
(xls_handle.parse(
sheet,
skiprows=skiprows[dataset],
converters={'Anlage \nPLZ': str}
) for sheet in xls_handle.sheet_names),
sort=True
)
In [ ]:
# Make sure that the column `Inbetriebnahme-datum *)` (commissioning date) in the bnetza_pv set is datetime.
mask = dfs['bnetza_pv']['Inbetriebnahme-datum *)'].apply(lambda x: type(x) == int)
dfs['bnetza_pv']['Inbetriebnahme-datum *)'] = pd.to_datetime(dfs['bnetza_pv']['Inbetriebnahme-datum *)'],
errors='coerce',
dayfirst=True,
infer_datetime_format=True)
dfs['bnetza_pv']['Inbetriebnahme-datum *)'] = dfs['bnetza_pv']['Inbetriebnahme-datum *)'].apply(
lambda x: x.to_datetime64()
)
In [ ]:
dfs['bnetza_pv_historic'] = dfs['bnetza_pv_historic'].drop(['Unnamed: 7'], axis=1)
In [ ]:
pickle.dump( dfs, open( "intermediate/temp_dfs_DE_after_reading.pickle", "wb" ) )
In [ ]:
dfs = pickle.load( open( "intermediate/temp_dfs_DE_after_reading.pickle", "rb" ) )
To standardise the DataFrame, the original column names from the German TSOs and the BNetzA will be translated and new English column names will be assigned to the DataFrame. The unique column names are required to merge the DataFrames.
The column_translation_list is provided here as csv in the input folder. It is loaded in 2.3 Setup of translation dictionaries.
In [ ]:
# Choose the translation terms for Germany, create dictionary and show dictionary
columnnames = pd.read_csv(os.path.join('input', 'column_translation_list.csv'))
idx_DE = columnnames[columnnames['country'] == 'DE'].index
column_dict_DE = columnnames.loc[idx_DE].set_index('original_name')['opsd_name'].to_dict()
column_dict_DE
In [ ]:
# Start the column translation process for each original data source
print('Translation...')
for dataset in dfs:
# Remove newlines and any other duplicate whitespaces in column names:
dfs[dataset] = dfs[dataset].rename(columns={col: re.sub(r"\s+", ' ', col) for col in dfs[dataset].columns})
# Do column name translations
print(dataset)
#print(list(dfs[dataset].columns))
dfs[dataset].rename(columns=column_dict_DE, inplace=True)
#print(list(dfs[dataset].columns).index('decommissioning_date'))
#print('--------------------------------------------')
print('done.')
In [ ]:
# Add data source names to the DataFrames
for tso in tsos:
dfs[tso]['data_source'] = tso
dfs[tso]['tso'] = tso
dfs['bnetza']['data_source'] = 'BNetzA'
dfs['bnetza_pv']['data_source'] = 'BNetzA_PV'
dfs['bnetza_pv_historic']['data_source'] = 'BNetzA_PV_historic'
# Add for the BNetzA PV data the energy source level 2
dfs['bnetza_pv']['energy_source_level_2'] = 'Photovoltaics'
dfs['bnetza_pv_historic']['energy_source_level_2'] = 'Photovoltaics'
In [ ]:
# Select those columns of the original data which are utilised further
dfs['bnetza'] = dfs['bnetza'].loc[:, ('commissioning_date', 'decommissioning_date',
'notification_reason', 'energy_source_level_2',
'electrical_capacity_kW', 'thermal_capacity_kW',
'voltage_level', 'dso', 'eeg_id', 'bnetza_id',
'federal_state', 'postcode', 'municipality_code',
'municipality', 'address', 'address_number',
'utm_zone', 'utm_east', 'utm_north',
'data_source')]
In [ ]:
for dataset in datasets: print(dataset+':'); display(dfs[dataset].tail(2))
In [ ]:
# Merge DataFrames of each original source into a common DataFrame DE_renewables
dfs_list = []
for dataset in datasets:
dfs_list.append(dfs[dataset])
DE_renewables = pd.concat(dfs_list, sort=True)
DE_renewables.head(2)
In [ ]:
DE_renewables.reset_index(drop=True, inplace=True)
DE_renewables.head(2)
Different German terms for energy source level 2, energy source level 3, technology and voltage levels are translated and harmonized across the individual data sources. The value_translation_list is provided here as csv in the input folder. It is loaded in 2.3 Setup of translation dictionaries.
In [ ]:
# Choose the translation terms for Germany, create dictionary and show dictionary
valuenames = pd.read_csv(os.path.join('input', 'value_translation_list.csv'))
idx_DE = valuenames[valuenames['country'] == 'DE'].index
value_dict_DE = valuenames.loc[idx_DE].set_index('original_name')['opsd_name'].to_dict()
value_dict_DE
In [ ]:
print('replacing...')
# Replace all original value names by the OPSD value names.
# Running time: some minutes.
DE_renewables.replace(value_dict_DE, inplace=True)
print('Done!')
In [ ]:
DE_renewables['postcode'] = DE_renewables['postcode'].apply(pd.to_numeric, errors='ignore')
Separate and assign energy source level 1 - 3 and technology
In [ ]:
# Create dictionary in order to assign energy_source to its subtype
energy_source_dict_DE = valuenames.loc[idx_DE].set_index(
'opsd_name')['energy_source_level_2'].to_dict()
# Column energy_source partly contains energy source level 3 and technology information,
# thus this column is copied to new column technology...
DE_renewables['technology'] = DE_renewables['energy_source_level_2']
In [ ]:
# ...and the energy source level 2 values are replaced by the higher level classification
DE_renewables['energy_source_level_2'].replace(energy_source_dict_DE, inplace=True)
In [ ]:
# Choose energy source level 2 entries where energy_source is "Bioenergy" in order to
# separate Bioenergy subtypes to "energy_source_level_3" and subtypes for the rest to "technology"
idx_DE_Bioenergy = DE_renewables[DE_renewables['energy_source_level_2'] == 'Bioenergy'].index
# Assign technology to energy source level 3 for all entries where energy source level 2 is
# Bioenergy and delete those entries from technology
DE_renewables.loc[idx_DE_Bioenergy, 'energy_source_level_3'] = DE_renewables.loc[idx_DE_Bioenergy, 'technology']
DE_renewables.loc[idx_DE_Bioenergy, 'technology'] = np.nan
In [ ]:
# Assign energy source level 1 to the dataframe
DE_renewables['energy_source_level_1'] = 'Renewable energy'
In [ ]:
# Show the hierarchy of the energy types present in the frame
energy_columns = ['energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology']
DE_renewables[energy_columns].drop_duplicates().sort_values(by='energy_source_level_2')
According to the OPSD energy hierarchy, the power plants whose energy_source_level_2 is either Storage or Other fossil fuels do not belong to the class of renewable-energy facilities. Therefore, we can remove them.
In [ ]:
drop_mask = DE_renewables['energy_source_level_2'].isin(['Other fossil fuels', 'Storage'])
DE_renewables.drop(DE_renewables.index[drop_mask], axis=0, inplace=True)
Summary of DataFrame
In [ ]:
# Electrical capacity per energy source level 2 (in MW)
DE_renewables.groupby(['energy_source_level_2'])['electrical_capacity_kW'].sum() / 1000
In [ ]:
# kW to MW
DE_renewables[['electrical_capacity_kW', 'thermal_capacity_kW']] /= 1000
# adapt column name
DE_renewables.rename(columns={'electrical_capacity_kW': 'electrical_capacity',
'thermal_capacity_kW': 'thermal_capacity'}, inplace=True)
(for data with no existing geocoordinates)
The available post code in the original data provides a first approximation for the geocoordinates of the RE power plants.
The BNetzA data provides the full zip code, whereas, due to data privacy, the TSOs only report the first three digits of the power plant's post code (e.g. 024xx) and no address. Subsequently, the centroid of the post-code region polygon is used to approximate the coordinates.
With data from the source given in the licence note below, a CSV file for all existing German post codes with matching geocoordinates has been compiled. The latitude and longitude coordinates were generated by running a PostgreSQL + PostGIS database. Additionally, the respective TSO has been added to each post code. (A link to the SQL script will follow here later.)
(License: http://www.suche-postleitzahl.org/downloads, Open Database Licence for free use. Source of data: © OpenStreetMap contributors)
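To illustrate the centroid idea, the sketch below computes the centroid of a made-up rectangular polygon standing in for a postcode area; it is not part of the processing chain, which simply merges the precompiled CSV.
In [ ]:
# Illustrative sketch only: the coordinates of a postcode area are approximated
# by the centroid of its polygon. The rectangle below is a made-up stand-in.
from shapely.geometry import Polygon
example_postcode_area = Polygon([(13.0, 52.3), (13.8, 52.3), (13.8, 52.7), (13.0, 52.7)])
(example_postcode_area.centroid.y, example_postcode_area.centroid.x)  # (lat, lon)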
In [ ]:
# Read generated postcode/location file
postcode = pd.read_csv(os.path.join('input', 'de_tso_postcode_full.csv'))
# Drop possible duplicates in postcodes
postcode.drop_duplicates('postcode', keep='last', inplace=True)
# Show first entries
postcode.head(2)
Merge geometry information by using the postcode
In [ ]:
# Take postcode and longitude/latitude information
postcode = postcode[['postcode', 'lon', 'lat']]
# Cast DE_renewables['postcode'] to int64 in order to do the natural join of the dataframes
DE_renewables['postcode'] = pd.to_numeric(DE_renewables['postcode'], errors='coerce')
# Join two dataframes
DE_renewables = DE_renewables.merge(postcode, on=['postcode'], how='left')
(for data with already existing geoinformation)
In this section the existing geoinformation (in UTM format) will be transformed into latitude and longitude coordinates as a uniform standard for geoinformation.
The BNetzA data set offers UTM geoinformation in the columns utm_zone (UTM-Zonenwert), utm_east and utm_north. Most utm_east values include the utm_zone value 32 at the beginning of the number. In order to properly standardize and transform this geoinformation into latitude and longitude, it is necessary to remove this utm_zone value. For all UTM entries, utm_zone 32 is used by the BNetzA.
utm_zone | utm_east | utm_north | comment |
---|---|---|---|
32 | 413151.72 | 6027467.73 | proper coordinates |
32 | 32912159.6008 | 5692423.9664 | caused error by 32 |
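As a quick sanity check, the first sample row from the table above can be converted with the utm package imported earlier; the zone letter 'U' passed here is an assumption for Germany's latitude band, mirroring the call used in the conversion further below.
In [ ]:
# Illustrative check: convert the proper sample coordinates from the table above.
# Returns a (latitude, longitude) tuple.
utm.to_latlon(413151.72, 6027467.73, 32, 'U')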
How many different utm_zone values are in the data set?
In [ ]:
DE_renewables.groupby(['utm_zone'])['utm_zone'].count()
Remove the utm_zone "32" from the utm_east value
In [ ]:
# Find entries with 32 value at the beginning
idx_32 = (DE_renewables['utm_east'].astype(str).str[:2] == '32')
idx_notnull = DE_renewables['utm_east'].notnull()
# Remove 32 from utm_east entries
DE_renewables.loc[idx_32, 'utm_east'] = DE_renewables.loc[idx_32,
'utm_east'].astype(str).str[2:].astype(float)
In [ ]:
def convert_to_latlon(utm_east, utm_north, utm_zone):
try:
return utm.to_latlon(utm_east, utm_north, utm_zone, 'U')
except:
return ''
DE_renewables['latlon'] = DE_renewables.loc[idx_notnull, ['utm_east', 'utm_north', 'utm_zone']].apply(
lambda x: convert_to_latlon(x[0], x[1], x[2]), axis=1).astype(str)
Conversion UTM to latitude and longitude
In [ ]:
lat = []
lon = []
for row in DE_renewables['latlon']:
try:
# Split tuple format into the column lat and lon
row = row.lstrip('(').rstrip(')')
parts = row.split(',')
if(len(parts)<2):
raise Exception('This is not a proper tuple. So go to exception block.')
lat.append(parts[0])
lon.append(parts[1])
except:
# set NaN
lat.append(np.NaN)
lon.append(np.NaN)
DE_renewables['latitude'] = pd.to_numeric(lat)
DE_renewables['longitude'] = pd.to_numeric(lon)
# Add new values to DataFrame lon and lat
DE_renewables['lat'] = DE_renewables[['lat', 'latitude']].apply(
lambda x: x[1] if pd.isnull(x[0]) else x[0],
axis=1)
DE_renewables['lon'] = DE_renewables[['lon', 'longitude']].apply(
lambda x: x[1] if pd.isnull(x[0]) else x[0],
axis=1)
Check: missing coordinates by data source and type
In [ ]:
#DE_renewables[DE_renewables['data_source'] == '50Hertz'].to_excel('test.xlsx')
In [ ]:
print('Missing coordinates ', DE_renewables.lat.isnull().sum())
display(
DE_renewables[DE_renewables.lat.isnull()].groupby(
['energy_source_level_2','data_source']
)['data_source'].count()
)
print('Share of missing coordinates (note that NaN can mean it\'s all fine):')
DE_renewables[DE_renewables.lat.isnull()].groupby(
['energy_source_level_2','data_source']
)['data_source'].count() / DE_renewables.groupby(
['energy_source_level_2','data_source']
)['data_source'].count()
Remove temporary columns
In [ ]:
# drop the temporary latlon, latitude and longitude columns (lat/lon now hold the final coordinates)
DE_renewables.drop(['latlon', 'longitude', 'latitude'], axis=1, inplace=True)
Save temporary Pickle (to have a point to quickly return to if things break after this point):
In [ ]:
pickle.dump(DE_renewables, open( "intermediate/temp_dfs_DE_before_cleaning.pickle", "wb" ) )
In [ ]:
DE_renewables = pickle.load( open( "intermediate/temp_dfs_DE_before_cleaning.pickle", "rb" ) )
In [ ]:
# Remove out-of-range dates
# Keep only values between 1900 and 2100 to rule out outliers / wrong values.
# Also, Excel doesn't support dates before 1900.
mask = ((DE_renewables['commissioning_date']>pd.Timestamp('1900')) &
(DE_renewables['commissioning_date']<pd.Timestamp('2100')))
DE_renewables = DE_renewables[mask]
In [ ]:
DE_renewables['municipality_code'] = DE_renewables['municipality_code'].astype(str)
In [ ]:
# Remove spaces from municipality code
DE_renewables['municipality_code'] = DE_renewables['municipality_code'].str.replace(' ', '', regex=False)
In [ ]:
DE_renewables['municipality_code'] = pd.to_numeric(DE_renewables['municipality_code'], errors='coerce', downcast='integer')
In [ ]:
# Merge address and address_number
to_string = lambda x: str(x) if not pd.isnull(x) else ''
DE_renewables['address'] = DE_renewables['address'].map(to_string) + ' ' + DE_renewables['address_number'].map(to_string)
# Make sure that the column has no whitespaces at the beginning and the end
DE_renewables['address'] = DE_renewables['address'].str.strip()
# Remove the column with address numbers as it is not needed anymore
del DE_renewables['address_number']
In [ ]:
# Set up a temporary postcode column as a string column for joining with the appropriate NUTS correspondence table.
# The postcode column is numeric at this point, so astype(str) yields strings like '12345.0';
# str[:-2] removes the trailing '.0'.
DE_renewables['postcode_str'] = DE_renewables['postcode'].astype(str).str[:-2]
DE_renewables = nuts_converter.add_nuts_information(DE_renewables, 'DE', DE_postcode2nuts_filepath,
postcode_column='postcode_str',
how=['postcode', 'municipality_code', 'municipality', 'latlon'])
# Drop the temporary column
DE_renewables.drop('postcode_str', axis='columns', inplace=True)
# Report the number of facilities whose NUTS codes were successfully determined
determined = DE_renewables['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', DE_renewables.shape[0], 'facilities in DE.')
# Report the number of facilities whose NUTS codes could not be determined
not_determined = DE_renewables['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', DE_renewables.shape[0], 'facilities in DE.')
In [ ]:
visualize_points(DE_renewables['lat'],
DE_renewables['lon'],
'Germany',
categories=DE_renewables['energy_source_level_2']
)
In [ ]:
DE_renewables.to_pickle('intermediate/DE_renewables.pickle')
In [ ]:
del DE_renewables
The data which will be processed below is provided by the following data sources:
Energistyrelsen (ens) / Danish Energy Agency - The wind turbines register is released by the Danish Energy Agency.
Energinet.dk - The data on solar power plants is released by Energinet.dk, the Danish transmission system operator.
geonames.org - The postcode data from Denmark is provided by Geonames and licensed under a Creative Commons Attribution 3.0 license.
Eurostat - The data for converting information on municipalities, postcodes and geographic coordinates to NUTS 2016 classification codes.
In [ ]:
# Download the data for Denmark
filepaths = downloader.download_data_for_country('DK')
print(filepaths)
The function for reading the data on the wind turbines.
In [ ]:
def read_dk_wind_turbines(filepath, sheet_name):
# Read the data on Danish wind turbines from the sheet with the given name
# in the Excel file at the given path.
# Return the data as a pandas DataFrame.
book = xlrd.open_workbook(filepath)
sheet = book.sheet_by_name(sheet_name)
# Since the column names are in two rows, not one,
# collect them in two parts. The first part is
# fixed and contains column names.
header = []
for i in range(0, 16):
# Make sure that strings 1) do not contain the newline sign
# and 2) have no trailing blank spaces.
column_name = sheet.cell_value(17, i).replace("\n", "").strip()
header = header + [column_name]
# The second part is variable. It consists of two subparts:
# 1) previous years (type float)
# 2) the past months of the current year (type date)
# Reading previous years as column names
i = 16
cell = sheet.cell(16, i)
while cell.ctype == xlrd.XL_CELL_NUMBER:
column_name = str(int(cell.value))
header = header + [column_name]
i = i + 1
cell = sheet.cell(16, i)
# Reading the months of the current year as column names
while cell.ctype == xlrd.XL_CELL_DATE:
year, month, _, _, _, _ = xlrd.xldate_as_tuple(cell.value, book.datemode)
column_name = str("{}-{}".format(year, month))
header = header + [column_name]
i = i + 1
cell = sheet.cell(16, i)
# Add the final column for the total of the current year
header += ['{}-total'.format(header[-1].split('-')[0])]
# Skip the first 17 rows in the sheet. The rest contains the data.
df = pd.read_excel(filepath,
sheet_name=sheet_name,
skiprows=17,
skipfooter=3
)
#
#df.drop(df.columns[len(df.columns)-1], axis=1, inplace=True)
# Set the column names.
df.columns = header
return df
In [ ]:
# Get wind turbines data
wind_turbines_sheet_name = 'IkkeAfmeldte-Existing turbines'
DK_wind_filepath = filepaths['Energistyrelsen']
DK_wind_df = read_dk_wind_turbines(DK_wind_filepath,
wind_turbines_sheet_name
)
# Get photovoltaic data
DK_solar_filepath = filepaths['Energinet']
DK_solar_df = pd.read_excel(DK_solar_filepath,
sheet_name='Data',
skiprows=[0],
converters={'Postnr': str}
)
In [ ]:
# Choose the translation terms for Denmark, create dictionary and show dictionary
idx_DK = columnnames[columnnames['country'] == 'DK'].index
column_dict_DK = columnnames.loc[idx_DK].set_index('original_name')['opsd_name'].to_dict()
# Windows has problems reading the csv entry for east and north (DK).
# The reason might be a difference in how the csv is opened on Linux
# versus Windows.
column_dict_DK_temp = {}
for k, v in column_dict_DK.items():
column_dict_DK_temp[k] = v
if v == 'utm_east' or v == 'utm_north':
# merge 2 lines to 1
new_key = ''.join(k.splitlines())
column_dict_DK_temp[new_key] = v
column_dict_DK = column_dict_DK_temp
column_dict_DK
In [ ]:
# Replace column names based on column_dict_DK
DK_wind_df.rename(columns=column_dict_DK, inplace=True)
DK_solar_df.rename(columns=column_dict_DK, inplace=True)
In [ ]:
# Add names of the data sources to the DataFrames
DK_wind_df['data_source'] = 'Energistyrelsen'
DK_solar_df['data_source'] = 'Energinet.dk'
# Add energy source level 2 and technology for each of the two DataFrames
DK_wind_df['energy_source_level_2'] = 'Wind'
DK_solar_df['energy_source_level_2'] = 'Solar'
DK_solar_df['technology'] = 'Photovoltaics'
In [ ]:
# Choose the translation terms for Denmark, create dictionary and show dictionary
idx_DK = valuenames[valuenames['country'] == 'DK'].index
value_dict_DK = valuenames.loc[idx_DK].set_index('original_name')['opsd_name'].to_dict()
In [ ]:
# Replace all original value names by the OPSD value names
DK_wind_df.replace(value_dict_DK, inplace=True)
DK_solar_df.replace(value_dict_DK, inplace=True)
UTM32 to latitude and longitude (Data from Energistyrelsen)
The Energistyrelsen data set offers UTM geoinformation in the columns utm_east and utm_north, belonging to UTM zone 32. In this section, the existing geoinformation (in UTM format) will be transformed into latitude and longitude coordinates as a uniform standard for geoinformation.
In [ ]:
# Index for all values with utm information
idx_notnull = DK_wind_df['utm_east'].notnull()
In [ ]:
# Convert from UTM values to latitude and longitude coordinates
DK_wind_df['lonlat'] = DK_wind_df.loc[idx_notnull, ['utm_east', 'utm_north']
].apply(lambda x: utm.to_latlon(x[0],
x[1],
32,
'U'), axis=1).astype(str)
In [ ]:
# Split latitude and longitude in two columns
lat = []
lon = []
for row in DK_wind_df['lonlat']:
try:
# Split tuple format
# into the column lat and lon
row = row.lstrip('(').rstrip(')')
lat.append(row.split(',')[0])
lon.append(row.split(',')[1])
except:
# set NAN
lat.append(np.NaN)
lon.append(np.NaN)
DK_wind_df['lat'] = pd.to_numeric(lat)
DK_wind_df['lon'] = pd.to_numeric(lon)
# drop the lonlat column that contains both latitude and longitude
DK_wind_df.drop('lonlat', axis=1, inplace=True)
Postcode to lat/lon (WGS84) (for data from Energinet.dk)
The available post code in the original data provides an approximation for the geocoordinates of the solar power plants.
The postcode will be assigned to latitude and longitude coordinates with the help of the postcode table.
In [ ]:
# Get geo-information
zip_DK_geo = zipfile.ZipFile(filepaths['Geonames'])
# Read generated postcode/location file
DK_geo = pd.read_csv(zip_DK_geo.open('DK.txt'), sep='\t', header=None)
# add column names as defined in associated readme file
DK_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
DK_geo.drop_duplicates('postcode', keep='last', inplace=True)
DK_geo['postcode'] = DK_geo['postcode'].astype(str)
In [ ]:
# Add longitude/latitude information assigned by postcode (for Energinet.dk data)
DK_solar_df = DK_solar_df.merge(DK_geo[['postcode', 'lon', 'lat']],
on=['postcode'],
how='left')
In [ ]:
# Show number of units with missing coordinates separated by wind and solar
print('Missing Coordinates DK_wind', DK_wind_df.lat.isnull().sum(), 'out of', len(DK_wind_df.index))
print('Missing Coordinates DK_solar', DK_solar_df.lat.isnull().sum(), 'out of', len(DK_solar_df.index))
In [ ]:
zip_DK_geo.close()
In [ ]:
# Merge DataFrames for wind and solar into DK_renewables
dataframes = [DK_wind_df, DK_solar_df]
DK_renewables = pd.concat(dataframes, sort=False)
DK_renewables = DK_renewables.reset_index()
In [ ]:
# Assign energy source level 1 to the dataframe
DK_renewables['energy_source_level_1'] = 'Renewable energy'
In [ ]:
# Merge the address and address-number columns into one
to_string = lambda x: str(x) if not pd.isnull(x) else ""
DK_renewables['address'] = DK_renewables['address'].map(to_string) + " " + DK_renewables['address_number'].map(to_string)
In [ ]:
# Make sure that the column has no whitespaces at the beginning or the end
DK_renewables['address'] = DK_renewables['address'].str.strip()
In [ ]:
# Assign NUTS codes
DK_postcode2nuts = filepaths['Eurostat']
DK_renewables = nuts_converter.add_nuts_information(DK_renewables, 'DK', DK_postcode2nuts,
how=['latlon', 'postcode', 'municipality_code', 'municipality_name'])
# Report the number of facilities whose NUTS codes were successfully determined
determined = DK_renewables['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', DK_renewables.shape[0], 'facilities in DK.')
# Report the number of facilities whose NUTS codes could not be determined
not_determined = DK_renewables['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', DK_renewables.shape[0], 'facilities in DK.')
Let us check geoinformation on the facilities for which NUTS codes could not be determined.
In [ ]:
DK_renewables[DK_renewables['nuts_1_region'].isnull()][['municipality', 'municipality_code', 'lat', 'lon']]
As we see, no information on municipality or latitude/longitude coordinates is present for those power plants, so there was no way to assign them NUTS codes.
In [ ]:
# Select those columns of the original data which are utilised further
columns_of_interest = ['commissioning_date', 'energy_source_level_1', 'energy_source_level_2',
'technology', 'electrical_capacity_kW', 'dso', 'gsrn_id', 'postcode',
'municipality_code', 'municipality', 'address',
'utm_east', 'utm_north', 'lon', 'lat', 'nuts_1_region', 'nuts_2_region', 'nuts_3_region',
'hub_height', 'rotor_diameter', 'manufacturer', 'model', 'data_source']
In [ ]:
# Clean DataFrame from columns other than specified above
DK_renewables = DK_renewables.loc[:, columns_of_interest]
DK_renewables.reset_index(drop=True, inplace=True)
In [ ]:
# kW to MW
DK_renewables['electrical_capacity_kW'] /= 1000
# adapt column name
DK_renewables.rename(columns={'electrical_capacity_kW': 'electrical_capacity'},
inplace=True)
In [ ]:
visualize_points(DK_renewables['lat'],
DK_renewables['lon'],
'Denmark',
categories=DK_renewables['energy_source_level_2']
)
In [ ]:
DK_renewables.to_pickle('intermediate/DK_renewables.pickle')
del DK_renewables
The data which will be processed below is provided by the following data sources:
Ministry for Ecological and Inclusive Transition - Number of installations and installed capacity of the different renewable sources for every municipality in France. Data until 31/12/2017. As of 2020, this dataset is no longer maintained by the ministry and we refer to it as the old dataset.
ODRÉ - The Open Data Réseaux Énergies (ODRÉ, Open Data Networks for Energy) platform provides stakeholders with data around the themes of Production, Multi-energy Consumption, Storage, Mobility, Territories and Regions, Infrastructure, Markets and Meteorology. As of 2020, we refer to this dataset as the new dataset. It contains the data up to 31/12/2018.
OpenDataSoft - a list of French INSEE codes and corresponding coordinates, published under the Licence Ouverte (Etalab).
In [ ]:
# Download the data
filepaths = downloader.download_data_for_country('FR')
# Show the local paths
filepaths
In [ ]:
# Load the data
FR_re_filepath = filepaths['ODRE']
FR_re_df = pd.read_csv(FR_re_filepath,
sep=';',
parse_dates=['dateRaccordement', 'dateDeraccordement',
'dateMiseEnService', 'dateDebutVersion'],
infer_datetime_format=True)
# Make sure that the column dateDeraccordement is datetime
FR_re_df['dateDeraccordement'] = pd.to_datetime(FR_re_df['dateDeraccordement'], errors='coerce')
In [ ]:
# Choose the translation terms for France, create dictionary and show it
columnnames = pd.read_csv(os.path.join('input', 'column_translation_list.csv'))
idx_FR = columnnames[(columnnames['country'] == 'FR') & (columnnames['data_source'] == 'ODRE')].index
column_dict_FR = columnnames.loc[idx_FR].set_index('original_name')['opsd_name'].to_dict()
column_dict_FR
# Translate column names
FR_re_df.rename(columns=column_dict_FR, inplace=True)
In [ ]:
# Keep only the columns specified in the translation dictionary as we'll need only them
columns_to_keep = list(column_dict_FR.values())
FR_re_df = FR_re_df.loc[:, columns_to_keep]
FR_re_df.reset_index(drop=True, inplace=True)
# Show a pair of rows
FR_re_df.head(2)
In [ ]:
FR_re_df['data_source'] = 'OPEN DATA RESEAUX ENERGIES'
FR_re_df['as_of_year'] = 2018 # Year for which the dataset has been compiled by the data source
In [ ]:
# Choose the translation terms for France, create a dictionary and show it
valuenames = pd.read_csv(os.path.join('input', 'value_translation_list.csv'))
idx_FR = valuenames[(valuenames['country'] == 'FR') & (valuenames['data_source'] == 'ODRE')].index
value_dict_FR = valuenames.loc[idx_FR].set_index('original_name')['opsd_name'].to_dict()
value_dict_FR
In [ ]:
# Replace all original value names by the OPSD value names
FR_re_df.replace(value_dict_FR, inplace=True)
In [ ]:
# Check the columns
FR_re_df.isnull().all()
As we see above, no column contains only null values, so we do not need to drop any.
In [ ]:
# Check the rows
print('Is there a row containing only null values?')
FR_re_df.isnull().all(axis=1).any()
No row contains only null values, so there is no need to filter on that basis.
Now, we proceed with standardizing the energy types and technologies present in the data according to the OPSD energy hierarchy.
In [ ]:
FR_re_df[['energy_source_level_2', 'technology']].drop_duplicates()
In order to facilitate further processing, we can remove the rows that we know for sure we won't need. Those are the rows satisfying any of the following conditions:
- electrical_capacity is 0 or NaN,
- energy_source_level_2 corresponds to a non-renewable energy type (Non-renewable thermal, Non-hydraulic storage, Nuclear),
- technology indicates that a non-renewable technology is used at the facility (Steam turbine, Combustion cogeneration, Combustion engine, Combined cycle, Pumped storage, Piston motor, Nuclear fission),
- energy_source_level_2 is Other and technology is Other or NaN.
In [ ]:
# Define the mask for selecting rows with unusable info on electrical capacity
ec_mask = (FR_re_df['electrical_capacity'] == 0) | (FR_re_df['electrical_capacity'].isna())
# Define the mask for selecting the rows with non-renewable energy_source_level_2
non_renewable_esl2 = ['Non-renewable thermal', 'Non-hydraulic storage', 'Nuclear']
non_renewable_esl2_mask = FR_re_df['energy_source_level_2'].isin(non_renewable_esl2)
# Define the mask to select the rows with non-renewable technology
non_renewable_technologies = ['Steam turbine', 'Combustion cogeneration', 'Combustion engine',
'Combined cycle', 'Pumped storage', 'Piston motor', 'Nuclear fission']
non_renewable_technology_mask = FR_re_df['technology'].isin(non_renewable_technologies)
# Define the mask to select the rows without specified energy type and technology
other_mask = (FR_re_df['energy_source_level_2'] == 'Other') & \
((FR_re_df['technology'] == 'Other') | (pd.isnull(FR_re_df['technology'])))
# Combine the masks
drop_mask = ec_mask | non_renewable_esl2_mask | non_renewable_technology_mask | other_mask
# Show how many rows are going to be dropped
print('Dropping', drop_mask.sum(), 'rows out of', FR_re_df.shape[0])
# Keep all the rows not selected by the drop mask
keep_mask = ~drop_mask
FR_re_df = FR_re_df[keep_mask].reindex()
# Show some rows
print("A sample of the kept data:")
FR_re_df.sample(5)
Let us see the energy types and technologies present in the filtered data.
In [ ]:
FR_re_df[['energy_source_level_2', 'technology']].drop_duplicates()
First, let us standardize the values for energy source level 2 and technology.
- np.nan will indicate that technology should not be specified for the respective kind of sources according to the OPSD hierarchy.
- 'Other or unspecified technology' will mean that technology should be specified but it was unclear or missing in the original dataset.

That means that we need to apply the following correction rules to the current data:
- Other in the column technology should be replaced with Other or unspecified technology.
- If energy_source_level_2 is Marine, Geothermal, or Bioenergy, then technology should be set to np.nan regardless of what is specified in the data set.
- If energy_source_level_2 is Solar, Hydro, or Wind, and technology is NaN, then technology should be set to Other or unspecified technology.
- If energy_source_level_2 is Hydro and technology is Lake or Closed, then technology should be set to Other or unspecified technology.
- If energy_source_level_2 is Solar and technology is Thermodynamic, then technology should be set to Other or unspecified technology.
- If energy_source_level_2 is Other and technology is Photovoltaics, then energy_source_level_2 should be set to Solar.
In [ ]:
# Make sure that the proper string is used to indicate other or unspecified technology
FR_re_df['technology'].replace('Other', 'Other or unspecified technology', inplace=True)
# Define a function that will deal with other cases
def standardize(row):
"""Input"""
level_2 = row['energy_source_level_2']
technology = row['technology']
if level_2 in ['Marine', 'Geothermal', 'Bioenergy']:
technology = np.nan
elif level_2 in ['Solar', 'Hydro', 'Wind', 'Other'] and pd.isna(technology):
technology = 'Other or unspecified technology'
if level_2 == 'Hydro' and technology in ['Lake', 'Closed']:
technology = 'Other or unspecified technology'
elif level_2 == 'Solar' and technology == 'Thermodynamic':
technology = 'Other or unspecified technology'
elif level_2 == 'Other' and technology == 'Photovoltaics':
level_2 = 'Solar'
return [level_2, technology]
# Apply the rules coded in function standardize
FR_re_df[['energy_source_level_2', 'technology']] = FR_re_df.apply(standardize, axis=1, result_type='expand')
# Show the existing level 2 types and technologies
FR_re_df[['energy_source_level_2', 'technology']].drop_duplicates()
Let us now deal with the third level of the energy hierarchy. Only Bioenergy has a third level. Information on it can be found in the column energy_source_level_3 (whose original name was combustible).
In [ ]:
FR_re_df[['energy_source_level_2', 'energy_source_level_3']].drop_duplicates()
We see that only the following correction is needed: where energy_source_level_3 is Sewage and landfill gas and energy_source_level_2 is Other, set energy_source_level_2 to Bioenergy.
In [ ]:
index = (FR_re_df['energy_source_level_3'] == 'Sewage and landfill gas') & \
(FR_re_df['energy_source_level_2'] == 'Other')
FR_re_df.loc[index, 'energy_source_level_2'] = 'Bioenergy'
Finally, we declare all the plants as renewable and show the final hierarchy.
In [ ]:
# Assign energy_source_level_1 to the dataframe
FR_re_df['energy_source_level_1'] = 'Renewable energy'
# Show the hierarchy
FR_re_df[['energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology']].drop_duplicates()
In [ ]:
# Get the local path of the downloaded georeferencing data
FR_geo_filepath = filepaths['Opendatasoft']
# Read INSEE Code Data
FR_geo = pd.read_csv(FR_geo_filepath,
sep=';',
header=0,
converters={'Code_postal': str})
# Drop possible duplicates of the same INSEE code
FR_geo.drop_duplicates('INSEE_COM', keep='last', inplace=True)
In [ ]:
# create columns for latitude/longitude
lat = []
lon = []
# split in latitude/longitude
for row in FR_geo['Geo Point']:
try:
# Split tuple format
# into the column lat and lon
row = row.lstrip('(').rstrip(')')
lat.append(row.split(',')[0])
lon.append(row.split(',')[1])
except:
# set NAN
lat.append(np.NaN)
lon.append(np.NaN)
# add these columns to the INSEE DataFrame
FR_geo['lat'] = pd.to_numeric(lat)
FR_geo['lon'] = pd.to_numeric(lon)
In [ ]:
# Column names of merge key have to be named identically
FR_re_df.rename(columns={'municipality_code': 'INSEE_COM'}, inplace=True)
# Merge longitude and latitude columns by the Code INSEE
FR_re_df = FR_re_df.merge(FR_geo[['INSEE_COM', 'lat', 'lon']],
on=['INSEE_COM'],
how='left')
# Translate Code INSEE column back to municipality_code
FR_re_df.rename(columns={'INSEE_COM': 'municipality_code'}, inplace=True)
In [ ]:
#import importlib
#importlib.reload(util.nuts_converter)
#from util.nuts_converter import NUTSConverter
#nuts_converter = NUTSConverter(downloader, eurostat_eu_directory_path)
FR_postcode2nuts_path = filepaths['Eurostat']
FR_re_df = nuts_converter.add_nuts_information(FR_re_df, 'FR', FR_postcode2nuts_path,
lau_name_type='NATIONAL',
closest_approximation=True,
how=['municipality_code', 'latlon'])
# Report the number of facilities whose NUTS codes were successfully determined
determined = FR_re_df['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', FR_re_df.shape[0], 'facilities in FR.')
# Report the number of facilities whose NUTS codes could not be determined
not_determined = FR_re_df['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', FR_re_df.shape[0], 'facilities in FR.')
Let us now check the facilities without NUTS classification.
In [ ]:
# Check the facilities without NUTS classification
no_nuts = FR_re_df['nuts_1_region'].isnull()
# Find the masks where some information for finding the proper NUTS code is present
lat_or_lon_present = ~(FR_re_df['lat'].isna() & FR_re_df['lon'].isna())
municipality_code_present = ~(FR_re_df['municipality_code'].isnull())
municipality_name_present = ~(FR_re_df['municipality'].isnull())
# Show the cases where NUTS classification failed even though it shouldn't have
print('1. No NUTS code but latitude/longitude info present')
problematic_lat_lon = FR_re_df[no_nuts & lat_or_lon_present][['lat', 'lon']]
display(problematic_lat_lon)
print('2. No NUTS code but municipality code info present')
problematic_municipality_codes = FR_re_df[no_nuts & municipality_code_present]['municipality_code'].unique()
display(problematic_municipality_codes)
print('3. No NUTS code but municipality name info present')
problematic_municipality_names = FR_re_df[no_nuts & municipality_name_present]['municipality'].unique()
display(problematic_municipality_names)
We see that no row with known longitude and latitude was left unclassified.
What we also see is that some municipality codes did not translate to the corresponding NUTS codes. Further inspection shows that those codes are not present in the official NUTS translation tables.
In [ ]:
# Check if any problematic code is actually present in the translation table
present_any = False
for code in problematic_municipality_codes:
mask = nuts_converter.municipality2nuts_df['municipality_code'].str.match(code)
present_any = present_any or mask.any()
print(present_any)
We also see that problematic municipality names are either not present in the official translation tables or more than one municipality in the tables bears them.
In [ ]:
# Print only the names of those problematic municipalities, which appear in the translation table only once.
for name in problematic_municipality_names:
mask = nuts_converter.municipality2nuts_df['municipality'].str.match(name)
if mask.sum() == 1:
print(name)
Therefore, we can confirm that NUTS classification codes were determined with the highest success rate possible.
In [ ]:
FR_re_df['electrical_capacity'] = FR_re_df['electrical_capacity'] / 1000
In [ ]:
# Load the data
FR_re_filepath = filepaths['gouv.fr']
FR_re_df_old = pd.read_excel(FR_re_filepath,
sheet_name='Commune',
encoding='UTF8',
thousands='.',
decimals=',',
header=[3, 4],
skipfooter=9, # skip the summary rows
index_col=[0, 1], # required for MultiIndex
converters={'Code officiel géographique': str})
FR_re_df_old.tail()
This French data source contains the number of installations and the sum of installed capacity per energy source per municipality. The list is limited to the plants covered by a purchase-commitment agreement under Article 10 of the law of February 2000.
In [ ]:
# Rearrange data
FR_re_df_old.index.rename(['insee_com', 'municipality'], inplace=True)
FR_re_df_old.columns.rename(['energy_source_level_2', None], inplace=True)
FR_re_df_old = (FR_re_df_old
.stack(level='energy_source_level_2', dropna=False)
.reset_index(drop=False))
In [ ]:
# Choose the translation terms for France, create dictionary and show dictionary
idx_FR = columnnames[(columnnames['country'] == 'FR') & (columnnames['data_source'] == 'gouv.fr')].index
column_dict_FR = columnnames.loc[idx_FR].set_index('original_name')['opsd_name'].to_dict()
column_dict_FR
In [ ]:
# Translate columnnames
FR_re_df_old.rename(columns=column_dict_FR, inplace=True)
In [ ]:
# Drop all rows that contain NA
FR_re_df_old = FR_re_df_old.dropna()
FR_re_df_old.head(10)
In [ ]:
FR_re_df_old['data_source'] = 'Ministry for the Ecological and Inclusive Transition'
FR_re_df_old['as_of_year'] = 2017 # Year for which the dataset has been compiled by the data source
If the number of installations is less than 3, it is marked with an s instead of the number 1 or 2 due to statistical confidentiality (as explained by the data provider). Here, the s is changed to < 3. This is done in the same step as the other value translations of the energy sources.
In [ ]:
# Choose the translation terms for France, create dictionary and show dictionary
idx_FR = valuenames[(valuenames['country'] == 'FR') & (valuenames['data_source'] == 'gouv.fr')].index
value_dict_FR = valuenames.loc[idx_FR].set_index('original_name')['opsd_name'].to_dict()
value_dict_FR
In [ ]:
# Replace all original value names by the OPSD value names
FR_re_df_old.replace(value_dict_FR, inplace=True)
In [ ]:
# Create dictionary in order to assign energy_source to its subtype
energy_source_dict_FR = valuenames.loc[idx_FR].set_index(
'opsd_name')['energy_source_level_2'].to_dict()
# Column energy_source partly contains subtype information, thus this column is copied
# to new column for energy_source_subtype...
FR_re_df_old['technology'] = FR_re_df_old['energy_source_level_2']
# ...and the energy source subtype values in the energy_source column are replaced by
# the higher level classification
FR_re_df_old['energy_source_level_2'].replace(energy_source_dict_FR, inplace=True)
# Assign energy_source_level_1 to the dataframe
FR_re_df_old['energy_source_level_1'] = 'Renewable energy'
FR_re_df_old.reset_index(drop=True, inplace=True)
# Choose energy source level 2 entries where energy source level 2 is Bioenergy in order to
# separate Bioenergy subtypes into energy source level 3 and subtypes for the rest into technology
idx_FR_Bioenergy = FR_re_df_old[FR_re_df_old['energy_source_level_2'] == 'Bioenergy'].index
# Assign technology to energy source level 3 for all entries where energy source level 2 is
# Bioenergy and delete those entries from technology
FR_re_df_old.loc[idx_FR_Bioenergy, 'energy_source_level_3'] = FR_re_df_old.loc[idx_FR_Bioenergy, 'technology']
FR_re_df_old.loc[idx_FR_Bioenergy,'technology'] = np.nan
In [ ]:
# Column names of merge key have to be named identically
FR_re_df_old.rename(columns={'municipality_code': 'INSEE_COM'}, inplace=True)
# Merge longitude and latitude columns by the Code INSEE
FR_re_df_old = FR_re_df_old.merge(FR_geo[['INSEE_COM', 'lat', 'lon']],
on=['INSEE_COM'],
how='left')
# Translate Code INSEE column back to municipality_code
FR_re_df_old.rename(columns={'INSEE_COM': 'municipality_code'}, inplace=True)
In [ ]:
FR_postcode2nuts_path = filepaths['Eurostat']
FR_re_df_old = nuts_converter.add_nuts_information(FR_re_df_old, 'FR', FR_postcode2nuts_path,
how=['municipality_code', 'latlon'])
# how=['municipality', 'municipality_code', 'latlon']
# Report the number of facilities whose NUTS codes were successfully determined
determined = FR_re_df_old['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', FR_re_df_old.shape[0], 'facilities in FR.')
# Report the number of facilities whose NUTS codes could not be determined
not_determined = FR_re_df_old['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', FR_re_df_old.shape[0], 'facilities in FR.')
In [ ]:
# Show the facilities without NUTS classification
FR_re_df_old[FR_re_df_old['nuts_1_region'].isnull()]
As we can see, the NUTS codes were determined successfully for all the facilities in the dataset.
Some municipalities are not covered by the new data set provided by ODRE. Now, we find those municipalities and integrate them with the new data.
The only column present in the old data, but not in the new, is number_of_installations. Since the old data were aggregated at the municipality level, that column refers to the number of power plants in each municipality. Since the new data covers individual plants, setting number_of_installations to 1 for all the plants in the new data makes the two sets consistent with one another, so that we can concatenate them.
We will set site_name to 'Aggregated data for <municipality>' for all the rows from the old data, where <municipality> refers to the name of the municipality for which the row has been compiled.
In [ ]:
# For each column present in the new data's column space, but not in the old,
# add an empty column to the old data.
for new_column in FR_re_df.columns:
    if new_column not in FR_re_df_old.columns:
        FR_re_df_old[new_column] = np.nan
# Define the mask to select the municipalities from the old data, that are not covered
# by the new.
not_included = ~(FR_re_df_old['municipality_code'].isin(FR_re_df['municipality_code']))
FR_re_df_old[not_included]
# Add a dummy column to the new data frame
# representing the number of power plants (always 1)
FR_re_df['number_of_installations'] = 1
# Mark the old data rows as aggregations on municipality level.
FR_re_df_old['site_name'] = 'Aggregated data for ' + FR_re_df_old['municipality']
# Concatenate the new data with the old rows referring to the municipalities
# not covered by the new.
FR_re_df = pd.concat([FR_re_df, FR_re_df_old[not_included]], ignore_index=True, axis='index', sort=True)
In [ ]:
columns_to_keep = ['EIC_code', 'municipality_group_code', 'IRIS_code', 'as_of_year',
'commissioning_date', 'connection_date', 'data_source', 'departement',
'departement_code', 'disconnection_date',
'electrical_capacity', 'energy_source_level_1', 'energy_source_level_2',
'energy_source_level_3', 'lat', 'lon',
'municipality', 'municipality_code',
'municipality_group', 'number_of_installations', 'nuts_1_region',
'nuts_2_region', 'nuts_3_region', 'FR_region', 'FR_region_code', 'site_name',
'source_station_code', 'technology']
FR_re_df = FR_re_df[columns_to_keep]
FR_re_df.reset_index(drop=True, inplace=True)
In [ ]:
visualize_points(FR_re_df['lat'],
FR_re_df['lon'],
'France',
categories=FR_re_df['energy_source_level_2']
)
In [ ]:
FR_re_df.to_pickle('intermediate/FR_renewables.pickle')
del FR_re_df
The data which will be processed below is provided by the following data source:
Urzad Regulacji Energetyki (URE) / Energy Regulatory Office - Installed capacities of renewable-energy power plants in Poland. The plants are anonymized in the sense that no names, post codes or geographical coordinates are present. They are described by the type of energy they use, their installed capacity, and the województwo (province) and powiat (district) in which they are located.
In [ ]:
# Download the data
filepaths = downloader.download_data_for_country('PL')
# Get the local paths to the data files
PL_re_filepath = filepaths['Urzad Regulacji Energetyki']
PL_postcode2nuts_filepath = filepaths['Eurostat']
PL_geo_filepath = filepaths['Geonames']
In [ ]:
# Read the data into a pandas dataframe
PL_re_df = pd.read_excel(PL_re_filepath,
encoding='latin',
header=2,
skipfooter=14
)
# Show 5 random rows
PL_re_df.sample(n=5)
There are only five columns:
- Lp.: the ordinal number of the entry (power plant), effectively serving as its identification number.
- Województwo: the province (voivodeship) where the plant is located.
- Powiat: the district where the plant is located.
- Rodzaj_OZE: the code of the energy type the plant uses. According to the legend in the .xlsx file, the codes are as follows:
  - BG: biogas
  - BM: biomass
  - PVA: solar energy
  - WIL: wind energy
  - WO: hydroenergy
  - WS: using the technology of co-firing biomass, biogas or bioliquids with other fuels (fossil fuels and biomass / biogas / bioliquids)
- Moc zainstalowana [MW]: installed capacity (in MW).

The type corresponding to WS does not fit into the OPSD energy hierarchy, so we can drop such plants.
In [ ]:
# Get the mask for selecting the WS plants
ws_mask = PL_re_df['Rodzaj_OZE'] == 'WS'
# Drop them
print('Dropping', ws_mask.sum(), 'out of', PL_re_df.shape[0], 'power plants.')
PL_re_df.drop(PL_re_df.index[ws_mask], axis=0, inplace=True)
PL_re_df.reset_index(drop=True, inplace=True)
To ease the work, we can translate the columns' names to English using the OPSD translation tables.
In [ ]:
# Choose the translation terms for Poland, create and show the dictionary
columnnames = pd.read_csv(os.path.join('input', 'column_translation_list.csv'))
idx_PL = columnnames[(columnnames['country'] == 'PL') &
(columnnames['data_source'] == 'Urzad Regulacji Energetyki')].index
column_dict_PL = columnnames.loc[idx_PL].set_index('original_name')['opsd_name'].to_dict()
column_dict_PL
In [ ]:
# Translate column names
PL_re_df.rename(columns=column_dict_PL, inplace=True)
# Show a couple of rows
PL_re_df.head(2)
Let us do a few quick checks to see the state of the data:
- Are there any NA values?
- Are all values of electrical_capacity proper numbers?
- Are the values of energy_type (codes of energy types) consistent strings? Here we check if all the codes appear in one and only one form. For example, PVA is the code for solar energy and we want to make sure that only PVA appears in the column, not other variations such as pva, Pva etc.

We will need the answers to those questions to know how to proceed with processing.
In [ ]:
print('The number of missing values in the data:', PL_re_df.isna().sum().sum())
print('Are all capacities proper numbers?', PL_re_df['electrical_capacity'].dtype == 'float64')
print('What about the energy codes?', PL_re_df['energy_type'].unique())
In [ ]:
# Check the voivodeships
print('Show the names of the voivodeships.')
PL_re_df['region'].unique()
We can see that each name comes in two forms: (1) with the first letter capitalized and (2) with the first letter lowercase. One province is referred to by three different strings: 'Śląskie', 'śląskie', and 'śląskie ' (the last with a trailing white space). In order to standardize this column, we trim and capitalize all the strings appearing in it.
In [ ]:
PL_re_df['region'] = PL_re_df['region'].str.strip().str.capitalize()
PL_re_df['region'].unique()
Now, let us check the strings for districts (powiats).
In [ ]:
districts = PL_re_df['district'].unique()
districts.sort()
districts
As we see in the list, the same district can be referred to by more than one string. We identify the following ways a district is referred to in the dataset:
1. by the bare name (e.g. Kraków),
2. by the name with the prefix m. or m. st. added to form 1 (e.g. m. Kraków or m. st. Warszawy), and
3. by the corresponding possessive adjective (e.g. krakowski).
Some districts, such as Kraków, appear in all three forms, but there are those which appear in only two (e.g. Bytom and m. Bytom). This will pose a problem when we later try to assign the plants their NUTS codes. Furthermore, the NUTS translation tables do not map districts to the codes, but lower administrative units (municipalities) and postcodes to NUTS. We solve this issue at a later point in the notebook, in the section Georeferencing (NUTS classification), and not here, as it requires heavier processing than warranted during initial explorative analysis and lightweight cleaning of the data.
We note that the districts lipowski and hojnowski are misspelled, as they should actually be lipnowski and hajnowski, so we can correct the typos now.
In [ ]:
# Correct the typos
PL_re_df.loc[PL_re_df['district'] == 'lipowski', 'district'] = 'lipnowski'
PL_re_df.loc[PL_re_df['district'] == 'hojnowski', 'district'] = 'hajnowski'
In [ ]:
# Choose the translation terms for Poland, create dictionary
idx_PL = valuenames[valuenames['country'] == 'PL'].index
value_dict_PL = valuenames.loc[idx_PL].set_index('original_name')['opsd_name'].to_dict()
# Set energy source level 3
PL_re_df['energy_source_level_3'] = PL_re_df['energy_type'].replace(value_dict_PL)
# Create dictionnary in order to assign energy_source_level_2 to its subtype
idx_PL = valuenames[valuenames['country'] == 'PL'].index
energy_source_dict_PL = valuenames.loc[idx_PL].set_index('original_name')['energy_source_level_2'].to_dict()
# Add energy_source_level_2
PL_re_df['energy_source_level_2'] = PL_re_df['energy_type'].replace(energy_source_dict_PL)
# Standardize the values for technology
# 1. np.nan means that technology should not be specified for the respective kind of sources
# according to the hierarchy (http://open-power-system-data.org/2016-10-25-opsd_tree.svg)
# 2. 'Other or unspecified technology' means that technology should be specified
# but it was unclear or missing in the original dataset.
technology_translation_dictionary = {
'BG': np.nan,
'BM': np.nan,
'PVA': 'Other or unspecified technology', # Photovoltaics?
'WIL': 'Other or unspecified technology', # Onshore?
'WO': 'Other or unspecified technology', # Run-of-river
}
PL_re_df['technology'] = PL_re_df['energy_type'].replace(technology_translation_dictionary)
# Add energy_source_level_1
PL_re_df['energy_source_level_1'] = 'Renewable energy'
# Show the hierarchy of sources present in the dataset
PL_re_df[['energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology']].drop_duplicates().sort_values(by='energy_source_level_2')
We have already seen that the district names are not standardized and observed that we cannot use them directly to get the corresponding NUTS codes.
There is a way to get around this issue. We can do it as follows (see the code below):
1. Standardize the district names from the original data.
2. Match the standardized names with the district names used in the GeoNames postcode data for Poland.
3. For each district, pick a postcode from the matched GeoNames rows and use it to look up the NUTS codes with nuts_converter.
By inspection, we observe that all the district names in the zip have one of the following two forms:
- the bare name of the district's main town, or
- Powiat * where * is a possessive adjective.
So, we standardize all the strings in the district column as follows:
- If the string starts with m. or m. st., remove m. (or m. st.) from the beginning of the string.
- If the string is a possessive adjective, prepend Powiat (note the ending whitespace) to it.
In [ ]:
# Define the function to standardize district names from the original data
def standardize_districts(original_string):
if original_string[-1] == ',': # there is one district whose name ends with ','; that's a typo in the data
original_string = original_string[:-1]
if original_string.startswith('m. st. '):
return original_string[7:]
elif original_string.startswith('m. '):
return original_string[3:]
elif any([original_string.endswith(suffix) for suffix in ['ski', 'cki', 'zki']]):
return 'Powiat ' + original_string
else:
return original_string
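A quick check of these rules on two illustrative names (not an exhaustive test):
In [ ]:
# Illustrative check of the standardization rules defined above
print(standardize_districts('m. Kraków'))  # expected: 'Kraków'
print(standardize_districts('krakowski'))  # expected: 'Powiat krakowski'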
In [ ]:
# Get geo-information
zip_PL_geo = zipfile.ZipFile(PL_geo_filepath)
# Read generated postcode/location file
PL_geo = pd.read_csv(zip_PL_geo.open('PL.txt'), sep='\t', header=None)
# add column names as defined in associated readme file
PL_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
PL_geo.drop_duplicates('postcode', keep='last', inplace=True)
PL_geo['postcode'] = PL_geo['postcode'].astype(str)
# Get the names
geonames_districts = PL_geo['admin_name2'].unique()
# Show them
geonames_districts
In [ ]:
# Standardize the district names from the original data
PL_re_df['standardized_district'] = PL_re_df['district'].apply(standardize_districts)
standardized_districts = PL_re_df['standardized_district'].unique()
# Check which districts could not be found in the GeoNames data
#print(len([x for x in semi if x in geopowiats]), len([x for x in semi if x not in geopowiats]))
not_found = set(standardized_districts).difference(set(geonames_districts))
number_of_not_found = len(not_found)
total = len(standardized_districts)
print('{}/{} names could not be found. Those are:'.format(number_of_not_found, total))
print(not_found)
We can now apply a heuristic method for finding the corresponding name in the GeoNames data. It is based on similarity between strings. It turns out that it works fine, except for a couple of cases, which we deal with manually.
In [ ]:
# We define the similarity between two strings, string1 and string2,
# as the length of the longest prefix of string1 that appears in string2.
# Note 1: this measure of similarity is not necessarily symmetrical.
# Note 2: a prefix of a string is its substring that starts from the beginning of the string.
def calculate_similarity(string1, string2):
for n in range(len(string1), 1, -1):
prefix = string1[0:(n-1)]
if prefix in string2:
return len(prefix)
return 0
# Define a function to find, among a group of candidate strings,
# the most similar string to the one given as the reference string.
def find_the_most_similar(reference_string, candidate_strings):
the_most_similar = None
maximal_similarity = 0
for candidate_string in candidate_strings:
similarity = calculate_similarity(reference_string, candidate_string)
if similarity > maximal_similarity:
maximal_similarity = similarity
the_most_similar = candidate_string
return the_most_similar, maximal_similarity
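To get a feel for how this heuristic behaves, we can try it on a couple of made-up strings (purely illustrative, not taken from the dataset):
In [ ]:
# Illustrative check of the similarity heuristic on made-up names
print(calculate_similarity('Bytomski', 'Bytom'))  # the longest prefix of 'Bytomski' found in 'Bytom' is 'Bytom' itself
print(find_the_most_similar('Bytomski', ['Bytom', 'Bydgoszcz', 'Kraków']))  # should pick 'Bytom'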
In [ ]:
already_mapped = PL_re_df[['district', 'standardized_district']].drop_duplicates().to_dict(orient='records')
already_mapped = {mapping['district'] : mapping['standardized_district'] for mapping in already_mapped
if mapping['standardized_district'] in geonames_districts}
In [ ]:
# Make a dictionary to map each district from the original data to its GeoNames equivalent.
# The districts whose standardized versions have been found in the GeoNames data to their standardizations.
# The mappings for other districts will be found using the previously defined similarity measures.
districts_map = PL_re_df[['district', 'standardized_district']].drop_duplicates().to_dict(orient='records')
districts_map = {mapping['district'] : mapping['standardized_district'] for mapping in districts_map}
# Override the mappings for the 49 districts whose standardized names have not been found in the GeoNames data.
for district, standardized_district in districts_map.items():
#standardized_district = ['standardized_district']
if standardized_district not in geonames_districts:
#print('---------')
if standardized_district.startswith('Powiat'):
standardized_district = standardized_district[7:]
#print(district)
capitalized = standardized_district.capitalize()
lowercase = standardized_district.lower()
candidate1, similarity1 = find_the_most_similar(capitalized, geonames_districts)
candidate2, similarity2 = find_the_most_similar(lowercase, geonames_districts)
if similarity1 > similarity2:
districts_map[district] = candidate1
#print('\t', candidate1, similarity1)
elif similarity2 > similarity1:
districts_map[district] = candidate2
#print('\t', candidate2, similarity2)
else:
# Break the ties by mapping to the shorter string
if len(candidate1) < len(candidate2):
districts_map[district] = candidate1
#print('\t', candidate1, '|', candidate2, similarity1)
else:
districts_map[district] = candidate2
#print('\t', candidate2, '|', candidate1, similarity2)
# Apply the override to PL_re_df
PL_re_df['standardized_district'] = PL_re_df['district'].apply(lambda district: districts_map[district])
# Show the results
PL_re_df[['district', 'standardized_district']].drop_duplicates()
The following districts have not been mapped correctly: wołowski, m. Nowy Sącz and rzeszowski. Let us clear their mappings so that we can assign them their NUTS codes manually later.
In [ ]:
# Clear the mappings for wołowski, Nowy Sącz, rzeszowski, hojnowski.
for district in ['wołowski', 'm. Nowy Sącz', 'rzeszowski', 'hojnowski']:
districts_map[district] = ''
PL_re_df.loc[PL_re_df['district'] == district, 'standardized_district'] = ''
# For each mapping, select a postcode from the GeoNames data
df_dict = {'original' : [], 'geonames' : []}
for original_name in districts_map:
geonames_name = districts_map[original_name]
df_dict['original'].append(original_name)
df_dict['geonames'].append(geonames_name)
mapping_df = pd.DataFrame.from_dict(df_dict)
# To make sure that the selected postcodes do appear in the NUTS table,
# we drop, from PL_geo, all rows with the postcodes not in the postcode-to-NUTS table for Poland.
PL_table = nuts_converter.open_postcode2nuts(filepaths['Eurostat'])['CODE']
PL_geo = pd.merge(PL_geo, PL_table, how='inner', left_on='postcode', right_on='CODE')
PL_geo.drop(['CODE'], axis='columns', inplace=True)
#
merged = pd.merge(mapping_df,
PL_geo[['admin_name2', 'postcode']],
how='left',
left_on='geonames',
right_on='admin_name2')
# Rename the column postcode to make its meaning straightforward
merged.rename(columns={'postcode' : 'random_postcode'}, inplace=True)
merged = merged.drop_duplicates(['geonames'])
print(PL_re_df.shape)
PL_re_df = pd.merge(PL_re_df,
merged[['geonames', 'random_postcode']],
how='left',
left_on='standardized_district',
right_on='geonames')
# Show results
PL_re_df.head(2)
Show the rows for which we could not find postcodes.
In [ ]:
display(PL_re_df[PL_re_df['random_postcode'].isnull()])
PL_re_df['random_postcode'].isnull().sum()
There are only 17 such power plants, and all of them are located in the districts which we deliberately left out for manual classification.
In [ ]:
PL_postcode2nuts_path = filepaths['Eurostat']
PL_re_df = nuts_converter.add_nuts_information(PL_re_df, 'PL', PL_postcode2nuts_path,
postcode_column='random_postcode', how=['postcode'])
# Report the number of facilities whose NUTS codes were successfully determined
determined = PL_re_df['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', PL_re_df.shape[0], 'facilities in PL.')
# Manual assignments
manual_nuts3_map = {
'wołowski' : 'PL518',
'm. Nowy Sącz' : 'PL218',
'rzeszowski' : 'PL325'
}
for district in manual_nuts3_map:
nuts3 = manual_nuts3_map[district]
nuts2 = nuts3[:-1]
nuts1 = nuts3[:-2]
mask = (PL_re_df['district'] == district)
PL_re_df.loc[mask, ['nuts_1_region', 'nuts_2_region', 'nuts_3_region']] = [nuts1, nuts2, nuts3]
# Report the number of facilities whose NUTS codes could not be determined
not_determined = PL_re_df['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', PL_re_df.shape[0], 'facilities in PL.')
In [ ]:
PL_re_df['data_source'] = 'Urzad Regulacji Energetyki'
PL_re_df['as_of_year'] = 2019 # The year for which the dataset has been compiled by the data source
In [ ]:
# Choose which column to keep
PL_re_df = PL_re_df.loc[:, [ 'URE_id', 'region', 'district',
'nuts_1_region', 'nuts_2_region', 'nuts_3_region',
'electrical_capacity',
'energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3',
'technology',
'data_source', 'as_of_year']]
In [ ]:
PL_re_df.to_pickle('intermediate/PL_renewables.pickle')
del PL_re_df
The data which will be processed below is provided by the following data sources:
Swiss Federal Office of Energy - Data of all renewable power plants receiving "Kostendeckende Einspeisevergütung" (KEV), the Swiss feed-in tariff for renewable power plants. Geodata is based on municipality codes.
The available municipality code in the original data provides an approximation for the geocoordinates of the renewable power plants: the municipality code is matched against the postcode table to assign latitude and longitude coordinates.
geonames.org - The postcode data from Switzerland is provided by Geonames and licensed under a Creative Commons Attribution 3.0 license.
In [ ]:
# Download the data and get the local paths of the downloaded files
filepaths = downloader.download_data_for_country('CH')
CH_re_filepath = filepaths['BFE']
CH_geo_filepath = filepaths['Geonames']
CH_postcode2nuts_filepath = filepaths['Eurostat']
In [ ]:
# Get data of renewables per municipality
CH_re_df = pd.read_excel(CH_re_filepath,
sheet_name='KEV Bezüger 2018',
encoding='UTF8',
thousands='.',
decimal=','
#header=[0]
#skipfooter=9, # contains summarized values
#index_col=[0, 1], # required for MultiIndex
#converters={'Code officiel géographique':str}
)
In [ ]:
# Choose the translation terms for Switzerland, create dictionary and show dictionary
idx_CH = columnnames[columnnames['country'] == 'CH'].index
column_dict_CH = columnnames.loc[idx_CH].set_index('original_name')['opsd_name'].to_dict()
column_dict_CH
In [ ]:
# Translate columnnames
CH_re_df.columns = [column_name.replace("\n", "") for column_name in CH_re_df.columns]
CH_re_df.rename(columns=column_dict_CH, inplace=True)
In [ ]:
CH_re_df['data_source'] = 'BFE'
In [ ]:
# Choose the translation terms for Switzerland, create dictionary
idx_CH = valuenames[valuenames['country'] == 'CH'].index
value_dict_CH = valuenames.loc[idx_CH].set_index('original_name')['opsd_name'].to_dict()
Separate and assign energy source level 1-3 and technology
In [ ]:
# Assign energy_source_level_1 to the dataframe
CH_re_df['energy_source_level_1'] = 'Renewable energy'
In [ ]:
# Create dictionnary in order to assign energy_source to its subtype
#energy_source_dict_CH = valuenames.loc[idx_CH].set_index('opsd_name')['energy_source_level_2'].to_dict()
#
# ...and the energy source subtype values in the energy_source column are replaced by
# the higher level classification
#CH_re_df['energy_source_level_2'].replace(energy_source_dict_CH, inplace=True)
CH_re_df['energy_source_level_3'] = CH_re_df['technology']
# Create dictionnary in order to assign energy_source_level_2 to its subtype
idx_CH = valuenames[valuenames['country'] == 'CH'].index
energy_source_dict_CH = valuenames.loc[idx_CH].set_index('original_name')['energy_source_level_2'].to_dict()
# Add energy_source_level_2
CH_re_df['energy_source_level_2'] = CH_re_df['energy_source_level_2'].replace(energy_source_dict_CH)
# Translate values in order to standardize energy_source_level_3
value_dict_CH = valuenames.loc[idx_CH].set_index('original_name')['opsd_name'].to_dict()
CH_re_df['energy_source_level_3'].replace(value_dict_CH, inplace=True)
# Standardize the values for technology
# 1. np.nan means that technology should not be specified for the respective kind of sources
# according to the hierarchy (http://open-power-system-data.org/2016-10-25-opsd_tree.svg)
# 2. 'Other or unspecified technology' means that technology should be specified
# but it was unclear or missing in the original dataset.
technology_translation_dictionary = {
'Klärgasanlage': np.nan,
'Dampfprozess': 'Steam turbine',
'übrige Biomasse - WKK-Anlage': 'Other or unspecified technology',
'übrige Biomasse - Dampfprozess': 'Steam turbine',
'Schlammverbrennungsanlage': 'Combustion engine',
'WKK-Prozess': 'Other or unspecified technology',
'Kehrrichtverbrennungsanlage': 'Combustion engine',
'Integrierte Anlage': 'Photovoltaics',
'Angebaute Anlage': 'Photovoltaics',
'Freistehende Anlage': 'Photovoltaics',
'Trinkwasserkraftwerk': 'Other or unspecified technology',
'Durchlaufkraftwerk': 'Run-of-river',
'Dotierwasserkraftwerk': 'Other or unspecified technology',
'Ausleitkraftwerk': 'Other or unspecified technology',
'Wind Offshore': 'Other or unspecified technology',
'Abwasserkraftwerk': 'Other or unspecified technology',
'Unbekannt': 'Other or unspecified technology',
np.nan: 'Onshore',
None: 'Onshore'
}
CH_re_df['technology'].replace(technology_translation_dictionary, inplace=True)
# Add energy_source_level_1
CH_re_df['energy_source_level_1'] = 'Renewable energy'
# Show the hierarchy of sources present in the dataset
energy_columns = ['energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology']
CH_re_df[energy_columns].drop_duplicates().sort_values(by='energy_source_level_2')
The power plants with energy_source_level_3=Biomass and biogas and technology=Steam turbine do not belong to the renewable energy power plants, so we can remove them.
In [ ]:
drop_mask = (CH_re_df['energy_source_level_3'] == 'Biomass and biogas') & \
(CH_re_df['technology'] == 'Steam turbine')
drop_indices = drop_mask[drop_mask].index
CH_re_df.drop(drop_indices, axis='index', inplace=True)
In [ ]:
CH_re_df.reset_index(drop=True, inplace=True)
Replace the rest of the original terms with their OPSD equivalents
In [ ]:
CH_re_df.replace(value_dict_CH, inplace=True)
In [ ]:
# Get geo-information
zip_CH_geo = zipfile.ZipFile(CH_geo_filepath)
# Read generated postcode/location file
CH_geo = pd.read_csv(zip_CH_geo.open('CH.txt'), sep='\t', header=None)
# add column names as defined in associated readme file
CH_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
CH_geo.drop_duplicates('postcode', keep='last', inplace=True)
CH_geo['postcode'] = CH_geo['postcode'].astype(str)
In [ ]:
# harmonise data class
CH_geo.postcode = CH_geo.postcode.astype(int)
In [ ]:
# Add longitude/latitude infomation assigned by municipality code
CH_re_df = pd.merge(CH_re_df,
CH_geo[['lat', 'lon', 'postcode']],
left_on='municipality_code',
right_on='postcode',
how='left'
)
In [ ]:
zip_CH_geo.close()
In [ ]:
CH_postcode2nuts_path = filepaths['Eurostat']
# Use the string versions of postcode and municipality code columns
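# (after the left merge above, the postcode column is presumably float-typed, e.g. 8001.0, so the trailing '.0' is stripped with [:-2])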
CH_re_df['postcode_str'] = CH_re_df['postcode'].astype(str).str[:-2]
CH_re_df['municipality_code_str'] = CH_re_df['municipality_code'].astype(str)
CH_re_df = nuts_converter.add_nuts_information(CH_re_df, 'CH', CH_postcode2nuts_path,
postcode_column='postcode_str',
municipality_code_column='municipality_code_str',
lau_name_type='NATIONAL', how=['postcode', 'municipality'])
# Report the number of facilities whose NUTS codes were successfully determined
determined = CH_re_df['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', CH_re_df.shape[0], 'facilities in CH.')
# Report the number of facilities whose NUTS codes could not be determined
not_determined = CH_re_df['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', CH_re_df.shape[0], 'facilities in CH.')
Let us check the stations for which NUTS codes could not be determined.
In [ ]:
CH_re_df[CH_re_df['nuts_1_region'].isnull()][['postcode', 'municipality']]
In [ ]:
# Check the facilities without NUTS classification
no_nuts = CH_re_df['nuts_1_region'].isnull()
# Find the masks where some information for finding the proper NUTS code is present
municipality_name_present = ~(CH_re_df['municipality'].isnull())
# Show the cases where NUTS classification failed even though it shouldn't have
problematic_municipality_names = CH_re_df[no_nuts & municipality_name_present]['municipality'].unique()
print('Problematic municipalities:', ', '.join(list(problematic_municipality_names)) + '.')
print('Are those names present in the official NUTS tables for CH?')
if nuts_converter.municipality2nuts_df['municipality'].isin(problematic_municipality_names).any():
print('At least one is.')
else:
print('No, none is.')
We see that the municipalities of the only plants for which we could not determine the NUTS codes cannot be found in the official translation tables, so there was no way to assign them their NUTS classification codes.
In [ ]:
# kW to MW
CH_re_df['electrical_capacity'] /= 1000
# kWh to MWh
CH_re_df['production'] /= 1000
In [ ]:
columns_to_keep = ['project_name', 'energy_source_level_2','energy_source_level_3', 'technology',
'electrical_capacity', 'production', 'tariff', 'commissioning_date', 'contract_period_end',
'street', 'municipality_code', 'municipality', 'nuts_1_region', 'nuts_2_region',
'nuts_3_region', 'canton', 'company', 'title', 'surname', 'first_name', 'data_source',
'energy_source_level_1', 'lat', 'lon', 'postcode']
CH_re_df = CH_re_df.loc[:, columns_to_keep]
CH_re_df.reset_index(drop=True, inplace=True)
In [ ]:
visualize_points(CH_re_df['lat'],
CH_re_df['lon'],
'Switzerland',
categories=CH_re_df['energy_source_level_2']
)
In [ ]:
CH_re_df.to_pickle('intermediate/CH_renewables.pickle')
del CH_re_df
Checks and validation of the renewable power plant list, as well as the creation of the CSV/XLSX/SQLite files, can be found in Part 2 of this script. Part 2 also generates a daily time series of cumulative installed capacities by energy source.
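As a rough, purely illustrative sketch (not the actual Part 2 code), such a cumulative time series can be derived from the commissioning dates and capacities along the following lines; the toy dataframe stands in for any of the country frames produced in this notebook:
In [ ]:
# Toy sketch (illustrative only, not the Part 2 implementation):
# daily cumulative installed capacity per energy source
toy = pd.DataFrame({
    'commissioning_date': pd.to_datetime(['2018-01-01', '2018-01-01', '2018-01-03']),
    'energy_source_level_2': ['Wind', 'Solar', 'Wind'],
    'electrical_capacity': [2.0, 0.5, 3.0],
})
daily_added = (toy.groupby(['energy_source_level_2', 'commissioning_date'])['electrical_capacity']
                  .sum()
                  .unstack(level=0))
cumulative = daily_added.asfreq('D').fillna(0).cumsum()
cumulative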
The data for the UK are provided by the following sources:
UK Government Department of Business, Energy & Industrial Strategy (BEIS) - the data contain information on the UK renewable energy sources and are updated at the end of each quarter.
geonames.org - the data about latitudes and longitudes of the UK postcodes.
In [ ]:
# Download the data and get the local paths to the corresponding files
filepaths = downloader.download_data_for_country('UK')
UK_re_filepath = filepaths['BEIS']
UK_geo_filepath = filepaths['Geonames']
UK_postcode2nuts_filepath = filepaths['Eurostat']
In [ ]:
# Read the renewable powerplants data into a dataframe
UK_re_df = pd.read_csv(UK_re_filepath,
header=2,
encoding='latin1',
parse_dates=['Record Last Updated (dd/mm/yyyy)','Operational'],
infer_datetime_format=True,
thousands=','
)
# Drop empty columns and rows
UK_re_df.dropna(axis='index', how='all', inplace=True)
UK_re_df.dropna(axis='columns', how='all', inplace=True)
The downloaded dataset has to be cleaned:
- The column CHP Enabled contains five different strings: "No", "Yes", "no", "yes", and "No " with a trailing white space, even though they represent only two distinct values. So, we have to ensure a 1-to-1 mapping between the true values of a feature and their representations for all the features present in the set.
- The storage technologies Battery, Flywheels and Liquid Air Energy Storage are of no interest, so the facilities using them should be omitted.
In [ ]:
# Keep only operational facilities in the dataset
UK_re_df = UK_re_df.loc[UK_re_df["Development Status"] == "Operational"]
UK_re_df.reset_index(inplace=True, drop=True)
In [ ]:
# Standardize string columns
strip_and_lower = ['CHP Enabled']
strip_only = ['Country', 'County', 'Operator (or Applicant)', 'Mounting Type for Solar']
for column in strip_and_lower:
util.helper.standardize_column(UK_re_df, column, lower=True)
for column in strip_only:
util.helper.standardize_column(UK_re_df, column, lower=False)
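The helper util.helper.standardize_column is not shown in this notebook. Conceptually, it does something along the lines of the sketch below (trim surrounding whitespace and, when lower=True, lowercase the strings); the actual implementation in util/helper.py may differ in its details:
In [ ]:
# Hypothetical sketch of a standardize_column-style helper (the real one lives in util/helper.py)
def standardize_column_sketch(df, column, lower=True):
    # Trim surrounding whitespace from string values; NaNs are left untouched by the .str methods
    df[column] = df[column].str.strip()
    if lower:
        # Collapse case variants such as 'Yes', 'YES' and 'yes ' into a single representation
        df[column] = df[column].str.lower()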
In [ ]:
# Drop Flywheels, Battery and Liquid Air Energy Storage
UK_re_df = UK_re_df[~UK_re_df['Technology Type'].isin(['Flywheels', 'Battery', 'Liquid Air Energy Storage'])]
UK_re_df.reset_index(drop=True, inplace=True)
In [ ]:
# Copy the column "Technology Type" to a new column named "technology"
UK_re_df['technology'] = UK_re_df['Technology Type']
In [ ]:
# Choose the translation terms for the UK and create the translation dictionary
idx_UK = columnnames[columnnames['country'] == 'UK'].index
column_dict_UK = columnnames.loc[idx_UK].set_index('original_name')['opsd_name'].to_dict()
# Show the dictionary
column_dict_UK
In [ ]:
# Translate column names
UK_re_df.rename(columns=column_dict_UK, inplace=True)
In [ ]:
UK_re_df['data_source'] = 'BEIS'
In [ ]:
# Create dictionnary in order to assign energy_source_level_2 to its subtype
idx_UK = valuenames[valuenames['country'] == 'UK'].index
energy_source_dict_UK = valuenames.loc[idx_UK].set_index('original_name')['energy_source_level_2'].to_dict()
# Add energy_source_level_2
UK_re_df['energy_source_level_2'] = UK_re_df['energy_source_level_3'].replace(energy_source_dict_UK)
# Translate values in order to standardize energy_source_level_3
value_dict_UK = valuenames.loc[idx_UK].set_index('original_name')['opsd_name'].to_dict()
UK_re_df['energy_source_level_3'].replace(value_dict_UK, inplace=True)
# Standardize the values for technology
# 1. np.nan means that technology should not be specified for the respective kind of sources
# according to the hierarchy (http://open-power-system-data.org/2016-10-25-opsd_tree.svg)
# 2. 'Other or unspecified technology' means that technology should be specified
# but it was unclear or missing in the original dataset.
technology_translation_dictionary = {
'Biomass (co-firing)': 'Other or unspecified technology',
'Biomass (dedicated)': 'Other or unspecified technology',
'Advanced Conversion Technologies': 'Other or unspecified technology',
'Anaerobic Digestion': 'Other or unspecified technology',
'EfW Incineration': np.nan,
'Large Hydro': 'Other or unspecified technology',
'Small Hydro': 'Other or unspecified technology',
'Landfill Gas': np.nan,
'Solar Photovoltaics': 'Photovoltaics',
'Sewage Sludge Digestion': np.nan,
'Tidal Barrage and Tidal Stream': np.nan,
'Shoreline Wave': np.nan,
'Wind Offshore': 'Offshore',
'Wind Onshore': 'Onshore',
'Pumped Storage Hydroelectricity': 'Pumped storage'
}
UK_re_df['technology'].replace(technology_translation_dictionary, inplace=True)
# Add energy_source_level_1
UK_re_df['energy_source_level_1'] = 'Renewable energy'
# Show the hierarchy of sources present in the dataset
UK_re_df[['energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology']].drop_duplicates()
The facilities' location details comprise the address, county, region, country (England, Scotland, Wales, Northern Ireland), postcode, and the Easting (X) and Northing (Y) coordinates of each facility in the OSGB georeferencing system. To convert the easting and northing coordinates to standard WGS84 latitude and longitude, we use the bng_to_latlon package.
In [ ]:
# Define a wrapper for bng_to_latlon for handling None values
def to_lat_lon(easting, northing):
if pd.isnull(easting) or pd.isnull(northing):
return (None, None)
else:
return bng_to_latlon.OSGB36toWGS84(easting, northing)
# Convert easting and northing columns to numbers
UK_re_df['X-coordinate'] = pd.to_numeric(
UK_re_df['X-coordinate'].astype(str).str.replace(',', ''),
errors='coerce'
)
UK_re_df['Y-coordinate'] = pd.to_numeric(
UK_re_df['Y-coordinate'].astype(str).str.replace(',', ''),
errors='coerce'
)
# Convert easting and northing coordinates to standard latitude and longitude
latlon = UK_re_df.apply(lambda row: to_lat_lon(row["X-coordinate"], row["Y-coordinate"]),
axis=1
)
# Split a column of (latitude, longitude) pairs into two separate coordinate columns
latitude = latlon.apply(lambda x: x[0])
longitude = latlon.apply(lambda x: x[1])
# Add them to the dataframe
UK_re_df['latitude'] = latitude
UK_re_df['longitude'] = longitude
If the Easting and Northing coordinates of a facility are not provided, its latitude and longitude cannot be determined. For such sources, we look up the WGS84 coordinates in the geodataset provided by geonames.org, where the UK postcodes are paired with their latitudes and longitudes.
In [ ]:
# Get geo-information
zip_UK_geo = zipfile.ZipFile(UK_geo_filepath)
# Read generated postcode/location file
UK_geo = pd.read_csv(zip_UK_geo.open('GB_full.txt'), sep='\t', header=None)
# add column names as defined in associated readme file
UK_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
UK_geo.drop_duplicates('postcode', keep='last', inplace=True)
UK_geo['postcode'] = UK_geo['postcode'].astype(str)
In [ ]:
# Find the rows where latitude and longitude are unknown
missing_latlon_mask = UK_re_df['latitude'].isna() | UK_re_df['longitude'].isna()
missing_latlon = UK_re_df[missing_latlon_mask]
# Add longitude/latitude infomation assigned by post code
updated_latlon = pd.merge(missing_latlon,
UK_geo[['lat', 'lon', 'postcode']],
left_on='postcode',
right_on='postcode',
how='left'
)
# Return the updated rows to the original frame
UK_re_df = pd.merge(UK_re_df,
updated_latlon[['uk_beis_id', 'lat', 'lon']],
on='uk_beis_id',
how='left'
)
# Use the bng_to_latlon coordinates (columns: 'latitude' and 'longitude') if present,
# otherwise, use those obtained with UK_geo (columns: 'lat' and 'lon').
UK_re_df['longitude'] = UK_re_df.apply(lambda row: row['longitude'] if not pd.isnull(row['longitude'])
else row['lon'],
axis=1
)
UK_re_df['latitude'] = UK_re_df.apply(lambda row: row['latitude'] if not pd.isnull(row['latitude'])
else row['lat'],
axis=1
)
# Drop the UK_geo columns (lat/lon)
# as the information was moved to the 'latitude' and 'longitude' columns.
UK_re_df.drop(['lat', 'lon'], axis='columns', inplace=True)
In [ ]:
zip_UK_geo.close()
In the cases where the full post code was not present in geonames.org, we use its prefix to find the latitude / longitude pairs of locations covered by that prefix. Then, we approximate those facilities' locations by the centroids of their prefix areas.
In [ ]:
# Find the rows where latitude and longitude are unknown
missing_latlon_mask = UK_re_df['latitude'].isna() | UK_re_df['longitude'].isna()
missing_latlon = UK_re_df[missing_latlon_mask].copy()
missing_latlon = missing_latlon.reset_index()
# Determine their post code prefixes
prefixes = missing_latlon.apply(lambda row: str(row['postcode']).split(' ')[0],
axis=1
)
missing_latlon['Prefix'] = prefixes
# Determine the centroids of the areas covered by the prefixes
grouped_UK_geo = UK_geo.groupby(by=lambda i: str(UK_geo['postcode'].loc[i]).split(' ')[0])
# Assign the centroid coordinates to the facilities with unknown coordinates
updated_latlon = pd.merge(missing_latlon,
grouped_UK_geo.mean(),
left_on="Prefix",
right_index=True,
how="left"
)
# Return the updated rows to the original frame
UK_re_df = pd.merge(UK_re_df,
updated_latlon[['uk_beis_id', 'lat', 'lon']],
on='uk_beis_id',
how='left'
)
# Keep the already known coordinates (columns: 'latitude' and 'longitude') if present,
# otherwise, use those obtained by approximation (columns: 'lat' and 'lon').
UK_re_df['longitude'] = UK_re_df.apply(lambda row: row['longitude'] if not pd.isnull(row['longitude'])
else row['lon'],
axis=1
)
UK_re_df['latitude'] = UK_re_df.apply(lambda row: row['latitude'] if not pd.isnull(row['latitude'])
else row['lat'],
axis=1
)
# Drop the UK_geo columns (lat/lon)
# as the information was moved to the 'latitude' and 'longitude' columns.
UK_re_df.drop(['lat', 'lon'], axis='columns', inplace=True)
In [ ]:
UK_postcode2nuts_filepath = filepaths['Eurostat']
UK_re_df = nuts_converter.add_nuts_information(UK_re_df, 'UK', UK_postcode2nuts_filepath,
latitude_column='latitude',
longitude_column='longitude', closest_approximation=True,
lau_name_type='NATIONAL', how=['latlon', 'municipality'])
# Report the number of facilities whose NUTS codes were successfully determined
determined = UK_re_df['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', UK_re_df.shape[0], 'facilities in UK.')
# Report the number of facilities whose NUTS codes could not be determined
not_determined = UK_re_df['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', UK_re_df.shape[0], 'facilities in UK.')
Let us see the facilities for which the NUTS codes could not be determined.
In [ ]:
UK_re_df[UK_re_df['nuts_1_region'].isnull()]
There are only two such rows. The latitude and longitude coordinates, as well as municipality codes, are missing from the data set, so the NUTS codes could not be determined.
In [ ]:
visualize_points(UK_re_df['latitude'],
UK_re_df['longitude'],
'United Kingdom',
categories=UK_re_df['energy_source_level_2']
)
We see that some facilities appear to be located in the sea. Let us plot the original OSGB coordinates to see if translation to the standard longitude and latitude coordinates failed for some locations.
In [ ]:
max_X = UK_re_df['X-coordinate'].max()
min_X = UK_re_df['X-coordinate'].min()
max_Y = UK_re_df['Y-coordinate'].max()
min_Y = UK_re_df['Y-coordinate'].min()
figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
ax = plt.axes(projection=ccrs.OSGB())
ax.coastlines('10m')
ax.scatter(UK_re_df['X-coordinate'], UK_re_df['Y-coordinate'],s=0.5)
plt.show()
As we can see, the maps are basically the same, which confirms that translation to the longitude and latitude coordinates is done correctly and that they reflect the positions specified by the original X and Y OSGB coordinates.
In [ ]:
# Rename 'longitude' and 'latitude' to 'lon' and 'lat' to conform to the naming convention
# used for other countries.
UK_re_df.rename(columns={'longitude': 'lon', 'latitude': 'lat'}, inplace=True)
# Define the columns to keep
columns_of_interest = ['commissioning_date', 'uk_beis_id', 'operator', 'site_name',
'energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology',
'electrical_capacity', 'chp', 'support_robranding', 'support_fit', 'support_cfd',
'capacity_individual_turbine', 'number_of_turbines', 'solar_mounting_type',
'status', 'address', 'municipality', 'nuts_1_region', 'nuts_2_region', 'nuts_3_region',
'region', 'country', 'postcode', 'lon', 'lat', 'data_source'
]
for col in columns_of_interest:
if col not in UK_re_df.columns:
print(col)
# Clean the dataframe from columns other than those specified above
UK_re_df = UK_re_df.loc[:, columns_of_interest]
UK_re_df.reset_index(drop=True, inplace=True)
UK_re_df.columns
In [ ]:
UK_re_df.to_pickle('intermediate/UK_renewables.pickle')
The data for Sweden are provided by the following sources:
In [ ]:
# Download the data and get the local paths to the corresponding files
filepaths = downloader.download_data_for_country('SE')
print(filepaths)
SE_re_filepath = filepaths['Vindbrukskollen']
SE_geo_filepath = filepaths['Geonames']
SE_postcode2nuts_filepath = filepaths['Eurostat']
In [ ]:
# Define the function for converting the column "Senast sparads" to date type
#def from_int_to_date(int_date):
# print(int_date)
# str_date =str(int_date)
# year = str_date[:4]
# month = str_date[4:6]
# day = str_date[6:8]
# str_date = '{}/{}/{}'.format(year, month, day)
# return pd.to_datetime(str_date, format='%Y/%m/%d')
# Read the data
SE_re_df = pd.read_excel(SE_re_filepath,
sheet_name='Vindkraftverk',
na_values='-',
parse_dates=['Uppfört', 'Senast sparad'],
infer_datetime_format=True,
#converters={'Senast sparad' : from_int_to_date}
)
# Show 5 rows from the beginning
SE_re_df.head(5)
In [ ]:
SE_re_df.dtypes
In [ ]:
# Drop empty rows and columns
SE_re_df.dropna(axis='index', how='all', inplace=True)
SE_re_df.dropna(axis='columns', how='all', inplace=True)
In [ ]:
# Make sure that the column Uppfört is of the date type
SE_re_df['Uppfört'] = pd.to_datetime(SE_re_df['Uppfört'], format='%Y-%m-%d')
In [ ]:
# Keep only operational wind farms
subset_mask = SE_re_df['Status'].isin(['Beviljat', 'Uppfört'])
SE_re_df.drop(SE_re_df[~subset_mask].index, axis='index', inplace=True)
In [ ]:
# Remove the farms whose capacity is not known.
subset_mask = SE_re_df['Maxeffekt (MW)'].isna()
SE_re_df.drop(SE_re_df[subset_mask].index, axis='index', inplace=True)
In [ ]:
# Standardize string columns
string_columns = ['Modell', 'Fabrikat', 'Elområde', 'Kommun', 'Län', 'Handlingstyp', 'Placering']
for col in string_columns:
util.helper.standardize_column(SE_re_df, col, lower=False)
In [ ]:
# Choose the translation terms for Sweden and create the translation dictionary
idx_SE = columnnames[columnnames['country'] == 'SE'].index
column_dict_SE = columnnames.loc[idx_SE].set_index('original_name')['opsd_name'].to_dict()
# Show the dictionary
display(column_dict_SE)
In [ ]:
# Translate column names
SE_re_df.rename(columns=column_dict_SE, inplace=True)
In [ ]:
SE_re_df['data_source'] = 'Vindbrukskollen'
In [ ]:
# Choose the translation terms for Sweden
idx_SE = valuenames[valuenames['country'] == 'SE'].index
value_dict_SE = valuenames.loc[idx_SE].set_index('original_name')['opsd_name'].to_dict()
value_dict_SE
In [ ]:
# Replace all original value names by the OPSD value names
SE_re_df.replace(value_dict_SE, inplace=True)
# Set nans in the technology column to 'Unknown or unspecified technology'
SE_re_df['technology'].fillna('Unknown or unspecified technology', inplace=True)
In [ ]:
# Add energy level 2
SE_re_df['energy_source_level_2'] = 'Wind'
In [ ]:
# Add energy_source_level_1
SE_re_df['energy_source_level_1'] = 'Renewable energy'
# Show the hierarchy of sources present in the dataset
SE_re_df[['energy_source_level_1', 'energy_source_level_2', 'technology']].drop_duplicates()
The coordinates in the columns sweref99tm_north
and sweref99tm_east
are specified in the SWEREF 99 TM coordinate system, used in Sweden. To convert those coordinates to the usual WGS84 latitudes and longitudes, we use the function sweref99tm_latlon_transform
from the module util.helper
, provided by Jon Olauson.
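For reference only: since SWEREF 99 TM is EPSG:3006 and WGS84 is EPSG:4326, an equivalent conversion could also be done with pyproj. The cell below is purely an illustration of that alternative (it assumes pyproj is installed and is not used anywhere else in this notebook):
In [ ]:
# Illustrative alternative (not used here): SWEREF 99 TM (EPSG:3006) -> WGS84 (EPSG:4326) with pyproj
from pyproj import Transformer
transformer = Transformer.from_crs('EPSG:3006', 'EPSG:4326', always_xy=True)
# always_xy=True: input order is (easting, northing), output order is (lon, lat)
lon_example, lat_example = transformer.transform(674032.0, 6580822.0)  # made-up coordinates
print(lat_example, lon_example)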
In [ ]:
# Get latitude and longitude columns
lat, lon = util.helper.sweref99tm_latlon_transform(SE_re_df['sweref99tm_north'], SE_re_df['sweref99tm_east'])
# Include them in the dataframe
SE_re_df['lat'] = lat
SE_re_df['lon'] = lon
In [ ]:
SE_postcode2nuts_filepath = filepaths['Eurostat']
SE_re_df = nuts_converter.add_nuts_information(SE_re_df, 'SE', SE_postcode2nuts_filepath,
lau_name_type='NATIONAL', how=['municipality', 'latlon'])
# Report the number of facilities whose NUTS codes were successfully determined
determined = SE_re_df['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', SE_re_df.shape[0], 'facilities in SE.')
# Report the number of facilities whose NUTS codes could not be determined
not_determined = SE_re_df['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', SE_re_df.shape[0], 'facilities in SE.')
In [ ]:
# Define which columns should be kept
columns_to_keep = ['municipality', 'county', 'nuts_1_region', 'nuts_2_region', 'nuts_3_region', 'lat', 'lon',
'energy_source_level_1', 'energy_source_level_2', 'technology', 'se_vindbrukskollen_id',
'site_name', 'manufacturer',
'electrical_capacity', 'commissioning_date', 'data_source']
# Keep only the selected columns
SE_re_df = SE_re_df.loc[:, columns_to_keep]
In [ ]:
visualize_points(SE_re_df['lat'],
SE_re_df['lon'],
'Sweden',
categories=SE_re_df['technology']
)
In [ ]:
SE_re_df.reset_index(inplace=True, drop=True)
SE_re_df.to_pickle('intermediate/SE_renewables.pickle')
The data for the Czech Republic are provided by the following source:
Energetický regulační úřad (ERU) / Energy Regulatory Office - data on licensed power plants in the Czech Republic, scraped from the ERU's web pages.
Downloading the data from the original source may take 1-2 hours because it is done by scraping the information from HTML pages.
If downloading fails because of the ERU's server refusing connections, you can place an already downloaded eru.csv in the CZ input directory; alternatively, you can download the data from the OPSD server.
In [ ]:
# Download the data and get the local paths to the corresponding files
print('Start:', datetime.datetime.now())
downloader = Downloader(version, input_directory_path, source_list_filepath, download_from)
filepaths = downloader.download_data_for_country('CZ')
print('End:', datetime.datetime.now())
CZ_re_filepath = filepaths['ERU']
CZ_geo_filepath = filepaths['Geonames']
CZ_postcode2nuts_filepath = filepaths['Eurostat']
In [ ]:
# Define a converter for CZ postcode strings
def to_cz_postcode_format(postcode_str):
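    # e.g. '11000' -> '110 00' (Czech postcodes are conventionally written with a space after the third digit)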
return postcode_str[:3] + ' ' + postcode_str[3:]
# Read the data from the csv file
CZ_re_df = pd.read_csv(CZ_re_filepath,
escapechar='\\',
dtype = {
'number_of_sources' : int,
},
parse_dates=['licence_approval_date'],
infer_datetime_format=True,
converters = {
'site_postcode' : to_cz_postcode_format,
'holder_postcode' : to_cz_postcode_format
}
)
# Show a few rows
CZ_re_df.head(5)
Let's inspect the dataframe's columns:
In [ ]:
CZ_re_df.dtypes
It contains 30 columns:
- site_name, site_region, site_postcode, site_locality, site_district give us basic information on the site;
- megawatts_electric_total shows us the total electric capacity of the site;
- megawatts_electric_hydro, megawatts_electric_solar, megawatts_electric_biogas_and_biomass, megawatts_electric_wind, megawatts_electric_unspecified show us how the total capacity breaks down to those renewable types from the OPSD energy hierarchy;
- the megawatts_thermal_ columns represent the amount of input energy required (and will be equal to zero in most cases);
- watercourse and watercourse_length_km represent the name and length of the watercourse used by the site (if any);
- holder_name, holder_region, holder_address, holder_postcode, holder_locality, holder_district, holder_representative give us basic information on the site's owner;
- licence_number and licence_approval_date show us the licence number given to the holder and its approval date;
- link points to the ERU page with the site's data in HTML.
Since some sites use conventional types of energy, it is possible that megawatts_electric_total > megawatts_electric_hydro + megawatts_electric_solar + megawatts_electric_biogas_and_biomass + megawatts_electric_wind + megawatts_electric_unspecified. If the sum of renewable-energy capacities is equal to zero, that means that the corresponding row actually represents a conventional power plant, so it should be excluded.
Let us now check how many sites use how many types of renewable energy sources.
In [ ]:
mwe_columns = [col for col in CZ_re_df.columns if 'megawatts_electric' in col and col != 'megawatts_electric_total']
mwt_columns = [col for col in CZ_re_df.columns if 'megawatts_thermal' in col and col != 'megawatts_thermal_total']
def count_types(row):
global mwe_columns
different_types = sum([row[col] > 0 for col in mwe_columns])
return different_types
CZ_re_df.apply(count_types, axis=1).value_counts()
As of April 2020, as we can see in the output above, there are only 4 sites which use more than one type of renewable energy, and there are 193 sites which do not use renewable energy at all.
In [ ]:
# Drop empty columns and rows
CZ_re_df.dropna(axis='index', how='all', inplace=True)
CZ_re_df.dropna(axis='columns', how='all', inplace=True)
# Drop rows with no data on electrical capacity and the rows where total electrical capacity is 0
empty_mask = (CZ_re_df['megawatts_electric_total'] == 0) | (CZ_re_df['megawatts_electric_total'].isnull())
CZ_re_df = CZ_re_df.loc[~empty_mask]
CZ_re_df.reset_index(inplace=True, drop=True)
# Replace NANs with zeroes in mwe and mwt columns
replacement_dict = {col : 0 for col in mwe_columns + mwt_columns}
CZ_re_df.fillna(replacement_dict, inplace=True)
# Drop the rows where renewable-energy share of the total capacity is equal to zero
conventional_mask = (CZ_re_df['megawatts_electric_hydro'] +
CZ_re_df['megawatts_electric_solar'] +
CZ_re_df['megawatts_electric_biogas_and_biomass'] +
CZ_re_df['megawatts_electric_wind'] +
CZ_re_df['megawatts_electric_unspecified']) == 0
CZ_re_df = CZ_re_df.loc[~conventional_mask]
CZ_re_df.reset_index(inplace=True, drop=True)
There are sites which use different types of renewable sources to produce electric energy. Those are the sites where at least two of the following columns are not equal to zero: megawatts_electric_hydro, megawatts_electric_solar, megawatts_electric_biogas_and_biomass, megawatts_electric_wind, megawatts_electric_unspecified. Data that come in this shape are said to be in the so-called wide format. For the purpose of our later processing, it would be more convenient to have the data in which each row is associated with one and only one type of energy (the so-called long format). Therefore, we must first restructure our data from the wide format to the long format.
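As a toy illustration of the difference (not part of the actual processing below), a site with both hydro and solar capacity occupies one row in the wide format and two rows in the long format. One common way to go from wide to long is pandas' melt:
In [ ]:
# Toy example (illustrative only): the same data in wide and in long format
wide = pd.DataFrame({'site_name': ['Example site'],
                     'megawatts_electric_hydro': [1.2],
                     'megawatts_electric_solar': [0.3]})
long_format = wide.melt(id_vars='site_name', var_name='energy_type', value_name='electrical_capacity')
print(wide)
print(long_format)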
In [ ]:
# Define the function which will extract the data about the type of energy specified by the given column
# and return it as a dataframe in the "long format"
def select_and_reformat(df, column):
# Use the mwe and mwt columns defined above
global mwe_columns
global mwt_columns
# Declare the given column and its mwt counterpart as exceptions
mwt_exception = column.replace('electric', 'thermal')
exceptions = [column, mwt_exception]
# Exclude all the mwe and mwt columns which do not correspond to the given energy type
columns_to_skip = [col for col in mwe_columns + mwt_columns if col not in exceptions]
# Keep all the other columns
columns_to_keep = [col for col in df.columns if col not in columns_to_skip]
# Find the stations which use the given type of energy
selection_mask = (df[column] > 0)
# Keep them and select the columns we decided to keep
selection_df = df[selection_mask][columns_to_keep]
# Create a new column which will indicate the energy type
selection_df['energy_type'] = " ".join(column.split('_')[2:])
# Remove the energy type name from the columns representing electrical capacity
# and megawatts thermal
selection_df.rename(columns = {column : 'electrical_capacity',
mwt_exception : 'megawatts_thermal'},
inplace=True)
selection_df.drop(columns=['megawatts_electric_total', 'megawatts_thermal_total'],
inplace=True)
# Ensure the rows are properly indexed as 0,1,2,...
selection_df.reset_index(inplace=True, drop=True)
return selection_df
# Create a dataframe for each energy type
dataframes = []
for column in mwe_columns:
selection = select_and_reformat(CZ_re_df, column)
energy_type = selection['energy_type'].unique()[0]
dataframes.append(selection)
# Concatenate the dataframes
CZ_re_df = pd.concat(dataframes, ignore_index=False)
CZ_re_df.reset_index(inplace=True, drop=True)
Let us see what this restructured dataframe looks like.
In [ ]:
CZ_re_df
The number of columns has been reduced as we have transformed the data to the long format. The rows representing conventional power plants have been excluded. Since only a few sites use multiple types of energy, the total number of rows has not increased.
In [ ]:
# Choose the translation terms for CZ and create the translation dictionary
idx_CZ = columnnames[columnnames['country'] == 'CZ'].index
column_dict_CZ = columnnames.loc[idx_CZ].set_index('original_name')['opsd_name'].to_dict()
# Show the dictionary
column_dict_CZ
In [ ]:
# Translate column names
CZ_re_df.rename(columns=column_dict_CZ, inplace=True)
In [ ]:
# Choose the translation terms for Czech Republic
idx_CZ = valuenames[valuenames['country'] == 'CZ'].index
In [ ]:
# Choose the translation terms for energy source level 3
energy3_dict_CZ = valuenames.loc[idx_CZ].set_index('original_name')['opsd_name'].to_dict()
energy3_dict_CZ
# Add energy source level 3
CZ_re_df['energy_source_level_3'] = CZ_re_df['technology'].replace(energy3_dict_CZ)
# Choose the terms for energy source level 2
energy2_dict_CZ = valuenames.loc[idx_CZ].set_index('original_name')['energy_source_level_2'].to_dict()
CZ_re_df['energy_source_level_2'] = CZ_re_df['technology'].replace(energy2_dict_CZ)
# Standardize the values for technology
# 1. np.nan means that technology should not be specified for the respective kind of sources
# according to the hierarchy (http://open-power-system-data.org/2016-10-25-opsd_tree.svg)
# 2. 'Other or unspecified technology' means that technology should be specified
# but it was unclear or missing in the original dataset.
technology_dict = {
'biogas and biomass' : np.nan,
'wind' : 'Onshore',
'solar' : 'Other or unspecified technology',
'hydro' : 'Run-of-river',
'unspecified' : np.nan
}
CZ_re_df['technology'] = CZ_re_df['technology'].replace(technology_dict)
# Add energy_source_level_1
CZ_re_df['energy_source_level_1'] = 'Renewable energy'
# Show the hierarchy of sources present in the dataset
CZ_re_df[['energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology']].drop_duplicates()
In [ ]:
CZ_re_df['data_source'] = 'ERU'
In [ ]:
# Get geo-information
zip_CZ_geo = zipfile.ZipFile(CZ_geo_filepath)
# Read generated postcode/location file
CZ_geo = pd.read_csv(zip_CZ_geo.open('CZ.txt'), sep='\t', header=None)
# add column names as defined in associated readme file
CZ_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
CZ_geo.drop_duplicates('postcode', keep='last', inplace=True)
# Add longitude/latitude infomation assigned by postcode
CZ_re_df = pd.merge(CZ_re_df,
CZ_geo[['lat', 'lon', 'postcode']],
left_on='postcode',
right_on='postcode',
how='left'
)
In [ ]:
CZ_postcode2nuts_filepath = filepaths['Eurostat']
CZ_re_df = nuts_converter.add_nuts_information(CZ_re_df, 'CZ', CZ_postcode2nuts_filepath, how=['postcode'])
# Report the number of facilities whose NUTS codes were successfully determined
determined = CZ_re_df['nuts_1_region'].notnull().sum()
print('NUTS successfully determined for', determined, 'out of', CZ_re_df.shape[0], 'facilities in CZ.')
# Report the number of facilities whose NUTS codes could not be determined
not_determined = CZ_re_df['nuts_1_region'].isnull().sum()
print('NUTS could not be determined for', not_determined, 'out of', CZ_re_df.shape[0], 'facilities in CZ.')
In [ ]:
# Define which columns should be kept
columns_to_keep = ['site_name', 'region', 'municipality', 'locality', 'postcode',
'nuts_1_region', 'nuts_2_region', 'nuts_3_region', 'lat', 'lon',
'energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology',
'owner', 'electrical_capacity', 'data_source']
# Keep only the selected columns
CZ_re_df = CZ_re_df.loc[:, columns_to_keep]
In [ ]:
visualize_points(CZ_re_df['lat'],
CZ_re_df['lon'],
'Czechia',
categories=CZ_re_df['energy_source_level_2']
)
In [ ]:
CZ_re_df.reset_index(inplace=True, drop=True)
CZ_re_df.to_pickle('intermediate/CZ_renewables.pickle')
del CZ_re_df
In [ ]:
zip_archive = zipfile.ZipFile(input_directory_path + '.zip', 'w', zipfile.ZIP_DEFLATED)
print("Zipping the raw files...")
for filename in os.listdir(input_directory_path):
print("Adding", filename, "to the zip.")
filepath = os.path.join(input_directory_path, filename)
zip_archive.write(filepath)
zip_archive.close()
print("Done!")
#shutil.rmtree(input_directory_path)