Import of the Python modules needed to process the data and creation of the required output folders. Note: The package pyproj needs to be installed as it is not part of the standard Anaconda installation.
In [ ]:
import datetime
import json
import logging
import os
import urllib.parse
import urllib.request
import sqlite3
import zipfile
import io
import requests
import csv
import numpy as np
import pandas as pd
import pyproj # required for transforming coordinates
import yaml
import hashlib
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%d %b %Y %H:%M:%S'
)
logger = logging.getLogger()
# create download and output folder if they do not exist
os.makedirs(os.path.join('download'), exist_ok=True)
os.makedirs(os.path.join('output'), exist_ok=True)
os.makedirs(os.path.join('output', 'original_data'), exist_ok=True)
The original data can either be downloaded from the original data sources as specified below or from the OPSD server. The default option is to download from the original sources, as the aim of the project is to stay as close to the original sources as possible. However, if problems with the downloads occur, e.g. due to changed URLs, you can still run the script with the original data from the OPSD server.
In [ ]:
# Activate or uncomment to download from original data sources
download_from = 'original_sources'
# Activate or uncomment to download from existing OPSD versions
# download_from = 'opsd_server'
# version = '2016-10-27'
The meta data of the national data sources, which are required for downloading and importing the data, are laid down in the following section. The required meta data fields, which partly differ between data sources depending on the filetype used for import, are listed below.
CSV format
Country: Country ISO code
Name of source: Short name of source
url_initial: Initial URL on data source webpage
url_template: Direct URL to data file
filename: Name of data file without filetype extension
filename_opsd: Name of data file for OPSD download
'2016-10-27': Name of data file in previous OPSD version
filetype: Name of filetype (e.g. csv)
sep: Column separator (e.g. ',')
skiprows: Number of first rows to be skipped during import
decimal: Decimal separator (e.g. '.')
encoding: Type of encoding (e.g. 'utf-8')
Excel format (xls, xlsx)
Country: Country ISO code
Name of source: Short name of source
url_initial: Initial URL on data source webpage
url_template: Direct URL to data file
filename: Name of data file without filetype extension
filename_opsd: Name of data file for OPSD download
'2016-10-27': Name of data file in previous OPSD version
filetype: Name of filetype (e.g. xlsx)
sheetname: Name of sheet in workbook
skiprows: Number of first rows to be skipped during import
In [ ]:
conf = """
BE:
Elia:
url_initial: http://www.elia.be/en/grid-data/power-generation/generating-facilities
url_template: http://publications.elia.be/upload/ProductionParkOverview.xls?TS=20120416193815
filename: ProductionParkOverview
filename_opsd:
'2016-10-27': ProductionParkOverview
'2017-03-03': ProductionParkOverview
'2017-07-03': ProductionParkOverview
'2018-02-27': ProductionParkOverview
'2018-12-20': ProductionParkOverview
filetype: xls
sheetname: 'ProductionParkOverview'
skiprows: 1
NL:
Tennet_Q1:
url_initial: http://www.tennet.org/english/operational_management/export_data.aspx
url_template: http://www.tennet.org/english/operational_management/export_data.aspx?exporttype=installedcapacity&format=csv&quarter=2018-1&submit=3
filename: export_Q12018
filename_opsd:
'2016-10-27': export_Q12015
'2017-03-03': export_Q12016
'2017-07-03': export_Q12016
'2018-02-27': export_Q12017
'2018-12-20': export_Q12018
filetype: csv
sep: ','
skiprows: 0
decimal: '.'
encoding: 'utf-8'
Tennet_Q2:
url_initial: http://www.tennet.org/english/operational_management/export_data.aspx
url_template: http://www.tennet.org/english/operational_management/export_data.aspx?exporttype=installedcapacity&format=csv&quarter=2018-2&submit=3
filename: export_Q22018
filename_opsd:
'2016-10-27': export_Q22015
'2017-03-03': export_Q22016
'2017-07-03': export_Q22016
'2018-02-27': export_Q22017
'2018-12-20': export_Q22018
filetype: csv
sep: ','
skiprows: 0
decimal: '.'
encoding: 'utf-8'
Tennet_Q3:
url_initial: http://www.tennet.org/english/operational_management/export_data.aspx
url_template: http://www.tennet.org/english/operational_management/export_data.aspx?exporttype=installedcapacity&format=csv&quarter=2018-3&submit=3
filename: export_Q32018
filename_opsd:
'2016-10-27': export_Q32015
'2017-03-03': export_Q32016
'2017-07-03': export_Q32016
'2018-02-27': export_Q32017
'2018-12-20': export_Q32018
filetype: csv
sep: ','
skiprows: 0
decimal: '.'
encoding: 'utf-8'
Tennet_Q4:
url_initial: http://www.tennet.org/english/operational_management/export_data.aspx
url_template: http://www.tennet.org/english/operational_management/export_data.aspx?exporttype=installedcapacity&format=csv&quarter=2018-4&submit=3
filename: export_Q42018
filename_opsd:
'2016-10-27': export_Q42015
'2017-03-03': export_Q42016
'2017-07-03': export_Q42016
'2018-02-27': export_Q42017
'2018-12-20': export_Q42018
filetype: csv
sep: ','
skiprows: 0
decimal: '.'
encoding: 'utf-8'
IT:
Terna:
url_initial: http://www.terna.it/it-it/sistemaelettrico/transparencyreport/generation/installedgenerationcapacity.aspx
url_template: http://download.terna.it/terna/0000/0216/16.XLSX
filename: 16
filename_opsd:
'2016-10-27': 16
'2017-03-03': 16
'2017-07-03': 16
'2018-02-27': 16
'2018-12-20': 16
filetype: xls
sheetname: 'UPR PmaxOver 100MW'
skiprows: 0
FR:
RTE:
url_initial: https://clients.rte-france.com/lang/an/visiteurs/vie/prod/parc_reference.jsp
url_template: http://clients.rte-france.com/servlets/CodesEICServlet
filename: Centrales_production_reference
filename_opsd:
'2016-10-27': Centrales_production_reference
'2017-03-03': Centrales_production_reference
'2017-07-03': Centrales_production_reference
'2018-02-27': Centrales_production_reference
'2018-12-20': Centrales_production_reference
filetype: zip
sep: '\t'
skiprows: 2
decimal: ','
encoding: 'cp1252'
ES:
SEDE:
url_initial: https://sedeaplicaciones.minetur.gob.es/electra/BuscarDatos.aspx
url_template: http://www6.mityc.es/aplicaciones/electra/ElectraExp.csv.zip
filename: ElectraExp
filename_opsd:
'2016-10-27': ElectraExp
'2017-03-03': ElectraExp
'2017-07-03': ElectraExp
'2018-02-27': ElectraExp
'2018-12-20': ElectraExp
filetype: zip
sep: ';'
skiprows: 0
decimal: ','
encoding: 'utf-8'
FI:
EnergyAuthority:
url_initial: https://www.energiavirasto.fi/en/web/energy-authority/power-plant-register
url_template: https://www.energiavirasto.fi/documents/10191/0/Energiaviraston+voimalaitosrekisteri+29102018.xlsx/4b4b87f5-9eee-409d-836f-5b0040dba7de
filename: Energiaviraston+voimalaitosrekisteri+29102018
filename_opsd:
'2016-10-27': Energiaviraston+Voimalaitosrekisteri+040316
'2017-03-03': Energiaviraston+Voimalaitosrekisteri+010117
'2017-07-03': Energiaviraston+voimalaitosrekisteri+06072017
'2018-02-27': Energiaviraston+voimalaitosrekisteri+01022018
'2018-12-20': Energiaviraston+voimalaitosrekisteri+29102018
filetype: xlsx
sheetname: 'English'
skiprows: 1
PL:
GPI:
url_initial: http://gpi.tge.pl/en/wykaz-jednostek;jsessionid=C2472043DF326CED2F9C0840B503F5B0.gpi-app1
url_template: http://gpi.tge.pl/en/wykaz-jednostek?p_p_id=powerunits_WAR_powerunitsportlet&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_count=1
filename: units_list
filename_opsd:
'2016-10-27': units_list
'2017-03-03': units_list
'2017-07-03': units_list
'2018-02-27': units_list
'2018-12-20': units_list
filetype: csv
sep: ';'
skiprows: 0
decimal: '.'
encoding: 'utf-8'
UK:
GOV:
url_initial: https://www.gov.uk/government/statistics/electricity-chapter-5-digest-of-united-kingdom-energy-statistics-dukes
url_template: https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/558040/DUKES_5.10.xls
filename: DUKES_5.10
filename_opsd:
'2016-10-27': dukes5_10
'2017-03-03': dukes5_10
'2017-07-03': DUKES_5.10
'2018-02-27': DUKES_5.10
filetype: xls
sheetname: 'Database'
skiprows: 3
CZ:
CEPS:
url_initial: http://www.ceps.cz/en/all-data#AvailableCapacity
url_template: http://www.ceps.cz/en/all-data#AvailableCapacity
url_request1: http://www.ceps.cz/en/all-data?do=loadGraphData&method=AvailableCapacity&filter_data={"dateFrom":"2018-01-01 00:00:00","dateTo":"2018-12-20 23:59:59","version":"YF","agregation":"MI","function":"AVG"}&graph_id=1026&move_graph=year&download=xls
url_request2: http://www.ceps.cz/download-data/?format=txt
params:
format: txt
do: loadGraphData
method: AvailableCapacity
filter_data: {"dateFrom":"2018-01-01 00:00:00","dateTo":"2018-12-20 23:59:59","version":"YF","agregation":"MI","function":"AVG"}
download: txt
graph_id: 1026
move_graph: year
headers:
Host: www.ceps.cz
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0
Accept: application/json, text/javascript, */*; q=0.01
Accept-Language: de,en-US;q=0.7,en;q=0.3
Accept-Encoding: gzip, deflate
Referer: http://www.ceps.cz/en/all-data
X-Requested-With: XMLHttpRequest
Cookie: BIGipServer~produkce~WEB2017-80=rd1o00000000000000000000ffffac180a87o80
Connection: keep-alive
filename: data
filename_opsd:
'2016-10-27': Data
'2017-03-03': Data
'2017-07-03': Data
'2018-02-27': data
'2018-12-20': data
filetype: csv
sep: ','
skiprows: 0
decimal: '.'
encoding: 'utf-8'
CH:
BFE:
url_initial: http://www.bfe.admin.ch/themen/00490/00491/index.html?lang=de&dossier_id=01049
url_template: http://www.bfe.admin.ch/php/modules/publikationen/stream.php?extlang=de&name=de_496108515.zip&endung=Statistik%20der%20Wasserkraftanlagen%20der%20Schweiz
filename: 'Statistik der Wasserkraftanlagen der Schweiz 1.1.2018'
filename_opsd:
'2016-10-27': 'Statistik der Wasserkraftanlagen der Schweiz 1.1.2016'
'2017-03-03': 'Statistik der Wasserkraftanlagen der Schweiz 1.1.2016'
'2017-07-03': 'Statistik der Wasserkraftanlagen der Schweiz 1.1.2017'
'2018-02-27': 'Statistik der Wasserkraftanlagen der Schweiz 1.1.2017'
'2018-12-20': 'Statistik der Wasserkraftanlagen der Schweiz 1.1.2018'
filetype: zip
sheetname: 'Vorschau'
skiprows: 0
"""
conf = yaml.load(conf, Loader=yaml.SafeLoader)  # an explicit Loader is required by newer PyYAML versions
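For orientation, a single entry of the resulting nested dictionary can be inspected as follows (a minimal sketch; the keys are those defined in the configuration above):
In [ ]:
# Example: inspect the meta data of one data source (keys as defined in the configuration above)
print(conf['BE']['Elia']['url_template'])
print(conf['BE']['Elia']['filetype'])
print(conf['NL']['Tennet_Q1']['sep'])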
The download function standardizes the download process for the different national data sources. The required input parameters are the URL, the filename, and the filetype. The function downloads the corresponding data source and saves the file in the folder download. Additionally, the file is saved in the folder output/original_data together with the results of this processing script.
In [ ]:
def downloadandcache(url, filename, filetype):
"""This function downloads a file into a folder called
downloads and returns the local filepath."""
filename = str(filename)
filetype = filetype
now = datetime.datetime.now()
datestring = ""
datestring = str(now.year)+"-"+str(now.month)+"-"+str(now.day)
filepath = os.path.join(
'download', datestring+"-"+filename+"."+filetype)
filepath_original_data = os.path.join(
'output', 'original_data', filename+"."+filetype)
# Check if file exists, otherwise download it
if not os.path.exists(filepath):
print("Downloading file", filename+"."+filetype)
urllib.request.urlretrieve(url, filepath)
urllib.request.urlretrieve(url, filepath_original_data)
else:
print("Using local file from", filepath)
filepath = './'+filepath
return filepath
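For illustration, the function could be called directly for a single data source using the meta data from section 2.1 (a sketch only; the download loop below performs these calls for all sources):
In [ ]:
# Example call for the Belgian data source (sketch; the download loop below does this for all sources)
param_BE = conf['BE']['Elia']
filepath_BE = downloadandcache(param_BE['url_template'], param_BE['filename'], param_BE['filetype'])
print(filepath_BE)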
The import function reads the downloaded data files and imports them into a data frame. The function uses the meta data defined in section 2.1. Depending on the filetype, pandas-specific import functions are used to read the files.
In [ ]:
def importdata(country, tso):
"""This function imports the downloaded data
and returns a data frame for further processing."""
now = datetime.datetime.now()
datestring = ""
datestring = str(now.year)+"-"+str(now.month)+"-"+str(now.day)
param = conf[country][tso]
filepath = os.path.join('download',
datestring+"-"+str(
param['filename'])+"."+param['filetype'])
if param['filetype'] == 'csv':
data_import = pd.read_csv(filepath,
sep=param['sep'],
skiprows=param['skiprows'],
decimal=param['decimal'],
encoding=param['encoding'],
low_memory=False)
elif param['filetype'] == 'dbf':
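# note: Dbf5 is not imported above; it would require the simpledbf package (from simpledbf import Dbf5)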
dbf = Dbf5(filepath, codec=param['encoding'])
data_import = dbf.to_dataframe()
else:
data_import = pd.read_excel(filepath,
sheetname=param['sheetname'],
skiprows=param['skiprows'])
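# note: newer pandas versions use the keyword sheet_name instead of sheetname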
data_import['country'] = str(country)
data_import['source'] = str(tso)
return data_import
In [ ]:
# Hash function
def get_sha_hash(path, blocksize=65536):
sha_hasher = hashlib.sha256()
with open(path, 'rb') as f:
buffer = f.read(blocksize)
while len(buffer) > 0:
sha_hasher.update(buffer)
buffer = f.read(blocksize)
return sha_hasher.hexdigest()
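The hash function can, for example, be used to document the checksum of a downloaded original file (a sketch; the file referenced here only exists after the download loop below has been run):
In [ ]:
# Example: compute the SHA-256 checksum of a downloaded original file (sketch, file exists only after the download loop)
# print(get_sha_hash(os.path.join('output', 'original_data', 'ProductionParkOverview.xls')))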
To provide a standardized set of power plant information across all national data sources, a set of required columns is defined which is subsequently filled with the available data. The following columns and their structure are the basis for all national data sources.
Note: If information for a specific column is not available, the data entry is left empty. Conversely, if a national data source provides information not required by the scheme, this information is not processed.
In [ ]:
columns_sorted = ['name',
'company',
'street',
'postcode',
'city',
'country',
'capacity',
'energy_source',
'technology',
'chp',
'commissioned',
'type',
'lat',
'lon',
'eic_code',
'additional_info',
'comment',
'source']
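The reindex call used throughout the processing behaves as described above: columns missing from the data set are added with empty (NaN) entries and columns not contained in the template are dropped. A minimal sketch with dummy data:
In [ ]:
# Minimal sketch of the reindex behaviour with dummy data (not part of the actual processing)
demo = pd.DataFrame({'name': ['Plant A'], 'capacity': [100.0], 'unused_column': ['x']})
demo = demo.reindex(columns=columns_sorted)  # 'unused_column' is dropped, missing columns become NaN
demo.head()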
The subsequent download loop iterates over the different entries (countries and data sources) of the meta data file (section 2.1) and calls the previously defined download function, which is fed with the data-source-specific meta data. After downloading the data files, they are further processed so that they are ready for a direct import through pandas. In most cases, readable filetypes are provided (e.g. csv, xls, xlsx). However, the following exceptions need special attention: the Czech data (CEPS) has to be requested with specific request parameters and headers, and the French (RTE), Spanish (SEDE), and Swiss (BFE) data are provided as ZIP archives which are extracted into the folder download and, where necessary, renamed to the correct file extension.
In [ ]:
for country, sources in conf.items():
for tso, param in sources.items():
# Read and define input values for download function
if download_from == 'original_sources':
url_data = param['url_template']
filename_data = param['filename']
filetype_data = param['filetype']
if download_from == 'opsd_server':
url_data = ('http://data.open-power-system-data.org/conventional_power_plants/'
+ version + '/original_data/')
filename_data = param['filename_opsd'][version]
filetype_data = param['filetype']
url_data = (url_data + str(filename_data).replace(" ", "%20")
+ '.' + str(filetype_data))
print(url_data)
# Call of download function
if download_from == 'original_sources' and not tso == 'CEPS':
downloadandcache(url_data, filename_data, filetype_data)
# Special case for CZ
if download_from == 'original_sources' and tso == 'CEPS':
now = datetime.datetime.now()
datestring = ""
datestring = str(now.year)+"-"+str(now.month)+"-"+str(now.day)
filepath = os.path.join(
'download', datestring+"-"+filename_data+"."+filetype_data)
filepath_original_data = os.path.join(
'output', 'original_data', filename_data+"."+filetype_data)
# Check if file exists, otherwise download/create it
if not os.path.exists(filepath):
print("Downloading file", filename_data+"."+filetype_data)
res = requests.get(param['url_request1'], params=param['params'], headers=param['headers'])
res = requests.get(param['url_request2'], params=param['params'], headers=param['headers'])
fix = res.text
df = pd.read_csv(io.StringIO(fix), sep=";", skiprows=2)
df = df.drop(['Unnamed: 5', 'Date'], axis=1)
df.to_csv(filepath,index=False,encoding='utf-8')
df.to_csv(filepath_original_data,index=False,encoding='utf-8')
else:
print("Using local file from", filepath)
if download_from == 'opsd_server':
downloadandcache(url_data, filename_data, filetype_data)
# Treatment of special cases for FR, ES, and CH
now = datetime.datetime.now()
datestring = str(now.year) + "-" + str(now.month) + "-" + str(now.day)
filepath = os.path.join(
"download", datestring + "-" + str(filename_data))
# Special case FR (RTE): ZIP-file with corrupted xls-file is provided, which needs to be renamed to csv
if country == 'FR':
with zipfile.ZipFile(filepath + ".zip", "r") as O:
O.extractall("download")
if not os.path.exists(filepath + ".csv"):
os.rename(os.path.join("download", filename_data + ".xls"),
filepath + ".csv")
# change filetype from zip to csv
conf[country][tso]['filetype'] = "csv"
# Special case ES (SEDE): ZIP-file with csv file
if country == 'ES':
with zipfile.ZipFile(filepath + ".zip", "r") as O:
O.extractall("download")
if not os.path.exists(filepath + ".csv"):
os.rename(os.path.join("download", filename_data + ".csv"),
filepath + ".csv")
# change filetype from zip to csv
conf[country][tso]['filetype'] = "csv"
# Special case CH (BFE): ZIP-file with xlsx file
if country == 'CH':
with zipfile.ZipFile(filepath + ".zip", "r") as O:
O.extractall("download")
if not os.path.exists(filepath + ".xlsx"):
os.rename(os.path.join("download", filename_data + ".xlsx"),
filepath + ".xlsx")
# change filetype from zip to xlsx
conf[country][tso]['filetype'] = "xlsx"
# print(conf)
The data is provided by the Belgian transmission network operator ELIA. It encompasses a detailed list of Belgian generation units with comprehensive information on technologies and energy fuels.
In [ ]:
data_BE = importdata('BE', 'Elia')
data_BE.head()
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Translate columns
dict_columns_BE = {'ARP': 'company',
'Generation plant': 'name',
'Plant Type': 'technology',
'Technical Nominal Power (MW)': 'capacity',
'Remarks': 'comment',
'Fuel': 'energy_source',
'Country': 'country',
'Source': 'source'}
data_BE.rename(columns=dict_columns_BE, inplace=True)
# Apply general template of columns
data_BE = data_BE.reindex(columns=columns_sorted)
# Drop rows without capacity entries, so that row with
# "Unit connected to Distribution Grid" is dropped
data_BE = data_BE.dropna(subset=['capacity'])
# Adjust types of entries in all columns
data_BE.capacity = data_BE.capacity.astype(float)
data_BE.head()
The generation type provides information on the 'usage' of the power plants (besides electricity generation), i.e. whether the plant is an industrial power plant or provides thermal heat for district heating.
The Belgian data source only provides general information on the heat supply (here: WKK). Due to this general information, we classify the corresponding plants as both industrial and combined heat and power plants, and cannot distinguish between the two types.
In [ ]:
# Generate entries in column "type" according to technology "WKK"
data_BE.loc[data_BE['technology'] == 'WKK', 'type'] = 'CHP/IPP'
data_BE.loc[data_BE['name'].str.contains('WKK'), 'type'] = 'CHP/IPP'
# Generate entries in column "CHP" according to column "type"
data_BE.loc[(data_BE['type'] == 'CHP') |
(data_BE['type'] == 'IPP') |
(data_BE['type'] == 'CHP/IPP'), 'chp'] = 'Yes'
Overall translation of all technology types mentioned in the column "technology" and subsequent translation check.
In [ ]:
# Translate technologies
dict_technology_BE = {'GT': 'Gas turbine',
'BG': 'NaN',
'CL': 'Steam turbine',
'WKK': 'NaN',
'CCGT': 'Combined cycle',
'D': 'NaN',
'HU': 'NaN',
'IS': 'NaN',
'NU': 'Steam turbine',
'TJ': 'Gas turbine',
'WT': 'NaN',
' ': 'NaN',
'nan': 'NaN',
}
data_BE["technology"].replace(dict_technology_BE, inplace=True)
data_BE["technology"].replace('NaN', np.nan, inplace=True)
data_BE.loc[(data_BE['name'].str.contains(' ST') |
data_BE['name'].str.contains(' ST ')) &
((data_BE['technology'] == 'NaN') |
data_BE['technology'].isnull()), 'technology'] = 'Steam turbine'
data_BE.loc[(data_BE['name'].str.contains(' GT') |
data_BE['name'].str.contains(' GT ')) &
((data_BE['technology'] == 'NaN') |
data_BE['technology'].isnull()), 'technology'] = 'Gas turbine'
# Check if all technologies have been translated
for technology in data_BE["technology"].unique():
if (technology not in dict_technology_BE.values()) & (str(technology) != 'NaN'):
logger.error("Untranslated technology: " + str(technology))
data_BE.head()
Overall translation of all energy source types mentioned in the column "energy_source" and subsequent translation check. Deletion of rows containing "wind" as energy source.
In [ ]:
# Translate energy sources
dict_energysources_BE = {'BIO': 'Biomass and biogas',
'BF': 'Other fossil fuels',
'CL': 'Lignite',
'CP': 'Hard coal',
'CG': 'Other fossil fuels',
'GO': 'Oil',
'LF': 'Oil',
'LV': 'Oil',
'CP/BF': 'Mixed fossil fuels',
'CP/CG': 'Mixed fossil fuels',
'FA/BF': 'Mixed fossil fuels',
'NG/BF': 'Mixed fossil fuels',
'NG': 'Natural gas',
'NU': 'Nuclear',
'WR': 'Non-renewable waste',
'WA': 'Hydro',
'WI': 'Wind',
'WP': 'Biomass and biogas'}
data_BE["energy_source"].replace(dict_energysources_BE, inplace=True)
data_BE["energy_source"].replace('NaN', np.nan, inplace=True)
# Check if all energy sources have been translated
for energysource in data_BE["energy_source"].unique():
if (energysource not in dict_energysources_BE.values()) & (str(energysource) != "NaN"):
logger.error("Untranslated energy source: " + str(energysource))
# Delete unwanted energy source
data_BE = data_BE[data_BE.energy_source != 'Wind']
In this section a manually compiled list is used to define the geographic coordinates of individual power plants.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_BE.csv')
additional_data_BE = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate matching
data_BE = data_BE.merge(additional_data_BE,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_BE = data_BE.drop(colsToDrop, axis=1)
data_BE.head()
The data is provided by the Dutch transmission network operator TenneT. It encompasses the daily available generation capacity, i.e. a list of Dutch generation units that were operational on a specific day. The data is downloaded for all four quarters of 2018.
Import of quarterly data
In [ ]:
data_NL_Q1 = importdata('NL', 'Tennet_Q1')
data_NL_Q2 = importdata('NL', 'Tennet_Q2')
data_NL_Q3 = importdata('NL', 'Tennet_Q3')
data_NL_Q4 = importdata('NL', 'Tennet_Q4')
Merge quarterly data
In [ ]:
dataframes = [data_NL_Q1, data_NL_Q2, data_NL_Q3, data_NL_Q4]
data_NL = pd.concat(dataframes)
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Merge columns "street" and "Number" to one column called "Street"
data_NL['street'] = data_NL[['street', 'Number']].apply(
lambda x: '{} {}'.format(x[0], x[1]), axis=1)
# Drop columns not needed anymore
colsToDrop = ['Location', 'Date', 'Number']
data_NL = data_NL.drop(colsToDrop, axis=1)
# Rename columns
dict_columns_NL = {'Connected body': 'company',
'Entity': 'name',
'Fuel': 'energy_source',
'Capacity': 'capacity',
'zipcode': 'postcode',
'place-name': 'city'}
data_NL.rename(columns=dict_columns_NL, inplace=True)
# Adjust types of entries in all columns
data_NL.capacity = data_NL.capacity.astype(float)
Overall translation of all energy source types mentioned in the column "energy_source" and subsequent translation check. Generation of entries for the column "technology" according to information given in the column "energy_source" by TenneT.
In [ ]:
# Rename types of energy sources
dict_energysources_NL = {'E01': 'Solar',
'E02': 'Wind',
'E03': 'Hydro',
'E04': 'Biomass and biogas',
'E05': 'Hard coal',
'E06': 'Natural gas',
'E07': 'Oil',
'E08': 'Nuclear',
'E09': 'Other or unspecified energy sources'}
data_NL["energy_source"].replace(dict_energysources_NL, inplace=True)
# Check if all energy sources have been translated
for energysource in data_NL["energy_source"].unique():
if (energysource not in dict_energysources_NL.values()) & (str(energysource) != "NaN"):
logger.error("Not renamed energy source: " + str(energysource))
# Generate technology entry according to energy source
data_NL.loc[data_NL['energy_source'] == 'Nuclear',
'technology'] = 'Steam turbine'
data_NL.loc[data_NL['energy_source'] == 'Hard coal',
'technology'] = 'Steam turbine'
# Delete unwanted energy sources in column "energy_source"
data_NL = data_NL[data_NL.energy_source != 'Solar']
data_NL = data_NL[data_NL.energy_source != 'Wind']
Adjustment of the capacity entry for the row relating to the power plant named "Rijnmond II".
In [ ]:
# Show data entries for Rijnmond II
# data_NL.loc[data_NL['name'] == 'Rijnmond II','capacity']
In [ ]:
# Data for power plant 'Rijnmond II' are daily total capacity
if download_from == 'opsd_server':
if version == '2016-10-27':
data_NL['capacity_new'] = (data_NL['capacity']/24).where(
data_NL.name == 'Rijnmond II')
data_NL.loc[data_NL[
'name'] == 'Rijnmond II', 'capacity'] = data_NL.loc[data_NL[
'name'] == 'Rijnmond II', 'capacity_new']
data_NL = data_NL.drop(['capacity_new'], axis=1)
We estimate the installed capacity by the highest available daily capacity for each unit.
In [ ]:
# Filter rows by considering "name" and maximum "capacity"
data_NL = data_NL.sort_values(
'capacity', ascending=False).groupby('name', as_index=False).first()
# Apply general template of columns
data_NL = data_NL.reindex(columns=columns_sorted)
In this section a manually compiled list is used to define the geographic coordinates of individual power plants.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_NL.csv')
additional_data_NL = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate matching
data_NL = data_NL.merge(additional_data_NL,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_NL = data_NL.drop(colsToDrop, axis=1)
data_NL.head()
The data is provided by the French transmission network operator RTE. It encompasses a detailed list of French generation units with a capacity of more than 100 MW.
In [ ]:
data_FR = importdata('FR', 'RTE')
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Translate columns
dict_columns_FR = {'Type': 'energy_source',
'Nom de la centrale de production': 'name',
'Capacité de production Installée (MW)': 'capacity',
'Date de création': 'commissioned',
'Country': 'country',
'Source': 'source'}
data_FR.rename(columns=dict_columns_FR, inplace=True)
# Apply general template of columns
data_FR = data_FR.reindex(columns=columns_sorted)
# Delete unwanted row by referring to column "Name"
data_FR = data_FR.dropna(subset=['name'])
# Delete place holder datetime
data_FR["commissioned"].replace('01/01/2000', np.nan, inplace=True)
# Define commissioning year
data_FR['commissioned'] = pd.to_datetime(
data_FR['commissioned'], format='%d/%m/%Y')
data_FR['commissioned'] = pd.DatetimeIndex(
data_FR['commissioned']).year
# Adjust types of entries in all columns
data_FR.capacity = data_FR.capacity.astype(float)
The list comprises duplicate entries for specific power plants. If a plant has been commissioned, decommissioned, or retrofitted, it is listed twice with differing installed capacities and commissioning years. In the following, the first data entry is chosen as it appears to be the most recent one.
Note: The commissioning or decommissioning year is currently not adjusted. A comment is set in the corresponding column 'comment'.
In [ ]:
# Show duplicate data entries
data_FR[data_FR.duplicated(subset='name', keep=False)]
In [ ]:
# Set comment for duplicate entries
data_FR.loc[data_FR.duplicated(
subset='name',
keep=False),
'commissioned'] = np.nan
data_FR.loc[data_FR.duplicated(
subset='name',
keep=False),
'comment'] = 'Commissioning year not reported due to duplicate entries'
# Drop last duplicate
data_FR = data_FR.drop_duplicates(subset='name', keep='first')
Generation of entries for technologies. Overall translation of all energy source types mentioned in the column "energy_source" and subsequent translation check.
In [ ]:
# Generate technology entries according to energy sources
data_FR.loc[data_FR['energy_source'] == 'Hydraulique STEP',
'technology'] = 'Pumped storage'
data_FR.loc[data_FR['energy_source'] == '''Hydraulique fil de l'eau / éclusée''',
'technology'] = 'Run-of-river'
data_FR.loc[data_FR['energy_source'] == 'Hydraulique lacs',
'technology'] = 'Reservoir'
data_FR.loc[data_FR['energy_source'] == 'Nucléaire',
'technology'] = 'Steam turbine'
data_FR.loc[data_FR['energy_source'] == 'Charbon',
'technology'] = 'Steam turbine'
# Translate types of energy sources
dict_energysources_FR = {'Autre': 'Other or unspecified energy sources',
'Charbon': 'Hard coal',
'Fioul': 'Oil',
'Gaz': 'Natural gas',
'Hydraulique STEP': 'Hydro',
'''Hydraulique fil de l'eau / éclusée''': 'Hydro',
'Hydraulique lacs': 'Hydro',
'Marin': 'Marine',
'Nucléaire': 'Nuclear',
'Biomasse': 'Biomass and biogas'}
data_FR["energy_source"].replace(dict_energysources_FR, inplace=True)
# Check if all energy sources have been translated
for energysource in data_FR["energy_source"].unique():
if (energysource not in dict_energysources_FR.values()) & (str(energysource) != "NaN"):
logger.error("Untranslated energy source: " + str(energysource))
# Delete unwanted energy sources in column "energy_source"
data_FR = data_FR[data_FR.energy_source != 'Wind']
data_FR = data_FR[data_FR.energy_source != 'Solar']
data_FR = data_FR[data_FR.energy_source != 'Marine']
In this section a manually compiled list is used to define the geographic coordinates of individual power plants.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_FR.csv')
additional_data_FR = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate matching
data_FR = data_FR.merge(additional_data_FR,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_FR = data_FR.drop(colsToDrop, axis=1)
data_FR.head()
The data is provided by the Polish Power Exchange GPI. It encompasses a detailed list of large Polish generation units with information on individual power plant blocks.
In [ ]:
data_PL = importdata('PL', 'GPI')
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Rename first column
data_PL.columns.values[0] = 'company'
# Rename columns
dict_columns_PL = {'Generating unit name': 'name',
'Comments': 'comment',
'Available capacity [MW]': 'capacity',
'Basic fuel': 'energy_source',
'Country': 'country',
'Source': 'source'}
data_PL.rename(columns=dict_columns_PL, inplace=True)
# Fill columns "energy_source" and "company" with the belonging entries
cols = ['energy_source', 'company']
data_PL[cols] = data_PL[cols].ffill()
# Delete empty and therefore unwanted rows by referring to column "Generating unit code"
data_PL = data_PL.dropna(subset=['Generating unit code'])
# Apply general template of columns
data_PL = data_PL.reindex(columns=columns_sorted)
# Adjust types of entries in all columns
data_PL.capacity = data_PL.capacity.astype(float)
Overall translation of all energy source types mentioned in the column "energy_source" and subsequent translation check.
In [ ]:
# Rename energy sources types
dict_energysources_PL = {'Brown coal': 'Lignite',
'Black coal': 'Hard coal',
'Water': 'Hydro',
'Natural gas': 'Natural gas',
}
data_PL["energy_source"].replace(dict_energysources_PL, inplace=True)
# Check if all energy sources have been translated
for energysource in data_PL["energy_source"].unique():
if (energysource not in dict_energysources_PL.values()) & (str(energysource) != "NaN"):
logger.error("Not renamed energy source: " + str(energysource))
Generation of entries for the column "technology" according to information given in the column "energy_source".
In [ ]:
# Generate entries in column "technology" according to energy source "hydro"
data_PL.loc[data_PL['energy_source'] == 'Hydro', 'technology'] = 'Pumped storage'
In this section a manually compiled list is used to define the geographic coordinates of individual power plants.
In [ ]:
# Access the second list
filepath = os.path.join('input', 'input_plant_locations_PL.csv')
additional_data_PL = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate merger
data_PL = data_PL.merge(additional_data_PL,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_PL = data_PL.drop(colsToDrop, axis=1)
data_PL.head()
The data is provided by the Czech transmission network operator CEPS. It encompasses the daily available capacity reported by the transmission system operator.
In [ ]:
data_CZ = importdata('CZ', 'CEPS')
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Merge columns "Power plant" and "Generating unit" to one column called "Name"
data_CZ['name'] = data_CZ[['Power plant', 'Generating unit']].apply(
lambda x: '{} {}'.format(x[0], x[1]), axis=1)
# Rename columns
dict_columns_CZ = {'Available capacity [MW]': 'capacity',
'Type of source': 'technology',
'Country': 'country',
}
data_CZ.rename(columns=dict_columns_CZ, inplace=True)
# Filter rows by considering "name" and maximum "capacity"
data_CZ = data_CZ.sort_values(
'capacity', ascending=False).groupby('name', as_index=False).first()
# Apply general template of columns
data_CZ = data_CZ.reindex(columns=columns_sorted)
# Adjust types of entries in all columns
data_CZ.capacity = data_CZ.capacity.astype(float)
Generation of entries in the column "energy_source" according to the column "technology". Overall translation of all technology types mentioned in the column "technology" and subsequent translation check.
In [ ]:
# Generate entries in column "energy_source" according to column "technology"
data_CZ.loc[data_CZ['technology'] == 'Jaderná elektrárna',
'energy_source'] = 'Nuclear'
data_CZ.loc[data_CZ['technology'] == 'Přečerpávací vodní elektrárna',
'energy_source'] = 'Hydro'
data_CZ.loc[data_CZ['technology'] == 'Parní elektrárna',
'energy_source'] = 'Other or unspecified energy sources'
data_CZ.loc[data_CZ['technology'] == 'Paroplynová elektrárna',
'energy_source'] = 'Other or unspecified energy sources'
# Translate technologies
dict_technologies_CZ = {'Přečerpávací vodní elektrárna': 'Pumped storage',
'Parní elektrárna': 'Steam turbine',
'Jaderná elektrárna': 'Steam turbine',
'Paroplynová elektrárna': 'Combined cycle'}
data_CZ["technology"].replace(dict_technologies_CZ, inplace=True)
# Check if all technologies have been translated
for technology in data_CZ["technology"].unique():
if (technology not in dict_technologies_CZ.values()) & (str(technology) != "NaN"):
logger.error("Untranslated technology: " + str(technology))
In this section a manually compiled list is used to define the geographic coordinates of individual power plants.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_CZ.csv')
additional_data_CZ = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate merger
data_CZ = data_CZ.merge(additional_data_CZ,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_CZ = data_CZ.drop(colsToDrop, axis=1)
data_CZ.head()
The data is provided by the Swiss Ministry of Energy BFE. It encompasses a detailed list of Swiss hydro generation units with comprehensive information on technical specifications.
Hydro power
In [ ]:
data_CH = importdata('CH', 'BFE')
Nuclear power plants
In [ ]:
filepath = os.path.join('input', 'input_plant-list_CH_conventional.csv')
data_nuclear_CH = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
Consolidation of columns
In [ ]:
# Merge columns "ZE-Erste Inbetriebnahme" and "ZE-Letzte Inbetriebnahme" to one column called "Commissioned"
data_CH['commissioned'] = data_CH[
['ZE-Erste Inbetriebnahme', 'ZE-Letzte Inbetriebnahme']].apply(
lambda x: max(x[0], x[1]), axis=1)
# Merge columns "Bemerkung (1) - (10)" to one column "Comment"
data_CH['comment'] = data_CH[['Bemerkung (1)',
'Bemerkung (2)',
'Bemerkung (3)',
'Bemerkung (4)',
'Bemerkung (5)',
'Bemerkung (6)',
'Bemerkung (7)',
'Bemerkung (8)',
'Bemerkung (9)',
'Bemerkung (10)']].apply(
lambda x:
'{}; {}; {}; {}; {}; {}; {}; {}; {}; {}'.format(
x[0],
x[1],
x[2],
x[3],
x[4],
x[5],
x[6],
x[7],
x[8],
x[9]), axis=1)
data_CH['comment'] = data_CH['comment'].str.replace('nan; ', '')
data_CH['comment'] = data_CH['comment'].str.replace('nan', '')
Transformation of geographical hydro power data to WGS84 projection
In [ ]:
# LatLon with WGS84 datum used by GPS units and Google Earth
wgs84 = pyproj.Proj("+init=EPSG:4326")
# CH1903 projection used in BFE-Data
ch1903 = pyproj.Proj("+init=EPSG:21781")
data_CH[['lon', 'lat']] = data_CH[
['ZE-Koordinaten unscharf (Nord)',
'ZE-Koordinaten unscharf (Ost)']].apply(
lambda row: pyproj.transform(
ch1903, wgs84, row[1], row[0]), axis=1).apply(pd.Series)
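Note: The '+init=' syntax and pyproj.transform used above are deprecated in pyproj 2 and later. An equivalent transformation can be written with pyproj.Transformer (a sketch, assuming pyproj >= 2.1):
In [ ]:
# Sketch of the same CH1903 (EPSG:21781) -> WGS84 (EPSG:4326) transformation with the newer pyproj API
transformer = pyproj.Transformer.from_crs("EPSG:21781", "EPSG:4326", always_xy=True)
# data_CH[['lon', 'lat']] = data_CH[
#     ['ZE-Koordinaten unscharf (Nord)',
#      'ZE-Koordinaten unscharf (Ost)']].apply(
#     lambda row: transformer.transform(row[1], row[0]), axis=1).apply(pd.Series)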
Translation and harmonization of columns
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Translate columns
dict_columns_CH = {'WKA-Name': 'name',
'ZE-Standort': 'city',
'WKA-Typ': 'technology',
'ZE-Status': 'availability',
'Inst. Turbinenleistung': 'capacity'}
data_CH.rename(columns=dict_columns_CH, inplace=True)
# Adjust type of entries in column "Capacity"
data_CH.capacity = data_CH.capacity.astype(float)
# Adjust availabilities
dict_availabilities_CH = {'im Normalbetrieb': '1',
'im Bau': '0',
'im Umbau': '0',
'stillgelegt': '0'}
data_CH["availability"].replace(dict_availabilities_CH, inplace=True)
# List only operating plants
data_CH = data_CH[data_CH.availability != '0']
# Apply general template of columns
data_CH = data_CH.reindex(columns=columns_sorted)
In [ ]:
# Set energy source to "hydro"
data_CH['energy_source'] = 'Hydro'
# Adjust technologies
dict_technologies_CH = {'L': 'Run-of-river',
'S': 'Reservoir',
'P': 'Pumped storage with natural inflow',
'U': 'Pumped storage'}
data_CH["technology"].replace(dict_technologies_CH, inplace=True)
In this section a manually compiled list is used to define the geographic coordinates of individual hydro power plants.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_CH.csv')
additional_data_CH = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate merger
data_CH = data_CH.merge(additional_data_CH,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_CH = data_CH.drop(colsToDrop, axis=1)
In [ ]:
data_CH = data_CH.append(data_nuclear_CH)
data_CH.head()
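Note: DataFrame.append has been removed in recent pandas versions; the same result can be obtained with pd.concat (a sketch):
In [ ]:
# Equivalent to the append call above for newer pandas versions (sketch, not executed here)
# data_CH = pd.concat([data_CH, data_nuclear_CH])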
The data is provided by the Italian transmission network operator TERNA. It encompasses a detailed list of Italian generation units with a capacity of more than 100 MW.
In [ ]:
data_IT = importdata('IT', 'Terna')
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Translate columns
dict_columns_IT = {'Descrizione Impianto': 'name',
'TIPOLOGIA': 'energy_source',
'Comune': 'city',
'PMAX [MW]': 'capacity',
'Country': 'country',
'Source': 'source',
'Zona': 'additional_info'}
data_IT.rename(columns=dict_columns_IT, inplace=True)
# Apply general template of columns
data_IT = data_IT.reindex(columns=columns_sorted)
# Keep geographical zone information in the column "additional_info"
data_IT['additional_info'] = data_IT[['additional_info']].apply(
lambda x: 'Zone: {}'.format(x[0]), axis=1)
# Adjust types of entries in all columns
data_IT.capacity = data_IT.capacity.astype(float)
Overall translation of all energy source types mentioned in the column "energy_source" and subsequent translation check. Deletion of rows containing "Wind" and "Geothermal" as energy source.
In [ ]:
# Translate types of energy sources
dict_energysources_IT = {'GEOTERMICO': 'Geothermal',
'TERMOELETTRICO': 'Fossil fuels',
'IDROELETTRICO': 'Hydro',
'EOLICO': 'Wind'}
data_IT["energy_source"].replace(dict_energysources_IT, inplace=True)
# Check if all energy sources have been translated
for energysource in data_IT["energy_source"].unique():
if (energysource not in dict_energysources_IT.values()) & (str(energysource) != "NaN"):
logger.error("Untranslated energy source: " + str(energysource))
# Delete unwanted energy sources in column "energy_source"
data_IT = data_IT[data_IT.energy_source != 'Wind']
data_IT = data_IT[data_IT.energy_source != 'Geothermal']
In this section a manually compiled list is used to define the geographic coordinates of individual power plants.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_IT.csv')
additional_data_IT = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate merger
data_IT = data_IT.merge(additional_data_IT,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_IT = data_IT.drop(colsToDrop, axis=1)
data_IT.head()
The data is provided by the Finnish Energy Authority. It encompasses a detailed list of Finnish generation units of at least one megavolt ampere [1 MVA].
In [ ]:
data_FI = importdata('FI', 'EnergyAuthority')
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Generate entries in column "CHP"
data_FI.loc[data_FI[
'Combined Heat and Power Production, Industry,Maximum, Total, MW'] > 0,
'chp'] = 'Yes'
data_FI.loc[data_FI[
'Combined Heat and Power Production, District Heating, Total, MW'] > 0,
'chp'] = 'Yes'
# Drop columns not needed anymore
colsToDrop = ['Business ID',
'Location',
'Separate power production, Maximum, Hour, MW',
'Separate power production, Decomissioned, Hour, MW',
'Combined Heat and Power Production, Industry,Maximum, Total, MW',
'Combined Heat and Power Production, Industry,Hour, Total, MW',
'Combined Heat and Power Production, Industry, Decomissioned, Total, MW',
'Combined Heat and Power Production, District Heating, Total, MW',
'Combined Heat and Power Production, District Heating, Hour, MW',
'Combined Heat and Power Production, District Heating, Decomissioned, Total, MW',
'Separate power production, Maximum, Total, MW',
'Hour, total, MW',
'Decomissioned, Total, MW',
'Standby fuel ',
'Standby fuel']
data_FI = data_FI.drop(colsToDrop, axis=1)
# Rename columns
dict_columns_FI = {'Name': 'name',
'Company': 'company',
'Type': 'type',
'Address': 'street',
'Town': 'city',
'Postal code': 'postcode',
'Maximum, total, MW': 'capacity',
'Main fuel': 'energy_source',
'Country': 'country',
'Source': 'source'}
data_FI.rename(columns=dict_columns_FI, inplace=True)
# Apply general template of columns
data_FI = data_FI.reindex(columns=columns_sorted)
# Adjust types of entries in all columns
data_FI.capacity = data_FI.capacity.astype(float)
Overall translation of all energy source types mentioned in the column "energy_source" and subsequent translation check. Generation of entries for the column "energy_source" according to information given in the column "type".
In [ ]:
# Rename types of energy sources
dict_energysources_FI = {'Biogas': 'Biomass and biogas',
'Black liquor and concentrated liquors': 'Biomass and biogas',
'Blast furnace gas': 'Other fossil fuels',
'By-products from wood processing industry': 'Biomass and biogas',
'Exothermic heat from industry': 'Other or unspecified energy sources',
'Forest fuelwood': 'Biomass and biogas',
'Gasified waste': 'Non-renewable waste',
'Hard coal and anthracite': 'Hard coal',
'Heavy distillates': 'Oil',
'Industrial wood residues': 'Biomass and biogas',
'Light distillates': 'Oil',
'Medium heavy distillates': 'Oil',
'Mixed fuels': 'Mixed fossil fuels',
'Natural gas': 'Natural gas',
'Nuclear energy': 'Nuclear',
'Other by-products and wastes used as fuel': 'Other fossil fuels',
'Other non-specified energy sources': 'Other or unspecified energy sources',
'Peat': 'Biomass and biogas',
' ': 'Other or unspecified energy sources',
np.nan: 'Other or unspecified energy sources'}
data_FI["energy_source"].replace(dict_energysources_FI, inplace=True)
data_FI["energy_source"].replace('NaN', np.nan, inplace=True)
# Check if all energy sources have been translated
for energysource in data_FI["energy_source"].unique():
if (energysource not in dict_energysources_FI.values()) & (str(energysource) != "NaN"):
logger.error("Untranslated energy source: " + str(energysource))
# Generate entries in column "energy_sources" for hydro and wind stations according to column "type"
data_FI.loc[data_FI['type'] == 'Hydro power', 'energy_source'] = 'Hydro'
data_FI.loc[data_FI['type'] == 'Wind power', 'energy_source'] = 'Wind'
Generation of entries for the column "technology" according to information given in the column "energy_source". Deletion of rows containing "wind" as energy source.
In [ ]:
# Generate entries in column "technology" according to column "energy_source"
data_FI.loc[data_FI['energy_source'] == 'Nuclear',
'technology'] = 'Steam turbine'
data_FI.loc[data_FI['energy_source'] == 'Hard coal',
'technology'] = 'Steam turbine'
# Delete unwanted energy source (wind) in column "energy_source"
data_FI = data_FI[data_FI.energy_source != 'Wind']
Overall translation of all types mentioned in the column "type" and subsequent translation check.
In [ ]:
# Rename types
dict_types_FI = {'District heating CHP': 'CHP',
'Hydro power': 'NaN',
'Industry CHP': 'IPP',
'Nuclear energy': 'NaN',
'Separate electricity production': 'NaN',
'Wind power': 'NaN'}
data_FI["type"].replace(dict_types_FI, inplace=True)
data_FI["type"].replace('NaN', np.nan, inplace=True)
In this section a manually compiled list is used to define the geographic coordinates of individual power plants.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_FI.csv')
additional_data_FI = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate merger
data_FI = data_FI.merge(additional_data_FI,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_FI = data_FI.drop(colsToDrop, axis=1)
data_FI.head()
The data is provided by the Spanish SEDE - Ministry of Industry, Energy and Tourism. It encompasses a detailed list of Spanish generation units with comprehensive information on technologies and energy fuels.
In [ ]:
data_ES = importdata('ES', 'SEDE')
Overall adjustment of all columns within the dataframe: translation, addition, deletion, and sorting of columns as well as adjustment of the column entries' types. Conversion of the entries in the column "capacity" (corresponding to the net capacity in the original data set) from kW to MW.
In [ ]:
# Delete unwanted regions
data_ES = data_ES[data_ES.Autonomia != 'Ceuta']
data_ES = data_ES[data_ES.Autonomia != 'Melilla']
data_ES = data_ES[data_ES.Autonomia != 'Canarias']
data_ES = data_ES[data_ES.Autonomia != 'Baleares']
# Delete unwanted energy source
data_ES = data_ES[data_ES.Tecnologia != 'Fotovoltaica']
# Drop columns not needed anymore
colsToDrop = ['N. Orden',
'Tipo Regimen',
'Autonomia',
'Provincia',
'F. Alta',
'F. Baja',
'F. Alta Provicional',
'Alta Registro']
data_ES = data_ES.drop(colsToDrop, axis=1)
# Rename columns
dict_columns_ES = {'Titular': 'company',
'Nombre de la instalacion': 'name',
'Municipio': 'city',
'Tecnologia': 'technology',
'Comment': 'comment',
'Potencia Neta': 'capacity',
'Potencia Bruta': 'gross_capacity',
'Combustible': 'energy_source',
'F. Puesta En Servicio': 'commissioned',
'Country': 'country',
'Source': 'source'}
data_ES.rename(columns=dict_columns_ES, inplace=True)
# Check if all columns have been renamed
for columnnames in data_ES.columns:
if columnnames not in dict_columns_ES.values():
logger.error("Not renamed column: " + columnnames)
# Apply general template of columns
data_ES = data_ES.reindex(columns=columns_sorted)
# Change unit of column 'capacity' from kW to MW
data_ES.capacity = data_ES.capacity.astype(float)
data_ES['capacity'] = (data_ES['capacity'] / 1000)
# Define commissioning year
data_ES['commissioned'] = pd.to_datetime(
data_ES['commissioned'], format='%d/%m/%Y')
data_ES['commissioned'] = pd.DatetimeIndex(
data_ES['commissioned']).year
Overall translation of all energy source types mentioned in the column "energy_source" and subsequent translation check.
In [ ]:
dict_energysources_ES = {'Biocombustibles liquidos': 'Biomass and biogas',
'Biogas': 'Biomass and biogas',
'Biogas de digestion': 'Biomass and biogas',
'Biogas de vertedero': 'Biomass and biogas',
'Biomasa industrial agricola': 'Biomass and biogas',
'Biomasa industrial forestal': 'Biomass and biogas',
'Biomasa primaria': 'Biomass and biogas',
'Calor residual': 'Other or unspecified energy sources',
'Carbon': 'Hard coal',
'CARBON IMPORTADO': 'Hard coal',
'Cultivos energeticos agricolas o forestales': 'Biomass and biogas',
'DIESEL': 'Oil',
'Energias residuales': 'Non-renewable waste',
'Fuel': 'Oil',
'FUEL-OIL 0,3': 'Oil',
'FUELOLEO': 'Oil',
'GAS DE REFINERIA': 'Natural gas',
'Gas natural': 'Natural gas',
'GAS NATURAL': 'Natural gas',
'Gas residual': 'Natural gas',
'Gasoleo': 'Oil',
'GASOLEO': 'Oil',
'HULLA+ANTRACITA': 'Hard coal',
'Licores negros': 'Biomass and biogas',
'LIGNITO NEGRO': 'Lignite',
'LIGNITO PARDO': 'Lignite',
'NUCLEAR': 'Nuclear',
'Propano': 'Natural gas',
'Residuo aprovechamiento forestal o selvicola': 'Other bioenergy and renewable waste',
'Residuos': 'Non-renewable waste',
'Residuos actividad agricolas o jardineria': 'Other bioenergy and renewable waste',
'Residuos industriales': 'Non-renewable waste',
'Residuos solidos urbanos': 'Non-renewable waste',
'RESIDUOS SOLIDOS URBANOS': 'Non-renewable waste',
' ': 'Other or unspecified energy sources',
np.nan: 'Other or unspecified energy sources'}
data_ES["energy_source"].replace(dict_energysources_ES, inplace=True)
data_ES["energy_source"].replace('NaN', np.nan, inplace=True)
# Check if all energy sources have been translated
for energysource in data_ES["energy_source"].unique():
if (energysource not in dict_energysources_ES.values()) & (str(energysource) != "NaN"):
logger.error("Not renamed energy source: " + str(energysource))
Overall translation of all technology types mentioned in the column "technology" and subsequent translation check.
In [ ]:
dict_technologies_ES = {'Aprovechamiento de energias residuales': 'NaN',
'Biogas': 'NaN',
'Biomasa': 'NaN',
'Cogeneracion': 'NaN',
'Eolica terrestre': 'Onshore',
'Fotovoltaica': 'Photovoltaics',
'Hidraulica fluyente': 'Run-of-river',
'Hidraulica': 'Differently categorized hydro',
'Motor': 'Combustion engine',
'Otras': 'NaN',
'Termica': 'NaN',
'Termica clasica': 'NaN',
'Termonuclear': 'Steam turbine',
'Tratamiento de residuos': 'NaN',
'Turbina': 'NaN',
'Turbina de gas': 'Gas turbine',
'Residuos': 'NaN',
'Solar Termoelectrica': 'Concentrated solar power',
' ': 'NaN',
'nan': 'NaN',
np.nan: 'NaN',
}
data_ES["technology"].replace(dict_technologies_ES, inplace=True)
data_ES["technology"].replace('NaN', np.nan, inplace=True)
# Check if all technologies have been translated
for technology in data_ES["technology"].unique():
if (technology not in dict_technologies_ES.values()) & (str(technology) != "NaN"):
logger.error("Untranslated technology: " + str(technology))
Generation of entries for the column "energy_source" according to information given in the column "technology".
In [ ]:
# Generate entries in column "energy_source" according to technologies
data_ES.loc[data_ES['technology'] == 'Differently categorized hydro',
'energy_source'] = 'Hydro'
data_ES["technology"].replace('Differently categorized hydro', np.nan, inplace=True)
data_ES.loc[data_ES['technology'] == 'Run-of-river',
'energy_source'] = 'Hydro'
data_ES.loc[data_ES['technology'] == 'Onshore',
'energy_source'] = 'Wind'
data_ES.loc[data_ES['technology'] == 'Photovoltaics',
'energy_source'] = 'Solar'
data_ES.loc[data_ES['technology'] == 'Concentrated solar power',
'energy_source'] = 'Solar'
# Delete unwanted energy sources
data_ES = data_ES[data_ES.energy_source != 'Wind']
data_ES = data_ES[data_ES.energy_source != 'Solar']
In this section a manually compiled list is used to define the geographic coordinates of individual power plants. Please note: The naming of power plants in the Spanish dataset is not unique, which creates duplicate entries during the merge with the geographical data.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_ES.csv')
additional_data_ES = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Merge plant data with manually compiled geo-coordinates (currently duplicate entries due to merge based on name)
data_ES = data_ES.merge(additional_data_ES,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop duplicate entries
data_ES = data_ES.drop_duplicates(subset=['company', 'name', 'city'], keep='first')
# Drop columns after merge
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_ES = data_ES.drop(colsToDrop, axis=1)
data_ES.head()
The data is provided by the British government's Statistical Office. It encompasses a detailed list of British generation units with comprehensive information on technologies and energy fuels.
In [ ]:
data_UK = importdata('UK', 'GOV')
The imported data is standardized with respect to the columns defined in section 2.4. In a first step, existing and output-relevant columns are translated; in a second step, the remaining columns are deleted. Columns which do not exist in the data set but are required for the output are added in this process.
In [ ]:
# Rename sixth column
data_UK.columns.values[5] = 'Location'
# Drop rows without station names, so that the footnotes at the end of the list are deleted
data_UK = data_UK.dropna(subset=['Station Name'])
# Drop columns not needed anymore
colsToDrop = ['Footnotes']
data_UK = data_UK.drop(colsToDrop, axis=1)
# Rename columns
dict_columns_UK = {'Company Name': 'company',
'Station Name': 'name',
'Installed Capacity (MW)': 'capacity',
'Country': 'country',
'Location': 'location',
'Fuel': 'energy_source',
'Year of commission or year generation began': 'commissioned',
'Source': 'source'}
data_UK.rename(columns=dict_columns_UK, inplace=True)
# Check if all columns have been renamed
for columnnames in data_UK.columns:
if columnnames not in dict_columns_UK.values():
logger.error("Not renamed column: " + columnnames)
# Adjust names of region
dict_regions_UK = {'East': 'England',
'East Midlands': 'England',
'London': 'England',
'North East': 'England',
'North West': 'England',
'South East': 'England',
'South West': 'England',
'West Midlands': 'England',
'Yorkshire and the Humber': 'England',
'N Ireland': 'Northern Ireland'}
data_UK["location"].replace(dict_regions_UK, inplace=True)
# Store the region given in column "location" in the column "additional_info"
data_UK['additional_info'] = data_UK[['location']].apply(
    lambda x: 'Region: {}'.format(x[0]), axis=1)
# Drop column "Location" after merger
colsToDrop = ['location']
data_UK = data_UK.drop(colsToDrop, axis=1)
# Adjust commissioning years
dict_commissioning_UK = {'1926/2002': '2002'}
data_UK["commissioned"].replace(dict_commissioning_UK, inplace=True)
# Apply general template of columns
data_UK = data_UK.reindex(columns=columns_sorted)
# Adjust types of entries in all columns
data_UK.capacity = data_UK.capacity.astype(float)
Generation of entries for the column "technology" according to information given in the column "energy_source".
In [ ]:
# Generate entries in column "technology" according to column "energy_source"
data_UK.loc[data_UK['energy_source'] == 'Hydro / pumped storage',
'technology'] = 'Pumped storage'
data_UK.loc[data_UK['energy_source'] == 'Pumped storage',
'technology'] = 'Pumped storage'
data_UK.loc[data_UK['energy_source'] == 'Wind',
'technology'] = 'Onshore'
data_UK.loc[data_UK['energy_source'] == 'Wind (offshore)',
'technology'] = 'Offshore'
data_UK.loc[data_UK['energy_source'] == 'Nuclear',
'technology'] = 'Steam turbine'
data_UK.loc[data_UK['energy_source'] == 'CCGT',
'technology'] = 'Combined cycle'
data_UK.loc[data_UK['energy_source'] == 'OCGT',
'technology'] = 'Gas turbine'
Overall translation of all energy source types mentioned in the column "energy_source" and subsequent translation check. Deletion of rows containing "Wind" or "Solar" as energy source.
In [ ]:
dict_energysources_UK = {'Biomass': 'Biomass and biogas',
'Biomass / gas / waste derived fuel': 'Mixed fossil fuels',
'CCGT': 'Natural gas',
'Coal': 'Hard coal',
'Coal / biomass': 'Mixed fossil fuels',
'Coal / biomass / gas / waste derived fuel': 'Mixed fossil fuels',
'Coal / oil': 'Mixed fossil fuels',
'Coal/oil': 'Mixed fossil fuels',
'Diesel': 'Oil',
'Gas': 'Natural gas',
'Gas / oil': 'Mixed fossil fuels',
'Gas oil': 'Oil',
'Gas oil / kerosene': 'Oil',
'Hydro': 'Hydro',
'Hydro / pumped storage': 'Hydro',
'Light oil': 'Oil',
'Meat & bone meal': 'Other bioenergy and renewable waste',
'Nuclear': 'Nuclear',
'OCGT': 'Natural gas',
'Oil': 'Oil',
'Light oil ': 'Oil',
'Pumped storage': 'Hydro',
'Straw': 'Biomass and biogas',
'Waste': 'Non-renewable waste',
'Wind': 'Wind',
'Wind (offshore)': 'Wind',
'Solar': 'Solar'}
data_UK["energy_source"].replace(dict_energysources_UK, inplace=True)
# Check if all energy sources have been translated (NaN entries are skipped)
for energysource in data_UK["energy_source"].unique():
    if (energysource not in dict_energysources_UK.values()) and not pd.isnull(energysource):
        logger.error("Not renamed energy source: " + str(energysource))
# Delete unwanted energy sources
data_UK = data_UK[data_UK.energy_source != 'Wind']
data_UK = data_UK[data_UK.energy_source != 'Solar']
In this section a manually compiled list is used to define the geographic coordinates of individual power plants.
In [ ]:
# Access the second list with EIC-Codes and geographic coordinates
filepath = os.path.join('input', 'input_plant_locations_UK.csv')
additional_data_UK = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# Initiate merger
data_UK = data_UK.merge(additional_data_UK,
left_on='name',
right_on='name',
how='left',
suffixes=('_x', ''))
# Drop columns after merger
colsToDrop = ['eic_code_x', 'lat_x', 'lon_x']
data_UK = data_UK.drop(colsToDrop, axis=1)
data_UK.head()
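Not every plant name in the UK dataset necessarily has a counterpart in the manually compiled location list. As a purely diagnostic illustration (not part of the original processing), the entries that remain without coordinates after the merge can be counted:
In [ ]:
# Diagnostic only: UK plants for which no geographic coordinates were found
missing_coords_UK = data_UK[data_UK['lat'].isnull()]
logger.info('UK plants without coordinates: ' + str(len(missing_coords_UK)))
missing_coords_UK[['name', 'company']].head()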
The data is provided by the power exchange Nordpool. It encompasses a detailed list of Norwegian generation units with a capacity of more than 100 MW for 2013.
In [ ]:
# Access local file
filepath = os.path.join('input', 'input_plant-list_NO.csv')
data_NO = pd.read_csv(filepath, encoding='utf-8', header=0, index_col=None)
data_NO.head()
The data is provided by the power exchange Nordpool. It encompasses a detailed list of Swedish generation units with a capacity of more than 100 MW for 2014.
In [ ]:
# Access local file
filepath = os.path.join('input', 'input_plant-list_SE.csv')
data_SE = pd.read_csv(filepath, encoding='utf-8', header=0, index_col=None)
data_SE.head()
The data is provided by the Slovakian utility Slovenské elektrárne a.s. (SEAS). It encompasses a detailed list of Slovak generation units with comprehensive information on technologies and energy fuels.
In [ ]:
# Access local file
filepath = os.path.join('input', 'input_plant-list_SK.csv')
data_SK = pd.read_csv(filepath, encoding='utf-8', header=0, index_col=None)
# Delete unwanted energy source
data_SK = data_SK[data_SK.energy_source != 'Solar']
data_SK.head()
The data is provided by several Slovenian utilities. The respective data links are given in the column "source". This list encompasses Slovenian generation units with comprehensive information on technologies and energy fuels.
In [ ]:
# Access local file
filepath = os.path.join('input', 'input_plant-list_SI.csv')
data_SI = pd.read_csv(filepath, encoding='utf-8', header=0, index_col=None)
data_SI.head()
The data for conventional power plants is provided by several Austrian utilities. The respective data links are given in the column "source". The specifications of Austrian hydro power plants, however, are based solely on Verbund AG. The resulting list encompasses Austrian generation units with comprehensive information on technologies and energy fuels.
In [ ]:
# Access local file for conventional plants
filepath_conventional = os.path.join(
'input', 'input_plant-list_AT_conventional.csv')
data_AT_conventional = pd.read_csv(
filepath_conventional, encoding='utf-8', header=0, index_col=None)
# Access local file for hydro plants
filepath_hydro = os.path.join('input', 'input_plant-list_AT_hydro.csv')
data_AT_hydro = pd.read_csv(
filepath_hydro, encoding='utf-8', header=0, index_col=None)
# Merge the lists
data_AT = pd.concat([data_AT_conventional, data_AT_hydro], ignore_index=True)
data_AT.head()
The data is provided by the Danish transmission network operator Energinet.dk. It encompasses a detailed list of Danish generation units with comprehensive information on technologies and energy fuels.
In [ ]:
# Access local file for conventional plants
filepath = os.path.join('input', 'input_plant-list_DK.csv')
data_DK = pd.read_csv(
filepath, encoding='utf-8', header=0, index_col=None)
# List only operating plants
data_DK = data_DK[data_DK.availability != '0']
data_DK = data_DK[data_DK.availability != 'partly']
data_DK.head()
In the following, the national datasets are consolidated into a single European dataset. Unfortunately, the Belgian dataset cannot be integrated due to copyright restrictions by the data owner ELIA.
In [ ]:
dataframes = [#data_BE,
data_NL,
data_FR,
data_PL,
data_CZ,
data_CH,
data_IT,
data_FI,
data_ES,
data_UK,
data_NO,
data_SE,
data_SK,
data_SI,
data_AT,
data_DK]
data_EU = pd.concat(dataframes)
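As a quick plausibility check (not part of the original processing, and assuming each national dataset carries the country column from the general column template), the number of entries per country can be inspected right after the concatenation:
In [ ]:
# Diagnostic only: number of plant entries contributed by each national dataset
data_EU['country'].value_counts()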
In [ ]:
# Import energy source level definition
energy_source_levels = pd.read_csv(
os.path.join('input', 'energy_source_levels.csv'), index_col=None, header=0)
# Merge energy source levels to data set
data_EU = data_EU.reset_index().merge(
energy_source_levels,
how='left',
left_on='energy_source',
right_on='energy_source_level_1').drop_duplicates(
subset=['name',
'city',
'country',
'capacity'], keep='first').set_index('name')
data_EU = data_EU.reset_index().merge(
energy_source_levels,
how='left',
left_on='energy_source',
right_on='energy_source_level_2').drop_duplicates(
subset=['name',
'city',
'country',
'capacity'], keep='first').set_index('name')
data_EU = data_EU.reset_index().merge(
energy_source_levels,
how='left',
left_on='energy_source',
right_on='energy_source_level_3').drop_duplicates(
subset=['name',
'city',
'country',
'capacity'], keep='first').set_index('name')
# Combine different energy source levels created by merge
data_EU['energy_source_level_1'] = data_EU[
['energy_source_level_1',
'energy_source_level_1_x',
'energy_source_level_1_y']].fillna('').sum(axis=1)
data_EU['energy_source_level_2'] = data_EU[
['energy_source_level_2',
'energy_source_level_2_y']].fillna('').sum(axis=1)
data_EU['energy_source_level_3'] = data_EU[
['energy_source_level_3']].fillna('').sum(axis=1)
# Drop auxiliary columns due to merge
colsToDrop = ['energy_source_level_1_y',
'energy_source_level_2_y',
'energy_source_level_3_y',
'energy_source_level_1_x',
'energy_source_level_2_x',
'energy_source_level_3_x']
data_EU = data_EU.drop(colsToDrop, axis=1)
data_EU.head()
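Energy source names that are not covered by energy_source_levels.csv end up with empty level entries after the merges. The following cell is a diagnostic illustration only and not part of the original processing; it lists such unmatched entries:
In [ ]:
# Diagnostic only: energy sources that could not be matched to any level in
# energy_source_levels.csv end up with an empty energy_source_level_1 string
unmatched_sources = data_EU[data_EU['energy_source_level_1'] == '']
logger.info('Entries without energy_source_level_1: ' + str(len(unmatched_sources)))
unmatched_sources[['energy_source', 'country']].drop_duplicates().head()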
First, we define the ordering of the columns. Secondly, the data types are redefined. At the moment, this has the drawback that empty columns are redefined as float instead of object.
In [ ]:
columns_sorted_output = ['name',
'company',
'street',
'postcode',
'city',
'country',
'capacity',
'energy_source',
'technology',
'chp',
'commissioned',
'type',
'lat',
'lon',
'eic_code',
'energy_source_level_1',
'energy_source_level_2',
'energy_source_level_3',
'additional_info',
'comment',
'source']
# Set ordering of columns
data_EU = data_EU.reset_index()
data_EU = data_EU.reindex(columns=columns_sorted_output)
# Set data types for columns
data_EU = data_EU.astype(str)
data_EU[['capacity', 'commissioned', 'lat', 'lon']] = data_EU[
['capacity', 'commissioned', 'lat', 'lon']].astype(float)
data_EU.replace('nan', np.nan, inplace=True)
# data_EU.dtypes
# Set index
data_EU = data_EU.set_index('name')
data_EU.head()
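The dtype drawback mentioned above can be checked, and if desired corrected, in a short optional step. The following sketch is an illustration only and not part of the original pipeline; it casts columns that contain no values at all back to object:
In [ ]:
# Optional: columns without any values are typed float64 after the conversions
# above; cast them back to object if the original dtype is preferred
empty_columns = [col for col in data_EU.columns if data_EU[col].isnull().all()]
data_EU[empty_columns] = data_EU[empty_columns].astype(object)
data_EU.dtypes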
We document the data package's metadata in the JSON format proposed by the Open Knowledge Foundation. See the Frictionless Data project by OKFN (http://data.okfn.org/) and the Data Package specifications (http://dataprotocols.org/data-packages/) for more details.
In order to keep the notebook readable, the metadata is maintained in the human-readable YAML format in the file input/metadata.yml. This file is parsed into a Python dictionary and then saved to disk as a JSON file.
In [ ]:
with open(os.path.join('input', 'metadata.yml'), 'r') as f:
    metadata = yaml.safe_load(f)
datapackage_json = json.dumps(metadata, indent=4, separators=(',', ': '))
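For orientation, a Data Package description typically contains a name, a title and a list of resources. The following cell shows only an illustrative skeleton of the parsed dictionary; the actual content is defined in input/metadata.yml:
In [ ]:
# Illustrative skeleton only - the real metadata is read from input/metadata.yml
example_metadata = {
    'name': 'example-data-package',
    'title': 'Example data package title',
    'resources': [
        {'path': 'conventional_power_plants_EU.csv', 'format': 'csv'}
    ]
}
print(json.dumps(example_metadata, indent=4))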
In [ ]:
output_path = 'output'
data_EU.to_csv(os.path.join(
output_path, 'conventional_power_plants_EU.csv'),
encoding='utf-8',
index_label='name')
data_EU.to_excel(
os.path.join(output_path, 'conventional_power_plants_EU.xlsx'),
sheet_name='plants',
index_label='name')
data_EU.to_sql(
'conventional_power_plants_EU',
sqlite3.connect(os.path.join(output_path, 'conventional_power_plants.sqlite')),
if_exists="replace",
index_label='name')
with open(os.path.join(output_path, 'datapackage.json'), 'w') as f:
    f.write(datapackage_json)
In [ ]:
files = [
'conventional_power_plants_DE.csv', 'conventional_power_plants_DE.xlsx',
'conventional_power_plants_EU.csv', 'conventional_power_plants_EU.xlsx',
'conventional_power_plants.sqlite']
with open('checksums.txt', 'w') as f:
    for file_name in files:
        file_hash = get_sha_hash(os.path.join(output_path, file_name))
        f.write('{},{}\n'.format(file_name, file_hash))
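The helper get_sha_hash used above is defined earlier in the notebook. For readers running this section in isolation, a minimal equivalent could look like the following sketch; the hash algorithm (SHA-256 here) and the block size are assumptions and may differ from the function defined earlier:
In [ ]:
def get_sha_hash(path, blocksize=65536):
    # Read the file in chunks so that large output files do not have to be
    # loaded into memory at once; assumption: SHA-256 as hash algorithm
    sha_hasher = hashlib.sha256()
    with open(path, 'rb') as f:
        buffer = f.read(blocksize)
        while len(buffer) > 0:
            sha_hasher.update(buffer)
            buffer = f.read(blocksize)
    return sha_hasher.hexdigest()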