Our goal: Scrape a table of U.S. nuclear reactors into a CSV.
In [1]:
import requests
from bs4 import BeautifulSoup
import csv
In [5]:
# define the url of the NRC operating-reactor listing page
URL = 'https://www.nrc.gov/reactors/operating/list-power-reactor-units.html'
# get the page -- a timeout keeps the script from hanging forever on a
# stalled connection (requests has no default timeout)
nrc_page = requests.get(URL, timeout=30)
# fail fast on an HTTP error (404/500) instead of scraping an error page
nrc_page.raise_for_status()
# specify the encoding so .text decodes correctly
nrc_page.encoding = 'UTF-8'
# turn it into soup using the stdlib parser
soup = BeautifulSoup(nrc_page.text, 'html.parser')
In [8]:
# grab the first <table> on the page; attribute access is BeautifulSoup
# shorthand for soup.find('table')
reactor_table = soup.table
In [27]:
# Write one CSV row per reactor. newline='' is required by the csv module
# (otherwise Windows gets blank lines between rows), and we pin the output
# encoding to UTF-8 to match the page we scraped.
with open('reactors.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=['name', 'link', 'docket', 'reactor_type',
                                                 'license', 'location', 'owner', 'region'])
    writer.writeheader()
    # skip the header <tr>, then process each data row
    for row in reactor_table.find_all('tr')[1:]:
        # each <tr> has some <td> cells inside it; we'll move these into variables,
        # do some string manipulations and write to the CSV
        cells = row.find_all('td')
        # reactor name, detail page link and docket number are all part of the first cell
        # the .contents attribute is a list of a tag's children -->
        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#contents-and-children
        name = cells[0].contents[0].string
        # link on the page is relative; prepend the site root
        link = 'https://www.nrc.gov' + cells[0].contents[0]['href']
        docket = cells[0].contents[2].strip()
        # license number is in the second cell
        # (named license_number so we don't shadow the license() builtin)
        license_number = cells[1].string.strip()
        # reactor type is in the third cell
        reactor_type = cells[2].string.strip()
        # location is in the fourth cell
        location = cells[3].string.strip()
        # some of the locations have multiple internal spaces -- here's a trick for dealing with that
        # https://stackoverflow.com/a/1546251
        location = ' '.join(location.split())
        # owner is in the fifth cell
        owner = cells[4].contents[0].strip()
        # region is in the sixth cell
        region = cells[5].string.strip()
        writer.writerow({
            'name': name,
            'link': link,
            'docket': docket,
            'reactor_type': reactor_type,
            'license': license_number,
            'location': location,
            'owner': owner,
            'region': region
        })