Our goal: Scrape a table of U.S. nuclear reactors into a CSV.
In [1]:
import requests
from bs4 import BeautifulSoup
import csv
In [5]:
# define the url of the NRC operating-reactor listing page
URL = 'https://www.nrc.gov/reactors/operating/list-power-reactor-units.html'
# get the page -- a timeout keeps the script from hanging forever on a
# stalled connection (requests has no default timeout)
nrc_page = requests.get(URL, timeout=30)
# fail fast on an HTTP error (404/500) instead of scraping an error page
nrc_page.raise_for_status()
# specify the encoding so .text decodes correctly
nrc_page.encoding = 'UTF-8'
# turn it into soup using the stdlib parser
soup = BeautifulSoup(nrc_page.text, 'html.parser')
In [8]:
# grab the first <table> on the page; attribute access is BeautifulSoup
# shorthand for soup.find('table')
reactor_table = soup.table
In [27]:
# Write one CSV row per reactor. newline='' is required by the csv module
# (otherwise Windows gets blank lines between rows), and we pin the output
# encoding to UTF-8 to match the page we scraped.
with open('reactors.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=['name', 'link', 'docket', 'reactor_type',
                                                 'license', 'location', 'owner', 'region'])
    writer.writeheader()
    # skip the header <tr>, then process each data row
    for row in reactor_table.find_all('tr')[1:]:
        # each <tr> has some <td> cells inside it; we'll move these into variables,
        # do some string manipulations and write to the CSV
        cells = row.find_all('td')
        # reactor name, detail page link and docket number are all part of the first cell
        # the .contents attribute is a list of a tag's children -->
        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#contents-and-children
        name = cells[0].contents[0].string
        # link on the page is relative; prepend the site root
        link = 'https://www.nrc.gov' + cells[0].contents[0]['href']
        docket = cells[0].contents[2].strip()
        # license number is in the second cell
        # (named license_number so we don't shadow the license() builtin)
        license_number = cells[1].string.strip()
        # reactor type is in the third cell
        reactor_type = cells[2].string.strip()
        # location is in the fourth cell
        location = cells[3].string.strip()
        # some of the locations have multiple internal spaces -- here's a trick for dealing with that
        # https://stackoverflow.com/a/1546251
        location = ' '.join(location.split())
        # owner is in the fifth cell
        owner = cells[4].contents[0].strip()
        # region is in the sixth cell
        region = cells[5].string.strip()
        writer.writerow({
            'name': name,
            'link': link,
            'docket': docket,
            'reactor_type': reactor_type,
            'license': license_number,
            'location': location,
            'owner': owner,
            'region': region
        })