In [1]:
#Import modules
import requests
from bs4 import BeautifulSoup
In [2]:
#Example URL: the hbw.com species page for the brown wood owl (Strix leptogrammica),
#which is the page the rest of this notebook downloads and parses
theURL = "https://www.hbw.com/species/brown-wood-owl-strix-leptogrammica"
In [3]:
#Get content of the species web page.
#The timeout (in seconds) keeps this cell from hanging forever if the server never
#answers, and raise_for_status() stops here on a 4xx/5xx reply instead of letting
#later cells parse an error page.
response = requests.get(theURL, timeout=30)
response.raise_for_status()
In [4]:
#Parse the page's HTML text with the lxml parser into a "soup" object,
#the structure BS4 uses for searching the document
soup = BeautifulSoup(response.text, features='lxml')
The subspecies information sits in a section (a `div` in HTML lingo) labeled `<div class="ds-ssp_comp">` in the HTML. So we'll search the `soup` for this section, which returns a list of one object, then extract that one object to a variable named `section`. See: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-by-css-class
In [5]:
#Find all <div> sections with the CSS class 'ds-ssp_comp' and get the first (only) item found
div = soup.find_all('div', class_='ds-ssp_comp')
if not div:
    #Fail with a clear message rather than a bare "list index out of range"
    #when the page has no matching section (e.g. the site layout changed)
    raise IndexError("No <div class='ds-ssp_comp'> section found in the downloaded page")
section = div[0]
Within that section, the items wrapped in `<em>` tags are the subspecies entries.
In [6]:
#Collect every <em> element nested inside the section
subSpecies = section.find_all(name='em')
In [7]:
#Print the text of each <em> element found in the section, one per line
for subSpp in subSpecies:
    print(subSpp.get_text())