Some BeautifulSoup examples to help you scrape HTML from web pages.
See the BeautifulSoup documentation for technical reference.
In [ ]:
# Import all the things!
import urllib.request
from bs4 import BeautifulSoup  # The "lxml" parser used below needs the lxml package installed.
In [ ]:
# Scrape all HTML from a web page.
def scrapewebpage(url):
    # Open the URL and get the HTML.
    web = urllib.request.urlopen(url)
    # Make sure there weren't any errors opening the URL.
    if web.getcode() == 200:
        html = web.read()
        return html
    else:
        print("Error %s reading %s" % (web.getcode(), url))

# Helper function that scrapes a web page and turns it into soup.
def makesoup(url):
    html = scrapewebpage(url)
    return BeautifulSoup(html, "lxml")
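Some servers refuse requests that arrive without a User-Agent header. If you run into that, a variant of scrapewebpage() that sends one could look like the sketch below; the header string is just a placeholder, not something the page requires.
In [ ]:
# A sketch of scrapewebpage() that sends a User-Agent header.
# The header value is a placeholder; replace it with something that identifies your scraper.
def scrapewebpage_with_headers(url):
    req = urllib.request.Request(url, headers={"User-Agent": "my-scraper/0.1"})
    web = urllib.request.urlopen(req)
    if web.getcode() == 200:
        return web.read()
    else:
        print("Error %s reading %s" % (web.getcode(), url))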
In [ ]:
# Scrape Wikipedia main page.
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
In [ ]:
# Find the <h2> tag with the id mp-itn-h2.
h2 = wp_soup.find(id="mp-itn-h2")
h2
In [ ]:
# Only get the text inside the <h2>.
h2.get_text()
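get_text() keeps any surrounding whitespace; passing strip=True trims it. A tag's attributes can also be read like a dictionary.
In [ ]:
# Trim leading/trailing whitespace from the text.
print(h2.get_text(strip=True))
# Read a tag attribute; this is the id we searched for.
print(h2.get("id"))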
In [ ]:
# Scrape Wikipedia main page.
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
In [ ]:
# Find the first HTML tag that has the class mw-headline.
headline = wp_soup.find(class_="mw-headline")
headline
In [ ]:
# Only get the text inside the <span>.
headline.get_text()
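From a found tag you can also navigate the surrounding tree, for example up to its parent. Exactly what the parent is depends on Wikipedia's current markup, so treat this as exploratory.
In [ ]:
# The tag's own name, and the name of the tag that contains it.
print(headline.name)
print(headline.parent.name)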
In [ ]:
# Scrape Wikipedia main page.
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
In [ ]:
# Find all HTML tags that have the class mw-headline.
all_headlines = wp_soup.find_all(class_="mw-headline")
all_headlines
In [ ]:
# Now we have a list that we can loop over with a for loop.
for headline in all_headlines:
    headline = headline.get_text()
    print(headline)
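If you only want the texts, a list comprehension collects them in one line; it does the same work as the loop above.
In [ ]:
# Collect the text of every headline into a list.
headline_texts = [headline.get_text() for headline in all_headlines]
headline_texts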
In [ ]:
# Scrape Wikipedia main page.
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
In [ ]:
# Find all <h3> tags.
all_h3 = wp_soup.find_all("h3")
all_h3
In [ ]:
# Now we have a list that we can loop over with a for loop.
for h3 in all_h3:
    h3 = h3.get_text()
    print(h3)
In [ ]:
# Scrape a Wikipedia page with a table.
champ_soup = makesoup("https://en.wikipedia.org/wiki/European_Road_Championships")
In [ ]:
# Find <table class="wikitable">.
table = champ_soup.find("table", "wikitable")
table
In [ ]:
# Go through each row and take the text from the 1st and 2nd columns.
rows = table.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    if len(cols) > 0:
        year = cols[0].get_text()     # Get the text in the 1st column.
        country = cols[1].get_text()  # Get the text in the 2nd column.
        print(year + " " + country)
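If you have pandas installed, the same table can also be loaded into a DataFrame in one call. This is an optional shortcut outside BeautifulSoup; it assumes the pandas package is available.
In [ ]:
# Optional: parse the same table with pandas (requires the pandas package).
import pandas as pd
from io import StringIO

# read_html() returns a list of DataFrames, one per <table> in the HTML.
df = pd.read_html(StringIO(str(table)))[0]
df.head()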
In [ ]:
table = champ_soup.find("table", "wikitable")
# Get the cell value from the 6th row, 2nd column (indexes are zero-based).
cell = table.find_all('tr')[5].find_all('td')[1].get_text()
cell
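The indexes in a find_all() result are zero-based, so [5] is the sixth row and [1] is the second column. Printing a whole row with its indexes makes it easier to pick the right cell.
In [ ]:
# Print every cell in the row at index 5 together with its column index.
for i, col in enumerate(table.find_all('tr')[5].find_all('td')):
    print(i, col.get_text(strip=True))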
In [ ]:
# Scrape Wikipedia main page.
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
In [ ]:
# Find the <table> with id="mp-middle".
middle_table = wp_soup.find("table", id="mp-middle")
# Within that table, find the first <h2>.
h2 = middle_table.find("h2")
h2
In [ ]:
# Only get the text inside the <h2>.
h2.get_text()
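The same lookup can also be written as a CSS selector with select_one(), which returns None if nothing matches. This assumes the page still has an element with the id mp-middle.
In [ ]:
# CSS selector version: the first <h2> inside the element with id="mp-middle".
h2_css = wp_soup.select_one("#mp-middle h2")
h2_css.get_text(strip=True) if h2_css is not None else None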