Some BeautifulSoup examples to help you scrape HTML content from web pages.
See the official BeautifulSoup documentation for the full technical reference.
In [ ]:
    
# Import all the things!
import urllib.request
from datetime import *
from lxml import html
from bs4 import BeautifulSoup
    
In [ ]:
    
# Scrape all HTML from webpage.
def scrapewebpage(url):
	"""Return the raw HTML of *url*, or None (implicitly) on a non-200 status."""
	# Open URL and get HTML.
	web = urllib.request.urlopen(url)
	# Make sure there wasn't any errors opening the URL.
	if web.getcode() == 200:
		# Return the page body directly; the original bound it to a local
		# named `html`, which shadowed the `from lxml import html` import.
		return web.read()
	else:
		# BUG FIX: the original passed a single str(...) value to a
		# two-placeholder format string (and `url` as a second argument to
		# print), raising TypeError whenever this branch ran. Supply both
		# values as a tuple to the % operator instead.
		print("Error %s reading %s" % (web.getcode(), url))
# Helper function that scrapes a webpage and turns it into soup.
def makesoup(url):
	"""Fetch *url* and parse the returned HTML with the lxml parser."""
	page_source = scrapewebpage(url)
	soup = BeautifulSoup(page_source, "lxml")
	return soup
    
In [ ]:
    
# Scrape the Wikipedia main page and parse it into a BeautifulSoup object.
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
    
In [ ]:
    
# Match the tag with the id "mp-itn-h2" (the "In the news" <h2> heading).
h2 = wp_soup.find(id="mp-itn-h2")
# Last expression in the cell: display the matched tag.
h2
    
In [ ]:
    
# Only get the text inside <h2>, stripping all nested HTML markup.
h2.get_text()
    
In [ ]:
    
# Scrape the Wikipedia main page again (fresh copy of the soup).
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
    
In [ ]:
    
# Find the first HTML tag (of any name) that has class mw-headline.
# Using the documented class_ keyword is clearer than the original
# positional form find("", "mw-headline"), which relied on the empty
# tag name being falsy; the match result is the same.
headline = wp_soup.find(class_="mw-headline")
# Last expression in the cell: display the matched tag.
headline
    
In [ ]:
    
# Only get the text inside the matched tag (a <span> on this page).
headline.get_text()
    
In [ ]:
    
# Scrape the Wikipedia main page again (fresh copy of the soup).
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
    
In [ ]:
    
# Find all HTML tags (of any name) that have class mw-headline.
# Using the documented class_ keyword is clearer than the original
# positional form find_all("", "mw-headline"); the match result is the same.
all_headlines = wp_soup.find_all(class_="mw-headline")
# Last expression in the cell: display the list of matched tags.
all_headlines
    
In [ ]:
    
# The result is a list, so we can loop over it and print each headline's text.
for headline in all_headlines:
    print(headline.get_text())
    
In [ ]:
    
# Scrape the Wikipedia main page again (fresh copy of the soup).
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
    
In [ ]:
    
# Find all <h3> tags on the page.
# NOTE(review): the original comment mentioned class mw-headline, but the
# code matches on the tag name only.
all_h3 = wp_soup.find_all("h3")
# Last expression in the cell: display the list of matched tags.
all_h3
    
In [ ]:
    
# The result is a list, so we can loop over it and print each heading's text.
for heading in all_h3:
    print(heading.get_text())
    
In [ ]:
    
# Scrape a Wikipedia page that contains a data table.
champ_soup = makesoup("https://en.wikipedia.org/wiki/European_Road_Championships")
    
In [ ]:
    
# Find the first <table class="wikitable"> (second positional argument is
# matched against the class attribute).
table = champ_soup.find("table", "wikitable")
# Last expression in the cell: display the matched table.
table
    
In [ ]:
    
# Walk the table row by row and print the text from the first two columns.
for table_row in table.find_all('tr'):
    cells = table_row.find_all('td')
    # Skip rows without any <td> cells (presumably header rows — confirm).
    if cells:
        year = cells[0].get_text()     # Text in the 1st column.
        country = cells[1].get_text()  # Text in the 2nd column.
        print(f"{year} {country}")
    
In [ ]:
    
table = champ_soup.find("table", "wikitable")
# Get cell value from row 5, column 1.
cell = table.find_all('tr')[5].find_all('td')[1].get_text()
cell
    
In [ ]:
    
# Scrape the Wikipedia main page again (fresh copy of the soup).
wp_soup = makesoup("https://en.wikipedia.org/wiki/Main_Page")
    
In [ ]:
    
# Find the <table> with id "mp-middle".
# NOTE(review): the original comment said class="mp-middle", but the code
# matches on the id attribute, not the class.
middle_table = wp_soup.find("table", id="mp-middle")
# In the <table>, find the first <h2>.
h2 = middle_table.find("h2")
# Last expression in the cell: display the matched tag.
h2
    
In [ ]:
    
# Only get the text inside the <h2>, stripping all nested HTML markup.
h2.get_text()