In [5]:
from bs4 import BeautifulSoup
import requests
# let's use BeautifulSoup to read an example webpage
html = requests.get("http://www.example.com").text
soup = BeautifulSoup(html)
In [11]:
# Let's read the text from the first paragraph div on the website.
print(soup.find('p'))
print(soup.find('p').text.split())
In [8]:
print(soup.p)
In [13]:
first_paragraph = soup.p
first_paragraph_test = soup.p.text
first_paragraph_words = soup.p.text.split()
# first_paragraph_id = soup.p['id'] # raises error
soup.p.get('id') # returns None
In [14]:
all_paragraphs = soup.find_all('p')
print(all_paragraphs)
In [15]:
# get all paragraphs with ids
ps_with_ids = [p for p in soup('p') if p.get('id')]
print(ps_with_ids)
In [19]:
# find tags with a specific class
important_paragraphs = soup('p', {'class': 'important'})
print(important_paragraphs)
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p') if 'important' in p.get('class', [])]
In [23]:
# Okay, now let's use BeautifulSoup to parse a slightly more interesting webpage
url = 'http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page=1'
soup = BeautifulSoup(requests.get(url).text)
tds = soup('td', 'thumbtext')
print(len(tds))
In [24]:
def is_video(td):
"""It's a video if it has exactly one price label and if the
stripped text inside that price label starts with 'Video'"""
pricelabels = td('span', 'pricelabel')
return (len(pricelabels) == 1 and pricelabels[0].text.strip().startswith("Video"))
# How many of the result cells are books (i.e., not videos)?
print(sum(1 for td in tds if not is_video(td)))
In [35]:
import re
def book_info(td):
'''given a BeautifulSoup <td? Tag representing an O'Reilly book,
extract the books details and return a dict'''
title = td.find('div', 'thumbheader').find('a').text
by_author = td.find('div', 'AuthorName').text
authors = [x.strip() for x in re.sub('^By ', '', by_author).split(',')]
isbn_link = td.find('td', 'thumbheader').find('a').get('href')
isbn = re.match('/product/(.*)\.do', isbn_link).groups()[0]
date = td.find('span', 'directorydate').text.strip()
return {
'title': title,
'authors': authors,
'isbn': isbn,
'date': date
}
from time import sleep  # BUG FIX: sleep() was called below without being imported

base_url = 'http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page='

books = []
NUM_PAGES = 31  # number of result pages on the site at the time of writing

for page_num in range(1, NUM_PAGES + 1):
    print("souping range", page_num, ',', len(books), ' found so far')
    url = base_url + str(page_num)
    # Pin the parser so the scrape is deterministic across environments.
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for td in soup('td', 'thumbtext'):
        # (removed leftover debug print of td.find('td', 'thumbheader'),
        # which also used the wrong tag name)
        if not is_video(td):
            books.append(book_info(td))
    # Be polite to the server: pause between page requests.
    sleep(1)
In [ ]: