In [5]:
from bs4 import BeautifulSoup
import requests

# let's use BeautifulSoup to read an example webpage
html = requests.get("http://www.example.com").text
# name a parser explicitly so bs4 doesn't warn about having to guess one
soup = BeautifulSoup(html, 'html.parser')
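
In [ ]:
# quick sanity check that the parse worked: example.com's page title
print(soup.title.text)  # Example Domain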

In [11]:
# Let's read the text of the first <p> tag on the page.
print(soup.find('p'))
print(soup.find('p').text.split())


<p>This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.</p>
['This', 'domain', 'is', 'established', 'to', 'be', 'used', 'for', 'illustrative', 'examples', 'in', 'documents.', 'You', 'may', 'use', 'this', 'domain', 'in', 'examples', 'without', 'prior', 'coordination', 'or', 'asking', 'for', 'permission.']

In [8]:
# soup.p is shorthand for soup.find('p')
print(soup.p)


<p>This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.</p>

In [13]:
first_paragraph = soup.p
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()

# first_paragraph_id = soup.p['id']  # raises KeyError: this <p> has no 'id'
soup.p.get('id')                     # returns None instead of raising
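
In [ ]:
# dict-style access works fine when the attribute exists; example.com's
# first <a> tag does have an href (visible in the find_all output below):
print(soup.a['href'])      # http://www.iana.org/domains/example
print(soup.a.get('href'))  # same value, but returns None if missing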

In [14]:
all_paragraphs = soup.find_all('p')
print(all_paragraphs)


[<p>This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.</p>, <p><a href="http://www.iana.org/domains/example">More information...</a></p>]

In [15]:
# get all paragraphs with ids
ps_with_ids = [p for p in soup('p') if p.get('id')]
print(ps_with_ids)


[]
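
In [ ]:
# example.com has no paragraphs with ids, hence the empty list above.
# to show the filter doing something, run it on a made-up snippet
# (this tiny HTML string is invented purely for illustration):
demo = BeautifulSoup('<p id="intro">hi</p><p>no id</p>', 'html.parser')
print([p for p in demo('p') if p.get('id')])
# [<p id="intro">hi</p>]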

In [19]:
# three equivalent ways to find tags with a specific class
important_paragraphs = soup('p', {'class': 'important'})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p') if 'important' in p.get('class', [])]
print(important_paragraphs)


[]
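
In [ ]:
# likewise, nothing on example.com carries class 'important'.
# against a made-up snippet, all three lookups find the tag:
demo = BeautifulSoup('<p class="important">key</p><p>filler</p>', 'html.parser')
print(demo('p', {'class': 'important'}))
print(demo('p', 'important'))
print([p for p in demo('p') if 'important' in p.get('class', [])])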

In [23]:
# Okay, now let's use BeautifulSoup to parse a slightly more interesting webpage

url = 'http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page=1'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

tds = soup('td', 'thumbtext')
print(len(tds))


30
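
In [ ]:
# peek at the first cell's markup to see the structure that the
# helper functions below will rely on (truncated for readability)
print(tds[0].prettify()[:500])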

In [24]:
def is_video(td):
    """It's a video if it has exactly one price label and if the
    stripped text inside that price label starts with 'Video'"""
    pricelabels = td('span', 'pricelabel')
    return (len(pricelabels) == 1 and pricelabels[0].text.strip().startswith("Video"))


print(len([td for td in tds if not is_video(td)]))


24

In [35]:
import re
from time import sleep

def book_info(td):
    '''given a BeautifulSoup <td> Tag representing an O'Reilly book,
    extract the book's details and return a dict'''
    title = td.find('div', 'thumbheader').find('a').text
    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub('^By ', '', by_author).split(',')]
    # the ISBN is embedded in the book's link, e.g. /product/<isbn>.do
    isbn_link = td.find('div', 'thumbheader').find('a').get('href')
    isbn = re.match(r'/product/(.*)\.do', isbn_link).groups()[0]
    date = td.find('span', 'directorydate').text.strip()
    
    return {
        'title': title,
        'authors': authors,
        'isbn': isbn,
        'date': date
    }


base_url = 'http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page='
books = []
NUM_PAGES = 31

for page_num in range(1, NUM_PAGES + 1):
    print("souping page", page_num, ",", len(books), "found so far")
    url = base_url + str(page_num)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    
    for td in soup('td', 'thumbtext'):
        if not is_video(td):
            books.append(book_info(td))
    
    # be polite to the server: pause between page requests
    sleep(1)


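
In [ ]:
# a sketch of using the scraped data: count books per year.
# this assumes each 'date' field looks like "June 2014" (month then
# year), which is a guess about the site's format; the isdigit()
# guard skips any records that don't match it.
from collections import Counter

print(len(books), "books found")

year_counts = Counter(int(book['date'].split()[-1])
                      for book in books
                      if book['date'].split()[-1].isdigit())
print(year_counts)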

In [ ]: