BeautifulSoup documentation

Docs


In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [2]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.prettify())


<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>

In [3]:
soup.title
# <title>The Dormouse's story</title>


Out[3]:
<title>The Dormouse's story</title>

In [4]:
soup.title.name
# u'title'


Out[4]:
u'title'

In [5]:
soup.title.string
# u'The Dormouse's story'


Out[5]:
u"The Dormouse's story"

In [6]:
soup.title.parent.name
# u'head'


Out[6]:
u'head'

In [7]:
soup.p
# <p class="title"><b>The Dormouse's story</b></p>


Out[7]:
<p class="title"><b>The Dormouse's story</b></p>

In [8]:
soup.p['class']
# [u'title']  (the class attribute comes back as a list, not a bare string)


Out[8]:
[u'title']

In [9]:
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


Out[9]:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [10]:
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


Out[10]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [11]:
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


Out[11]:
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

Extract all URLs


In [12]:
for link in soup.find_all('a'):
    print(link.get('href'))


http://example.com/elsie
http://example.com/lacie
http://example.com/tillie

Extract all text


In [13]:
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...

Four kinds of Objects

  1. Tag
  2. NavigableString
  3. BeautifulSoup
  4. Comment

1. Tag


In [20]:
#soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)
# <class 'bs4.element.Tag'>


Out[20]:
bs4.element.Tag

In [21]:
tag.name


Out[21]:
u'b'

In [18]:
tag.name = "blockquote"
tag


Out[18]:
<blockquote class="boldest">Extremely bold</blockquote>

In [25]:
tag['id']


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-25-c68bda60b84a> in <module>()
----> 1 tag['id']

/Users/jarumugam/anaconda/envs/py27/lib/python2.7/site-packages/bs4/element.pyc in __getitem__(self, key)
    995         """tag[key] returns the value of the 'key' attribute for the tag,
    996         and throws an exception if it's not there."""
--> 997         return self.attrs[key]
    998 
    999     def __iter__(self):

KeyError: 'id'

In [26]:
tag.id

In [29]:
tag['class']


Out[29]:
[u'boldest']

In [27]:
tag.attrs


Out[27]:
{u'class': [u'boldest']}

In [30]:
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag
# Item assignment adds attributes; the existing class and the tag's text are kept:
# <b another-attribute="1" class="boldest" id="verybold">Extremely bold</b>


Out[30]:
<b another-attribute="1" class="boldest" id="verybold">Extremely bold</b>

In [31]:
del tag['id']
del tag['another-attribute']
tag
# Only the two deleted attributes disappear; class and text remain:
# <b class="boldest">Extremely bold</b>


Out[31]:
<b class="boldest">Extremely bold</b>

In [33]:
tag['id']
# KeyError: 'id'


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-33-edca917d4588> in <module>()
----> 1 tag['id']
      2 # KeyError: 'id'

/Users/jarumugam/anaconda/envs/py27/lib/python2.7/site-packages/bs4/element.pyc in __getitem__(self, key)
    995         """tag[key] returns the value of the 'key' attribute for the tag,
    996         and throws an exception if it's not there."""
--> 997         return self.attrs[key]
    998 
    999     def __iter__(self):

KeyError: 'id'

In [34]:
print(tag.get('id'))
# None


None

In [35]:
css_soup = BeautifulSoup('<p class="body"></p>')
css_soup.p['class']
# ["body"]


Out[35]:
['body']

In [36]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.p['class']
# ["body", "strikeout"]


Out[36]:
['body', 'strikeout']

In [37]:
id_soup = BeautifulSoup('<p id="my id"></p>')
id_soup.p['id']


Out[37]:
'my id'

In [38]:
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
rel_soup.a['rel']
# ['index']


Out[38]:
['index']

In [39]:
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
# <p>Back to the <a rel="index contents">homepage</a></p>


<p>Back to the <a rel="index contents">homepage</a></p>

In [40]:
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']
# u'body strikeout'


Out[40]:
u'body strikeout'

2. NavigableString


In [42]:
tag.string
# u'Extremely bold'


Out[42]:
u'Extremely bold'

In [43]:
type(tag.string)
# <class 'bs4.element.NavigableString'>


Out[43]:
bs4.element.NavigableString

In [44]:
unicode_string = unicode(tag.string)
unicode_string
# u'Extremely bold'


Out[44]:
u'Extremely bold'

In [45]:
type(unicode_string)
# <type 'unicode'>


Out[45]:
unicode

In [46]:
tag.string.replace_with("No longer bold")
tag
# <blockquote>No longer bold</blockquote>


Out[46]:
<b class="boldest">No longer bold</b>

3. BeautifulSoup

The BeautifulSoup object can be treated as a Tag object for most purposes.


In [48]:
soup.name


Out[48]:
u'[document]'

4. Comments

A Comment is a special type of NavigableString.


In [50]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup)
comment = soup.b.string
type(comment)
# <class 'bs4.element.Comment'>


Out[50]:
bs4.element.Comment

In [51]:
comment


Out[51]:
u'Hey, buddy. Want to buy a used parser?'

In [52]:
print(soup.b.prettify())


<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>

In [53]:
from bs4 import CData
cdata = CData("A CDATA block")
comment.replace_with(cdata)

print(soup.b.prettify())
# <b>
#  <![CDATA[A CDATA block]]>
# </b>


<b>
 <![CDATA[A CDATA block]]>
</b>

In [54]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [55]:
soup.head
# <head><title>The Dormouse's story</title></head>


Out[55]:
<head><title>The Dormouse's story</title></head>

In [56]:
soup.title
# <title>The Dormouse's story</title>


Out[56]:
<title>The Dormouse's story</title>

In [57]:
soup.body.b
# <b>The Dormouse's story</b>


Out[57]:
<b>The Dormouse's story</b>

In [58]:
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


Out[58]:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [59]:
soup.find_all('a')


Out[59]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [60]:
head_tag = soup.head
head_tag
# <head><title>The Dormouse's story</title></head>


Out[60]:
<head><title>The Dormouse's story</title></head>

Contents, children, and descendants


In [62]:
head_tag.contents
#[<title>The Dormouse's story</title>]


Out[62]:
[<title>The Dormouse's story</title>]

In [63]:
title_tag = head_tag.contents[0]
title_tag
# <title>The Dormouse's story</title>


Out[63]:
<title>The Dormouse's story</title>

In [64]:
title_tag.contents
# [u'The Dormouse's story']


Out[64]:
[u"The Dormouse's story"]

In [65]:
# This soup was built with html.parser from a document that starts with a
# newline, so the first top-level node is that newline string, not <html>.
print(len(soup.contents))
# 2
print(soup.contents[0].name)
# None


2
None

A string does not have .contents


In [66]:
text = title_tag.contents[0]
text.contents
# AttributeError: 'NavigableString' object has no attribute 'contents'


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-66-529715b2710e> in <module>()
      1 text = title_tag.contents[0]
----> 2 text.contents
      3 # AttributeError: 'NavigableString' object has no attribute 'contents'

/Users/jarumugam/anaconda/envs/py27/lib/python2.7/site-packages/bs4/element.pyc in __getattr__(self, attr)
    728             raise AttributeError(
    729                 "'%s' object has no attribute '%s'" % (
--> 730                     self.__class__.__name__, attr))
    731 
    732     def output_ready(self, formatter="minimal"):

AttributeError: 'NavigableString' object has no attribute 'contents'

In [67]:
for child in title_tag.children:
    print(child)


The Dormouse's story

In [68]:
head_tag.contents


Out[68]:
[<title>The Dormouse's story</title>]

In [69]:
for child in head_tag.descendants:
    print(child)


<title>The Dormouse's story</title>
The Dormouse's story

In [70]:
# .children yields only direct children; .descendants walks the whole tree.
print(len(list(soup.children)))
# 2
print(len(list(soup.descendants)))
# 27


2
27

In [71]:
title_tag.string


Out[71]:
u"The Dormouse's story"

In [72]:
head_tag.contents
# [<title>The Dormouse's story</title>]

head_tag.string
# u'The Dormouse's story'


Out[72]:
u"The Dormouse's story"

If a tag’s only child is another tag, and that tag has a .string, then the parent tag is considered to have the same .string as its child


In [73]:
head_tag.contents
# [<title>The Dormouse's story</title>]

head_tag.string
# u'The Dormouse's story'


Out[73]:
u"The Dormouse's story"

If a tag contains more than one thing, then it’s not clear what .string should refer to, so .string is defined to be None:


In [74]:
print(soup.html.string)


None

In [75]:
for string in soup.strings:
    print(repr(string))


u'\n'
u"The Dormouse's story"
u'\n'
u'\n'
u"The Dormouse's story"
u'\n'
u'Once upon a time there were three little sisters; and their names were\n'
u'Elsie'
u',\n'
u'Lacie'
u' and\n'
u'Tillie'
u';\nand they lived at the bottom of a well.'
u'\n'
u'...'
u'\n'

Removing white space


In [77]:
for string in soup.stripped_strings:
    print(repr(string))


u"The Dormouse's story"
u"The Dormouse's story"
u'Once upon a time there were three little sisters; and their names were'
u'Elsie'
u','
u'Lacie'
u'and'
u'Tillie'
u';\nand they lived at the bottom of a well.'
u'...'

Parents and siblings


In [78]:
title_tag = soup.title
title_tag
# <title>The Dormouse's story</title>


Out[78]:
<title>The Dormouse's story</title>

In [79]:
title_tag.parent
# <head><title>The Dormouse's story</title></head>


Out[79]:
<head><title>The Dormouse's story</title></head>

In [80]:
title_tag.string.parent


Out[80]:
<title>The Dormouse's story</title>

In [81]:
html_tag = soup.html
type(html_tag.parent)


Out[81]:
bs4.BeautifulSoup

In [82]:
print(soup.parent)


None

In [85]:
link = soup.a
link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


Out[85]:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [86]:
for parent in link.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)


p
body
html
[document]

In [87]:
# The parser is named explicitly because the <html><body> wrapper in the
# output below is added by lxml; the markup no longer carries a stray </b>.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", "lxml")
print(sibling_soup.prettify())


<html>
 <body>
  <a>
   <b>
    text1
   </b>
   <c>
    text2
   </c>
  </a>
 </body>
</html>

In [88]:
sibling_soup.b.next_sibling
# <c>text2</c>


Out[88]:
<c>text2</c>

In [89]:
sibling_soup.c.previous_sibling
# <b>text1</b>


Out[89]:
<b>text1</b>

In [90]:
print(sibling_soup.b.previous_sibling)
# None


None

In [91]:
print(sibling_soup.c.next_sibling)
# None


None

In [92]:
sibling_soup.b.string
# u'text1'


Out[92]:
u'text1'

In [93]:
print(sibling_soup.b.string.next_sibling)
# None


None

In [94]:
link = soup.a
link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


Out[94]:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [95]:
link.next_sibling
# u',\n'


Out[95]:
u',\n'

In [96]:
link.next_sibling.next_sibling


Out[96]:
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [97]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))


u',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
u' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
u';\nand they lived at the bottom of a well.'

In [98]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))


u' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
u',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
u'Once upon a time there were three little sisters; and their names were\n'

Going back and forth


In [99]:
last_a_tag = soup.find("a", id="link3")
last_a_tag
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


Out[99]:
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [100]:
last_a_tag.next_sibling
# u';\nand they lived at the bottom of a well.'


Out[100]:
u';\nand they lived at the bottom of a well.'

In [101]:
last_a_tag.next_element


Out[101]:
u'Tillie'

In [102]:
last_a_tag.previous_element
# u' and\n'


Out[102]:
u' and\n'

In [103]:
last_a_tag.previous_element.next_element
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


Out[103]:
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [104]:
for element in last_a_tag.next_elements:
    print(repr(element))


u'Tillie'
u';\nand they lived at the bottom of a well.'
u'\n'
<p class="story">...</p>
u'...'
u'\n'

Searching a Tree


In [105]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [106]:
soup.find_all('b')


Out[106]:
[<b>The Dormouse's story</b>]

In [107]:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)


body
b

In [110]:
for tag in soup.find_all(re.compile("t")):
    print(tag.name)


html
title

In [111]:
soup.find_all(["a", "b"])


Out[111]:
[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [112]:
for tag in soup.find_all(True):
    print(tag.name)


html
head
title
body
p
b
p
a
a
a
p

Function


In [113]:
def has_class_but_no_id(tag):
    """Filter for find_all(): accept tags that define 'class' but not 'id'."""
    carries_class = tag.has_attr('class')
    if not carries_class:
        return carries_class
    return not tag.has_attr('id')

In [116]:
print(soup.find_all(has_class_but_no_id))


[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were\n<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,\n<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</p>, <p class="story">...</p>]

In [115]:
def not_lacie(href):
    """Attribute filter: accept a non-empty href that does not contain "lacie".

    A missing or empty href is handed back unchanged (a falsy value), so the
    attribute filter rejects tags without a usable href.
    """
    if not href:
        return href
    return re.search("lacie", href) is None
soup.find_all(href=not_lacie)


Out[115]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [124]:
from bs4 import element

In [125]:
def surrounded_by_strings(tag):
    """Filter for find_all(): accept tags whose next and previous elements are both strings."""
    string_type = element.NavigableString
    if not isinstance(tag.next_element, string_type):
        return False
    return isinstance(tag.previous_element, string_type)

for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)


body
p
a
a
a
p

In [126]:
soup.find_all("title")


Out[126]:
[<title>The Dormouse's story</title>]

In [127]:
soup.find_all("p", "title")


Out[127]:
[<p class="title"><b>The Dormouse's story</b></p>]

In [128]:
soup.find_all("a")


Out[128]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [129]:
soup.find_all(id="link2")


Out[129]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [130]:
import re
soup.find(string=re.compile("sisters"))
# u'Once upon a time there were three little sisters; and their names were\n'


Out[130]:
u'Once upon a time there were three little sisters; and their names were\n'

In [131]:
soup.find_all("title")


Out[131]:
[<title>The Dormouse's story</title>]

In [132]:
soup.find_all(id='link2')


Out[132]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [133]:
soup.find_all(href=re.compile("elsie"))


Out[133]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

You can filter an attribute based on a string, a regular expression, a list, a function, or the value True.


In [134]:
soup.find_all(id=True)


Out[134]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [135]:
soup.find_all(href=re.compile("elsie"), id='link1')


Out[135]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [136]:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(data-foo="value")


  File "<ipython-input-136-cda15f701e36>", line 2
    data_soup.find_all(data-foo="value")
SyntaxError: keyword can't be an expression

In [138]:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(attrs={"data-foo": "value"})


Out[138]:
[<div data-foo="value">foo!</div>]

Searching by CSS Class


In [140]:
soup.find_all("a", class="sister")


  File "<ipython-input-140-5f358b5377cf>", line 1
    soup.find_all("a", class="sister")
                           ^
SyntaxError: invalid syntax

In [141]:
soup.find_all("a", class_="sister")


Out[141]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [142]:
soup.find_all(class_=re.compile("itl"))


Out[142]:
[<p class="title"><b>The Dormouse's story</b></p>]

In [143]:
def has_six_characters(css_class):
    """Class filter: accept a CSS class name that is exactly six characters long."""
    if css_class is None:
        return False
    return len(css_class) == 6

soup.find_all(class_=has_six_characters)


Out[143]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [144]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.find_all("p", class_="strikeout")
# [<p class="body strikeout"></p>]


Out[144]:
[<p class="body strikeout"></p>]

In [145]:
css_soup.find_all("p", class_="body")


Out[145]:
[<p class="body strikeout"></p>]

In [146]:
css_soup.find_all("p", class_="body strikeout")


Out[146]:
[<p class="body strikeout"></p>]

In [147]:
css_soup.find_all("p", class_="strikeout body")


Out[147]:
[]

In [148]:
css_soup.select("p.strikeout.body")


Out[148]:
[<p class="body strikeout"></p>]

In [149]:
soup.find_all("a", attrs={"class": "sister"})


Out[149]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [150]:
soup.find_all(string="Elsie")
# [u'Elsie']


Out[150]:
[u'Elsie']

In [151]:
soup.find_all(string=["Tillie", "Elsie", "Lacie"])
# [u'Elsie', u'Lacie', u'Tillie']


Out[151]:
[u'Elsie', u'Lacie', u'Tillie']

In [152]:
soup.find_all(string=re.compile("Dormouse"))
# [u"The Dormouse's story", u"The Dormouse's story"]


Out[152]:
[u"The Dormouse's story", u"The Dormouse's story"]

In [153]:
def is_the_only_string_within_a_tag(s):
    """String filter: accept a string that equals its parent's .string value."""
    parent_string = s.parent.string
    return s == parent_string

soup.find_all(string=is_the_only_string_within_a_tag)
# [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']


Out[153]:
[u"The Dormouse's story",
 u"The Dormouse's story",
 u'Elsie',
 u'Lacie',
 u'Tillie',
 u'...']

In [154]:
soup.find_all("a", string="Elsie")


Out[154]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [155]:
soup.find_all("a", text="Elsie")


Out[155]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [156]:
soup.find_all("a", limit=2)


Out[156]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [157]:
soup.html.find_all("title")
# [<title>The Dormouse's story</title>]


Out[157]:
[<title>The Dormouse's story</title>]

In [158]:
soup.html.find_all("title", recursive=False)
# []


Out[158]:
[]

In [159]:
soup.find_all("a")


Out[159]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [160]:
soup("a")


Out[160]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [161]:
soup.find_all('title', limit=1)
# [<title>The Dormouse's story</title>]


Out[161]:
[<title>The Dormouse's story</title>]

In [162]:
soup.find('title')
# <title>The Dormouse's story</title>


Out[162]:
<title>The Dormouse's story</title>

In [163]:
print(soup.find("nosuchtag"))


None

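The only difference between find_all(..., limit=1) and find() is that find_all() returns a list containing the single result, while find() returns the result itself. When nothing matches, find_all() returns an empty list and find() returns None.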


In [164]:
soup.head.title
# <title>The Dormouse's story</title>


Out[164]:
<title>The Dormouse's story</title>

In [165]:
soup.find("head").find("title")
# <title>The Dormouse's story</title>


Out[165]:
<title>The Dormouse's story</title>

In [166]:
a_string = soup.find(string="Lacie")
a_string
# u'Lacie'


Out[166]:
u'Lacie'

In [167]:
a_string.find_parents("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


Out[167]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [168]:
a_string.find_parent("p")
# <p class="story">Once upon a time there were three little sisters; and their names were
#  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#  and they lived at the bottom of a well.</p>


Out[168]:
<p class="story">Once upon a time there were three little sisters; and their names were\n<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,\n<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</p>

In [170]:
a_string.find_parents("p", class_="title")
# []


Out[170]:
[]

In [171]:
first_link = soup.a
first_link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


Out[171]:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [172]:
first_link.find_next_siblings("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


Out[172]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [174]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_next_sibling("p")


Out[174]:
<p class="story">...</p>

In [175]:
last_link = soup.find("a", id="link3")
last_link


Out[175]:
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [176]:
last_link.find_previous_siblings("a")


Out[176]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [177]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")


Out[177]:
<p class="title"><b>The Dormouse's story</b></p>

In [178]:
first_link = soup.a
first_link


Out[178]:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [179]:
first_link.find_all_next(string=True)


Out[179]:
[u'Elsie',
 u',\n',
 u'Lacie',
 u' and\n',
 u'Tillie',
 u';\nand they lived at the bottom of a well.',
 u'\n',
 u'...',
 u'\n']

In [180]:
first_link.find_next("p")


Out[180]:
<p class="story">...</p>

In [181]:
first_link = soup.a
first_link


Out[181]:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [182]:
first_link.find_all_previous("p")


Out[182]:
[<p class="story">Once upon a time there were three little sisters; and their names were\n<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,\n<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</p>,
 <p class="title"><b>The Dormouse's story</b></p>]

In [183]:
first_link.find_previous("title")


Out[183]:
<title>The Dormouse's story</title>

In [184]:
soup.select("title")
# [<title>The Dormouse's story</title>]


Out[184]:
[<title>The Dormouse's story</title>]

In [185]:
soup.select("p:nth-of-type(3)")


Out[185]:
[<p class="story">...</p>]

In [186]:
soup.select("body a")


Out[186]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [187]:
soup.select("html head title")


Out[187]:
[<title>The Dormouse's story</title>]

In [188]:
soup.select("head > title")


Out[188]:
[<title>The Dormouse's story</title>]

In [189]:
soup.select("p > a")


Out[189]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [190]:
soup.select("p > a:nth-of-type(2)")


Out[190]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [191]:
soup.select("p > #link1")


Out[191]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [192]:
soup.select("body > a")


Out[192]:
[]

In [193]:
soup.select("#link1 ~ .sister")


Out[193]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [194]:
soup.select("#link1 + .sister")


Out[194]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [195]:
soup.select(".sister")


Out[195]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [196]:
soup.select("[class~=sister]")


Out[196]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [197]:
soup.select("#link1")


Out[197]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [198]:
soup.select("a#link2")


Out[198]:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [199]:
soup.select('a[href]')


Out[199]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [200]:
soup.select('a[href="http://example.com/elsie"]')


Out[200]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [201]:
soup.select('a[href^="http://example.com/"]')


Out[201]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [202]:
soup.select('a[href$="tillie"]')


Out[202]:
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [203]:
soup.select('a[href*=".com/el"]')


Out[203]:
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [204]:
multilingual_markup = """
 <p lang="en">Hello</p>
 <p lang="en-us">Howdy, y'all</p>
 <p lang="en-gb">Pip-pip, old fruit</p>
 <p lang="fr">Bonjour mes amis</p>
"""
multilingual_soup = BeautifulSoup(multilingual_markup)
multilingual_soup.select('p[lang|=en]')
# [<p lang="en">Hello</p>,
#  <p lang="en-us">Howdy, y'all</p>,
#  <p lang="en-gb">Pip-pip, old fruit</p>]


Out[204]:
[<p lang="en">Hello</p>,
 <p lang="en-us">Howdy, y'all</p>,
 <p lang="en-gb">Pip-pip, old fruit</p>]

In [205]:
soup.select_one(".sister")


Out[205]:
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [207]:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b

tag.name = "blockquote"
tag['class'] = 'verybold'
tag['id'] = 1
tag
# <blockquote class="verybold" id="1">Extremely bold</blockquote>


Out[207]:
<blockquote class="verybold" id="1">Extremely bold</blockquote>

In [208]:
del tag['class']
del tag['id']
tag


Out[208]:
<blockquote>Extremely bold</blockquote>

Be careful when assigning to .string: if the tag contained other tags, they and all their contents will be destroyed.


In [209]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)

tag = soup.a
tag.string = "New link text."
tag
# The nested <i> tag is gone; only the new string remains:
# <a href="http://example.com/">New link text.</a>


Out[209]:
<a href="http://example.com/">New link text.</a>

In [210]:
soup = BeautifulSoup("<a>Foo</a>")
soup.a.append("Bar")

soup
# <html><body><a>FooBar</a></body></html>


Out[210]:
<html><body><a>FooBar</a></body></html>

In [211]:
soup.a.contents


Out[211]:
[u'Foo', u'Bar']

In [214]:
soup = BeautifulSoup("<b></b>")
tag = soup.b
tag.append("Hello")
new_string = element.NavigableString(" there")
tag.append(new_string)
tag
# <b>Hello there</b>


Out[214]:
<b>Hello there</b>

In [215]:
tag.contents
# [u'Hello', u' there']


Out[215]:
[u'Hello', u' there']

In [216]:
from bs4 import Comment
new_comment = Comment("Nice to see you.")
tag.append(new_comment)
tag
# <b>Hello there<!--Nice to see you.--></b>


Out[216]:
<b>Hello there<!--Nice to see you.--></b>

In [217]:
tag.contents
# [u'Hello', u' there', u'Nice to see you.']


Out[217]:
[u'Hello', u' there', u'Nice to see you.']

In [218]:
soup = BeautifulSoup("<b></b>")
original_tag = soup.b

new_tag = soup.new_tag("a", href="http://www.example.com")
original_tag.append(new_tag)
original_tag
# <b><a href="http://www.example.com"></a></b>


Out[218]:
<b><a href="http://www.example.com"></a></b>

In [219]:
new_tag.string = "Link text."
original_tag
# <b><a href="http://www.example.com">Link text.</a></b>


Out[219]:
<b><a href="http://www.example.com">Link text.</a></b>

In [220]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
tag = soup.a

tag.insert(1, "but did not endorse ")
tag
# <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>


Out[220]:
<a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>

In [221]:
tag.contents
# [u'I linked to ', u'but did not endorse ', <i>example.com</i>]


Out[221]:
[u'I linked to ', u'but did not endorse ', <i>example.com</i>]

In [222]:
soup = BeautifulSoup("<b>stop</b>")
tag = soup.new_tag("i")
tag.string = "Don't"
soup.b.string.insert_before(tag)
soup.b
# <b><i>Don't</i>stop</b>


Out[222]:
<b><i>Don't</i>stop</b>

In [223]:
soup.b.i.insert_after(soup.new_string(" ever "))
soup.b
# <b><i>Don't</i> ever stop</b>


Out[223]:
<b><i>Don't</i> ever stop</b>

In [224]:
soup.b.contents
# [<i>Don't</i>, u' ever ', u'stop']


Out[224]:
[<i>Don't</i>, u' ever ', u'stop']

In [225]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
tag = soup.a

tag.clear()
tag
# <a href="http://example.com/"></a>


Out[225]:
<a href="http://example.com/"></a>

In [226]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a

# extract() removes the <i> tag from the tree and returns it.
i_tag = soup.i.extract()

a_tag
# <a href="http://example.com/">I linked to </a>


Out[226]:
<a href="http://example.com/">I linked to </a>

In [227]:
i_tag
# <i>example.com</i>


Out[227]:
<i>example.com</i>

In [228]:
# The extracted tag has no parent.
print(i_tag.parent)
# None


None

In [229]:
my_string = i_tag.string.extract()
my_string
# u'example.com'


Out[229]:
u'example.com'

In [230]:
print(my_string.parent)
# None
i_tag
# <i></i>


None
Out[230]:
<i></i>

In [231]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a

# decompose() removes the <i> tag from the tree; nothing is kept here.
soup.i.decompose()

a_tag
# <a href="http://example.com/">I linked to </a>


Out[231]:
<a href="http://example.com/">I linked to </a>

In [232]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a

new_tag = soup.new_tag("b")
new_tag.string = "example.net"
a_tag.i.replace_with(new_tag)

a_tag
# <a href="http://example.com/">I linked to <b>example.net</b></a>


Out[232]:
<a href="http://example.com/">I linked to <b>example.net</b></a>

In [235]:
soup = BeautifulSoup("<p>I wish I was bold.</p>")
soup.p.string.wrap(soup.new_tag("b"))
# <b>I wish I was bold.</b>


Out[235]:
<b>I wish I was bold.</b>

In [238]:
soup.p.wrap(soup.new_tag("div"))


Out[238]:
<div><p><b>I wish I was bold.</b></p></div>

In [239]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a

a_tag.i.unwrap()
a_tag


Out[239]:
<a href="http://example.com/">I linked to example.com</a>

In [240]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
soup.prettify()
# '<html>\n <body>\n  <a href="http://example.com/">\n...'

print(soup.prettify())


<html>
 <body>
  <a href="http://example.com/">
   I linked to
   <i>
    example.com
   </i>
  </a>
 </body>
</html>

In [241]:
print(soup.a.prettify())


<a href="http://example.com/">
 I linked to
 <i>
  example.com
 </i>
</a>

In [242]:
str(soup)
# '<html><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'


Out[242]:
'<html><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'

In [243]:
unicode(soup.a)


Out[243]:
u'<a href="http://example.com/">I linked to <i>example.com</i></a>'

In [244]:
soup = BeautifulSoup("&ldquo;Dammit!&rdquo; he said.")
unicode(soup)


Out[244]:
u'<html><body><p>\u201cDammit!\u201d he said.</p></body></html>'

In [245]:
str(soup)


Out[245]:
'<html><body><p>\xe2\x80\x9cDammit!\xe2\x80\x9d he said.</p></body></html>'

In [246]:
soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>")
soup.p


Out[246]:
<p>The law firm of Dewey, Cheatem, &amp; Howe</p>

In [247]:
soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
soup.a


Out[247]:
<a href="http://example.com/?foo=val1&amp;bar=val2">A link</a>

In [248]:
french = "<p>Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;</p>"
soup = BeautifulSoup(french)
print(soup.prettify(formatter="minimal"))


<html>
 <body>
  <p>
   Il a dit &lt;&lt;Sacré bleu!&gt;&gt;
  </p>
 </body>
</html>

In [249]:
print(soup.prettify(formatter="html"))


<html>
 <body>
  <p>
   Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
  </p>
 </body>
</html>

In [250]:
print(soup.prettify(formatter=None))


<html>
 <body>
  <p>
   Il a dit <<Sacré bleu!>>
  </p>
 </body>
</html>

In [251]:
link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
print(link_soup.a.encode(formatter=None))


<a href="http://example.com/?foo=val1&bar=val2">A link</a>

In [252]:
def uppercase(str):
    """Output formatter: return the given text converted to upper case."""
    # The parameter keeps its original name (it shadows the builtin) so that
    # the function's signature stays exactly as callers know it.
    shouted = str.upper()
    return shouted

print(soup.prettify(formatter=uppercase))


<html>
 <body>
  <p>
   IL A DIT <<SACRÉ BLEU!>>
  </p>
 </body>
</html>

In [253]:
print(link_soup.a.prettify(formatter=uppercase))


<a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2">
 A LINK
</a>

In [254]:
from bs4.dammit import EntitySubstitution
def uppercase_and_substitute_html_entities(str):
    """Output formatter: upper-case the text, then run it through
    EntitySubstitution.substitute_html."""
    shouted = str.upper()
    return EntitySubstitution.substitute_html(shouted)

print(soup.prettify(formatter=uppercase_and_substitute_html_entities))


<html>
 <body>
  <p>
   IL A DIT &lt;&lt;SACR&Eacute; BLEU!&gt;&gt;
  </p>
 </body>
</html>

In [255]:
from bs4.element import CData
soup = BeautifulSoup("<a></a>")
soup.a.string = CData("one < three")
print(soup.a.prettify(formatter="xml"))


<a>
 <![CDATA[one < three]]>
</a>

In [256]:
markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
soup = BeautifulSoup(markup)

soup.get_text()


Out[256]:
u'\nI linked to example.com\n'

In [258]:
soup.i.get_text()


Out[258]:
u'example.com'

In [259]:
soup.get_text("|")


Out[259]:
u'\nI linked to |example.com|\n'

In [260]:
soup.get_text("|", strip=True)


Out[260]:
u'I linked to|example.com'

In [261]:
[text for text in soup.stripped_strings]


Out[261]:
[u'I linked to', u'example.com']

Parser


In [262]:
BeautifulSoup("<a><b /></a>")


Out[262]:
<html><body><a><b></b></a></body></html>

In [263]:
BeautifulSoup("<a><b /></a>", "xml")


Out[263]:
<?xml version="1.0" encoding="unicode-escape"?>\n<a><b/></a>

In [264]:
BeautifulSoup("<a></p>", "lxml")


Out[264]:
<html><body><a></a></body></html>

In [266]:
#BeautifulSoup("<a></p>", "html5lib")

In [267]:
BeautifulSoup("<a></p>", "html.parser")


Out[267]:
<a></a>

Encodings


In [268]:
markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
soup = BeautifulSoup(markup)
soup.h1


Out[268]:
<h1>Sacr\xe9 bleu!</h1>

In [269]:
# <h1>Sacré bleu!</h1>
soup.h1.string


Out[269]:
u'Sacr\xe9 bleu!'

In [270]:
soup.original_encoding


Out[270]:
'utf-8'

In [271]:
markup = b"<h1>\xed\xe5\xec\xf9</h1>"
soup = BeautifulSoup(markup)
soup.h1


Out[271]:
<h1>\u03bd\u03b5\u03bc\u03c9</h1>

In [272]:
soup.original_encoding


Out[272]:
'ISO-8859-7'

In [273]:
soup = BeautifulSoup(markup, from_encoding="iso-8859-8")
soup.h1


Out[273]:
<h1>\u05dd\u05d5\u05dc\u05e9</h1>

In [274]:
soup.original_encoding


Out[274]:
'iso-8859-8'

In [275]:
soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
soup.h1


Out[275]:
<h1>\xed\xe5\xec\xf9</h1>

In [276]:
soup.original_encoding


Out[276]:
'windows-1252'

In [277]:
markup = b'''
 <html>
  <head>
   <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
  </head>
  <body>
   <p>Sacr\xe9 bleu!</p>
  </body>
 </html>
'''

soup = BeautifulSoup(markup)
print(soup.prettify())


<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
 </head>
 <body>
  <p>
   Sacré bleu!
  </p>
 </body>
</html>


In [278]:
print(soup.prettify("latin-1"))


<html>
 <head>
  <meta content="text/html; charset=latin-1" http-equiv="Content-type"/>
 </head>
 <body>
  <p>
   Sacr� bleu!
  </p>
 </body>
</html>


In [279]:
soup.p.encode("latin-1")


Out[279]:
'<p>Sacr\xe9 bleu!</p>'

In [280]:
soup.p.encode("utf-8")


Out[280]:
'<p>Sacr\xc3\xa9 bleu!</p>'

In [281]:
markup = u"<b>\N{SNOWMAN}</b>"
snowman_soup = BeautifulSoup(markup)
tag = snowman_soup.b

In [282]:
print(tag.encode("utf-8"))


<b>☃</b>

In [283]:
# The snowman cannot be encoded as latin-1, so it comes out as a numeric
# character reference.
print(tag.encode("latin-1"))


<b>&#9731;</b>

In [284]:
# Same for ASCII: the snowman is emitted as a numeric character reference.
print(tag.encode("ascii"))


<b>&#9731;</b>

In [285]:
from bs4 import UnicodeDammit
dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
print(dammit.unicode_markup)
# Sacré bleu!


Sacré bleu!

In [286]:
dammit.original_encoding


Out[286]:
'utf-8'

In [287]:
dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
print(dammit.unicode_markup)
# Sacré bleu!


Sacré bleu!

In [288]:
dammit.original_encoding
# 'latin-1'


Out[288]:
'latin-1'

In [289]:
markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"

UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
# u'<p>I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes</p>'


Out[289]:
u'<p>I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes</p>'

In [290]:
UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
# u'<p>I just &#x201C;love&#x201D; Microsoft Word&#x2019;s smart quotes</p>'


Out[290]:
u'<p>I just &#x201C;love&#x201D; Microsoft Word&#x2019;s smart quotes</p>'

In [291]:
UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup


Out[291]:
u'<p>I just "love" Microsoft Word\'s smart quotes</p>'

In [292]:
UnicodeDammit(markup, ["windows-1252"]).unicode_markup


Out[292]:
u'<p>I just \u201clove\u201d Microsoft Word\u2019s smart quotes</p>'

In [293]:
snowmen = (u"\N{SNOWMAN}" * 3)
quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
doc = snowmen.encode("utf8") + quote.encode("windows_1252")

In [294]:
print(doc)


☃☃☃�I like snowmen!�

In [295]:
print(doc.decode("windows-1252"))


☃☃☃“I like snowmen!”

In [296]:
new_doc = UnicodeDammit.detwingle(doc)
print(new_doc.decode("utf8"))


☃☃☃“I like snowmen!”

Object Comparison


In [297]:
markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>"
soup = BeautifulSoup(markup, 'html.parser')
first_b, second_b = soup.find_all('b')
# Two separate tags with the same markup compare equal.
print(first_b == second_b)


True

In [298]:
# The elements preceding each <b> are different strings, so they are not equal.
print(first_b.previous_element == second_b.previous_element)


False

In [299]:
# Equal markup does not make them the same object.
print(first_b is second_b)


False

In [300]:
import copy
# copy.copy() on a tag yields a separate object with the same markup.
p_copy = copy.copy(soup.p)
print(p_copy)


<p>I want <b>pizza</b> and more <b>pizza</b>!</p>

In [302]:
# The copy is equal to the original but is a distinct object.
print(soup.p == p_copy)
print(soup.p is p_copy)


True
False

In [303]:
# The copy is not attached to any tree.
print(p_copy.parent)


None

In [304]:
from bs4 import SoupStrainer

only_a_tags = SoupStrainer("a")

only_tags_with_id_link2 = SoupStrainer(id="link2")

def is_short_string(string):
    """Return True when ``string`` is shorter than 10 characters.

    ``None`` counts as "not a short string" instead of raising TypeError
    from ``len(None)``.

    NOTE(review): presumably bs4 can hand a filter callable ``None`` for a
    tag that has no single string child -- confirm against the installed
    bs4 version.
    """
    return string is not None and len(string) < 10

only_short_strings = SoupStrainer(string=is_short_string)

In [305]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())


<a class="sister" href="http://example.com/elsie" id="link1">
 Elsie
</a>
<a class="sister" href="http://example.com/lacie" id="link2">
 Lacie
</a>
<a class="sister" href="http://example.com/tillie" id="link3">
 Tillie
</a>

In [306]:
print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())


<a class="sister" href="http://example.com/lacie" id="link2">
 Lacie
</a>

In [308]:
#print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())

In [310]:
#soup = BeautifulSoup(html_doc)
#soup.find_all(only_short_strings)

Further diagnostics possible!