In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
In [2]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
In [3]:
soup.title
# <title>The Dormouse's story</title>
Out[3]:
In [4]:
soup.title.name
# u'title'
Out[4]:
In [5]:
soup.title.string
# u'The Dormouse's story'
Out[5]:
In [6]:
soup.title.parent.name
# u'head'
Out[6]:
In [7]:
soup.p
# <p class="title"><b>The Dormouse's story</b></p>
Out[7]:
In [8]:
soup.p['class']
# ['title']
Out[8]:
In [9]:
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[9]:
In [10]:
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
Out[10]:
In [11]:
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Out[11]:
In [12]:
for link in soup.find_all('a'):
    print(link.get('href'))
In [13]:
print(soup.get_text())
In [20]:
#soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)
# <class 'bs4.element.Tag'>
Out[20]:
In [21]:
tag.name
Out[21]:
In [18]:
tag.name = "blockquote"
tag
Out[18]:
In [25]:
tag['id']
In [26]:
tag.id
In [29]:
tag['class']
Out[29]:
In [27]:
tag.attrs
Out[27]:
In [30]:
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag
# <b another-attribute="1" id="verybold"></b>
Out[30]:
In [31]:
del tag['id']
del tag['another-attribute']
tag
# <b></b>
Out[31]:
In [33]:
tag['id']
# KeyError: 'id'
In [34]:
print(tag.get('id'))
# None
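As with a Python dictionary, .get() also accepts an optional default to return when the attribute is missing; a quick sketch (the 'no-id' value here is just a placeholder):
tag.get('id', 'no-id')
# 'no-id'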
In [35]:
css_soup = BeautifulSoup('<p class="body"></p>')
css_soup.p['class']
# ["body"]
Out[35]:
In [36]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.p['class']
# ["body", "strikeout"]
Out[36]:
In [37]:
id_soup = BeautifulSoup('<p id="my id"></p>')
id_soup.p['id']
Out[37]:
In [38]:
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
rel_soup.a['rel']
# ['index']
Out[38]:
In [39]:
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
# <p>Back to the <a rel="index contents">homepage</a></p>
In [40]:
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']
# u'body strikeout'
Out[40]:
In [42]:
tag.string
# u'Extremely bold'
Out[42]:
In [43]:
type(tag.string)
# <class 'bs4.element.NavigableString'>
Out[43]:
In [44]:
unicode_string = str(tag.string)
unicode_string
# 'Extremely bold'
Out[44]:
In [45]:
type(unicode_string)
# <class 'str'>
Out[45]:
In [46]:
tag.string.replace_with("No longer bold")
tag
# <blockquote>No longer bold</blockquote>
Out[46]:
In [48]:
soup.name
Out[48]:
In [50]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup)
comment = soup.b.string
type(comment)
# <class 'bs4.element.Comment'>
Out[50]:
In [51]:
comment
Out[51]:
In [52]:
print(soup.b.prettify())
In [53]:
from bs4 import CData
cdata = CData("A CDATA block")
comment.replace_with(cdata)
print(soup.b.prettify())
# <b>
# <![CDATA[A CDATA block]]>
# </b>
In [54]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
In [55]:
soup.head
# <head><title>The Dormouse's story</title></head>
Out[55]:
In [56]:
soup.title
# <title>The Dormouse's story</title>
Out[56]:
In [57]:
soup.body.b
# <b>The Dormouse's story</b>
Out[57]:
In [58]:
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[58]:
In [59]:
soup.find_all('a')
Out[59]:
In [60]:
head_tag = soup.head
head_tag
# <head><title>The Dormouse's story</title></head>
Out[60]:
In [62]:
head_tag.contents
#[<title>The Dormouse's story</title>]
Out[62]:
In [63]:
title_tag = head_tag.contents[0]
title_tag
# <title>The Dormouse's story</title>
Out[63]:
In [64]:
title_tag.contents
# [u'The Dormouse's story']
Out[64]:
In [65]:
print(len(soup.contents))
# 1
print(soup.contents[0].name)
# u'html'
A string does not have .contents, because it can't contain anything:
In [66]:
text = title_tag.contents[0]
text.contents
# AttributeError: 'NavigableString' object has no attribute 'contents'
In [67]:
for child in title_tag.children:
    print(child)
In [68]:
head_tag.contents
Out[68]:
In [69]:
for child in head_tag.descendants:
    print(child)
In [70]:
print(len(list(soup.children)))
# 1
print(len(list(soup.descendants)))
# 25
In [71]:
title_tag.string
Out[71]:
If a tag’s only child is another tag, and that tag has a .string, then the parent tag is considered to have the same .string as its child
In [73]:
head_tag.contents
# [<title>The Dormouse's story</title>]
head_tag.string
# u'The Dormouse's story'
Out[73]:
If a tag contains more than one thing, then it’s not clear what .string should refer to, so .string is defined to be None:
In [74]:
print(soup.html.string)
In [75]:
for string in soup.strings:
    print(repr(string))
These strings tend to have a lot of extra whitespace, which you can remove by using the .stripped_strings generator instead:
In [77]:
for string in soup.stripped_strings:
    print(repr(string))
In [78]:
title_tag = soup.title
title_tag
# <title>The Dormouse's story</title>
Out[78]:
In [79]:
title_tag.parent
# <head><title>The Dormouse's story</title></head>
Out[79]:
In [80]:
title_tag.string.parent
Out[80]:
In [81]:
html_tag = soup.html
type(html_tag.parent)
Out[81]:
In [82]:
print(soup.parent)
In [85]:
link = soup.a
link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[85]:
In [86]:
for parent in link.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
In [87]:
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>")
print(sibling_soup.prettify())
In [88]:
sibling_soup.b.next_sibling
# <c>text2</c>
Out[88]:
In [89]:
sibling_soup.c.previous_sibling
# <b>text1</b>
Out[89]:
In [90]:
print(sibling_soup.b.previous_sibling)
# None
In [91]:
print(sibling_soup.c.next_sibling)
# None
In [92]:
sibling_soup.b.string
# u'text1'
Out[92]:
In [93]:
print(sibling_soup.b.string.next_sibling)
# None
In [94]:
link = soup.a
link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[94]:
In [95]:
link.next_sibling
# u',\n'
Out[95]:
In [96]:
link.next_sibling.next_sibling
Out[96]:
In [97]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))
In [98]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))
In [99]:
last_a_tag = soup.find("a", id="link3")
last_a_tag
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Out[99]:
In [100]:
last_a_tag.next_sibling
# '; and they lived at the bottom of a well.'
Out[100]:
In [101]:
last_a_tag.next_element
Out[101]:
In [102]:
last_a_tag.previous_element
# u' and\n'
Out[102]:
In [103]:
last_a_tag.previous_element.next_element
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Out[103]:
In [104]:
for element in last_a_tag.next_elements:
print(repr(element))
In [105]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
In [106]:
soup.find_all('b')
Out[106]:
In [107]:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
In [110]:
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
In [111]:
soup.find_all(["a", "b"])
Out[111]:
In [112]:
for tag in soup.find_all(True):
    print(tag.name)
In [113]:
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
In [116]:
print(soup.find_all(has_class_but_no_id))
In [115]:
def not_lacie(href):
    return href and not re.compile("lacie").search(href)
soup.find_all(href=not_lacie)
Out[115]:
In [124]:
from bs4 import element
In [125]:
def surrounded_by_strings(tag):
    return (isinstance(tag.next_element, element.NavigableString)
            and isinstance(tag.previous_element, element.NavigableString))
for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)
In [126]:
soup.find_all("title")
Out[126]:
In [127]:
soup.find_all("p", "title")
Out[127]:
In [128]:
soup.find_all("a")
Out[128]:
In [129]:
soup.find_all(id="link2")
Out[129]:
In [130]:
import re
soup.find(string=re.compile("sisters"))
# u'Once upon a time there were three little sisters; and their names were\n'
Out[130]:
In [131]:
soup.find_all("title")
Out[131]:
In [132]:
soup.find_all(id='link2')
Out[132]:
In [133]:
soup.find_all(href=re.compile("elsie"))
Out[133]:
You can filter an attribute based on a string, a regular expression, a list, a function, or the value True.
In [134]:
soup.find_all(id=True)
Out[134]:
In [135]:
soup.find_all(href=re.compile("elsie"), id='link1')
Out[135]:
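A list also works as an attribute filter, matching any of its members; a small sketch against the same document:
soup.find_all(id=["link1", "link3"])
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]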
In [136]:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(data-foo="value")
# SyntaxError: keyword can't be an expression
In [138]:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(attrs={"data-foo": "value"})
Out[138]:
In [140]:
soup.find_all("a", class="sister")
In [141]:
soup.find_all("a", class_="sister")
Out[141]:
In [142]:
soup.find_all(class_=re.compile("itl"))
Out[142]:
In [143]:
def has_six_characters(css_class):
    return css_class is not None and len(css_class) == 6
soup.find_all(class_=has_six_characters)
Out[143]:
In [144]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.find_all("p", class_="strikeout")
# [<p class="body strikeout"></p>]
Out[144]:
In [145]:
css_soup.find_all("p", class_="body")
Out[145]:
In [146]:
css_soup.find_all("p", class_="body strikeout")
Out[146]:
In [147]:
css_soup.find_all("p", class_="strikeout body")
Out[147]:
In [148]:
css_soup.select("p.strikeout.body")
Out[148]:
In [149]:
soup.find_all("a", attrs={"class": "sister"})
Out[149]:
In [150]:
soup.find_all(string="Elsie")
# [u'Elsie']
Out[150]:
In [151]:
soup.find_all(string=["Tillie", "Elsie", "Lacie"])
# [u'Elsie', u'Lacie', u'Tillie']
Out[151]:
In [152]:
soup.find_all(string=re.compile("Dormouse"))
[u"The Dormouse's story", u"The Dormouse's story"]
Out[152]:
In [153]:
def is_the_only_string_within_a_tag(s):
"""Return True if this string is the only child of its parent tag."""
return (s == s.parent.string)
soup.find_all(string=is_the_only_string_within_a_tag)
# [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']
Out[153]:
In [154]:
soup.find_all("a", string="Elsie")
Out[154]:
In [155]:
soup.find_all("a", text="Elsie")
Out[155]:
In [156]:
soup.find_all("a", limit=2)
Out[156]:
In [157]:
soup.html.find_all("title")
# [<title>The Dormouse's story</title>]
Out[157]:
In [158]:
soup.html.find_all("title", recursive=False)
# []
Out[158]:
In [159]:
soup.find_all("a")
Out[159]:
In [160]:
soup("a")
Out[160]:
In [161]:
soup.find_all('title', limit=1)
# [<title>The Dormouse's story</title>]
Out[161]:
In [162]:
soup.find('title')
# <title>The Dormouse's story</title>
Out[162]:
In [163]:
print(soup.find("nosuchtag"))
The only difference is that find_all() returns a list containing the single result, and find() just returns the result.
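When nothing matches, find_all() returns an empty list where find() (shown above) returns None; a quick sketch:
soup.find_all("nosuchtag")
# []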
In [164]:
soup.head.title
# <title>The Dormouse's story</title>
Out[164]:
In [165]:
soup.find("head").find("title")
# <title>The Dormouse's story</title>
Out[165]:
In [166]:
a_string = soup.find(string="Lacie")
a_string
# u'Lacie'
Out[166]:
In [167]:
a_string.find_parents("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
Out[167]:
In [168]:
a_string.find_parent("p")
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
Out[168]:
In [170]:
a_string.find_parents("p", class_="title")
# []
Out[170]:
In [171]:
first_link = soup.a
first_link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[171]:
In [172]:
first_link.find_next_siblings("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
Out[172]:
In [174]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_next_sibling("p")
Out[174]:
In [175]:
last_link = soup.find("a", id="link3")
last_link
Out[175]:
In [176]:
last_link.find_previous_siblings("a")
Out[176]:
In [177]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")
Out[177]:
In [178]:
first_link = soup.a
first_link
Out[178]:
In [179]:
first_link.find_all_next(string=True)
Out[179]:
In [180]:
first_link.find_next("p")
Out[180]:
In [181]:
first_link = soup.a
first_link
Out[181]:
In [182]:
first_link.find_all_previous("p")
Out[182]:
In [183]:
first_link.find_previous("title")
Out[183]:
In [184]:
soup.select("title")
# [<title>The Dormouse's story</title>]
Out[184]:
In [185]:
soup.select("p:nth-of-type(3)")
Out[185]:
In [186]:
soup.select("body a")
Out[186]:
In [187]:
soup.select("html head title")
Out[187]:
In [188]:
soup.select("head > title")
Out[188]:
In [189]:
soup.select("p > a")
Out[189]:
In [190]:
soup.select("p > a:nth-of-type(2)")
Out[190]:
In [191]:
soup.select("p > #link1")
Out[191]:
In [192]:
soup.select("body > a")
Out[192]:
In [193]:
soup.select("#link1 ~ .sister")
Out[193]:
In [194]:
soup.select("#link1 + .sister")
Out[194]:
In [195]:
soup.select(".sister")
Out[195]:
In [196]:
soup.select("[class~=sister]")
Out[196]:
In [197]:
soup.select("#link1")
Out[197]:
In [198]:
soup.select("a#link2")
Out[198]:
In [199]:
soup.select('a[href]')
Out[199]:
In [200]:
soup.select('a[href="http://example.com/elsie"]')
Out[200]:
In [201]:
soup.select('a[href^="http://example.com/"]')
Out[201]:
In [202]:
soup.select('a[href$="tillie"]')
Out[202]:
In [203]:
soup.select('a[href*=".com/el"]')
Out[203]:
In [204]:
multilingual_markup = """
<p lang="en">Hello</p>
<p lang="en-us">Howdy, y'all</p>
<p lang="en-gb">Pip-pip, old fruit</p>
<p lang="fr">Bonjour mes amis</p>
"""
multilingual_soup = BeautifulSoup(multilingual_markup)
multilingual_soup.select('p[lang|=en]')
# [<p lang="en">Hello</p>,
# <p lang="en-us">Howdy, y'all</p>,
# <p lang="en-gb">Pip-pip, old fruit</p>]
Out[204]:
In [205]:
soup.select_one(".sister")
Out[205]:
In [207]:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
tag.name = "blockquote"
tag['class'] = 'verybold'
tag['id'] = 1
tag
# <blockquote class="verybold" id="1">Extremely bold</blockquote>
Out[207]:
In [208]:
del tag['class']
del tag['id']
tag
Out[208]:
Be careful: if you assign .string to a tag that contained other tags, they and all their contents will be destroyed:
In [209]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.string = "New link text."
tag
# <a href="http://example.com/">New link text.</a>
Out[209]:
In [210]:
soup = BeautifulSoup("<a>Foo</a>")
soup.a.append("Bar")
soup
# <html><head></head><body><a>FooBar</a></body></html>
Out[210]:
In [211]:
soup.a.contents
Out[211]:
In [214]:
soup = BeautifulSoup("<b></b>")
tag = soup.b
tag.append("Hello")
new_string = element.NavigableString(" there")
tag.append(new_string)
tag
# <b>Hello there</b>
Out[214]:
In [215]:
tag.contents
# [u'Hello', u' there']
Out[215]:
In [216]:
from bs4 import Comment
new_comment = Comment("Nice to see you.")
tag.append(new_comment)
tag
# <b>Hello there<!--Nice to see you.--></b>
Out[216]:
In [217]:
tag.contents
# [u'Hello', u' there', u'Nice to see you.']
Out[217]:
In [218]:
soup = BeautifulSoup("<b></b>")
original_tag = soup.b
new_tag = soup.new_tag("a", href="http://www.example.com")
original_tag.append(new_tag)
original_tag
# <b><a href="http://www.example.com"></a></b>
Out[218]:
In [219]:
new_tag.string = "Link text."
original_tag
# <b><a href="http://www.example.com">Link text.</a></b>
Out[219]:
In [220]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.insert(1, "but did not endorse ")
tag
# <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
Out[220]:
In [221]:
tag.contents
# [u'I linked to ', u'but did not endorse', <i>example.com</i>]
Out[221]:
In [222]:
soup = BeautifulSoup("<b>stop</b>")
tag = soup.new_tag("i")
tag.string = "Don't"
soup.b.string.insert_before(tag)
soup.b
# <b><i>Don't</i>stop</b>
Out[222]:
In [223]:
soup.b.i.insert_after(soup.new_string(" ever "))
soup.b
# <b><i>Don't</i> ever stop</b>
Out[223]:
In [224]:
soup.b.contents
# [<i>Don't</i>, u' ever ', u'stop']
Out[224]:
In [225]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.clear()
tag
# <a href="http://example.com/"></a>
Out[225]:
In [226]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
i_tag = soup.i.extract()
a_tag
# <a href="http://example.com/">I linked to</a>
Out[226]:
In [227]:
i_tag
# <i>example.com</i>
Out[227]:
In [228]:
print(i_tag.parent)
# None
In [229]:
my_string = i_tag.string.extract()
my_string
# u'example.com'
Out[229]:
In [230]:
print(my_string.parent)
# None
i_tag
# <i></i>
Out[230]:
In [231]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
soup.i.decompose()
a_tag
# <a href="http://example.com/">I linked to</a>
Out[231]:
In [232]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
new_tag = soup.new_tag("b")
new_tag.string = "example.net"
a_tag.i.replace_with(new_tag)
a_tag
# <a href="http://example.com/">I linked to <b>example.net</b></a>
Out[232]:
In [235]:
soup = BeautifulSoup("<p>I wish I was bold.</p>")
soup.p.string.wrap(soup.new_tag("b"))
# <b>I wish I was bold.</b>
Out[235]:
In [238]:
soup.p.wrap(soup.new_tag("div"))
Out[238]:
In [239]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
a_tag.i.unwrap()
a_tag
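# <a href="http://example.com/">I linked to example.com</a>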
Out[239]:
In [240]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
soup.prettify()
# '<html>\n <head>\n </head>\n <body>\n <a href="http://example.com/">\n...'
print(soup.prettify())
In [241]:
print(soup.a.prettify())
In [242]:
str(soup)
# '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'
Out[242]:
In [243]:
str(soup.a)
Out[243]:
In [244]:
soup = BeautifulSoup("“Dammit!” he said.")
str(soup)
Out[244]:
In [245]:
soup.encode("utf-8")
Out[245]:
In [246]:
soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>")
soup.p
Out[246]:
In [247]:
soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
soup.a
Out[247]:
In [248]:
french = "<p>Il a dit <<Sacré bleu!>></p>"
soup = BeautifulSoup(french)
print(soup.prettify(formatter="minimal"))
In [249]:
print(soup.prettify(formatter="html"))
In [250]:
print(soup.prettify(formatter=None))
In [251]:
link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
print(link_soup.a.encode(formatter=None))
In [252]:
def uppercase(str):
    return str.upper()
print(soup.prettify(formatter=uppercase))
In [253]:
print(link_soup.a.prettify(formatter=uppercase))
In [254]:
from bs4.dammit import EntitySubstitution
def uppercase_and_substitute_html_entities(str):
    return EntitySubstitution.substitute_html(str.upper())
print(soup.prettify(formatter=uppercase_and_substitute_html_entities))
In [255]:
from bs4.element import CData
soup = BeautifulSoup("<a></a>")
soup.a.string = CData("one < three")
print(soup.a.prettify(formatter="xml"))
In [256]:
markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
soup = BeautifulSoup(markup)
soup.get_text()
Out[256]:
In [258]:
soup.i.get_text()
Out[258]:
In [259]:
soup.get_text("|")
Out[259]:
In [260]:
soup.get_text("|", strip=True)
Out[260]:
In [261]:
[text for text in soup.stripped_strings]
Out[261]:
In [262]:
BeautifulSoup("<a><b /></a>")
Out[262]:
In [263]:
BeautifulSoup("<a><b /></a>", "xml")
Out[263]:
In [264]:
BeautifulSoup("<a></p>", "lxml")
Out[264]:
In [266]:
#BeautifulSoup("<a></p>", "html5lib")
In [267]:
BeautifulSoup("<a></p>", "html.parser")
Out[267]:
In [268]:
markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
soup = BeautifulSoup(markup)
soup.h1
# <h1>Sacré bleu!</h1>
Out[268]:
In [269]:
soup.h1.string
Out[269]:
In [270]:
soup.original_encoding
Out[270]:
In [271]:
markup = b"<h1>\xed\xe5\xec\xf9</h1>"
soup = BeautifulSoup(markup)
soup.h1
Out[271]:
In [272]:
soup.original_encoding
Out[272]:
In [273]:
soup = BeautifulSoup(markup, from_encoding="iso-8859-8")
soup.h1
Out[273]:
In [274]:
soup.original_encoding
Out[274]:
In [275]:
soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
soup.h1
Out[275]:
In [276]:
soup.original_encoding
Out[276]:
In [277]:
markup = b'''
<html>
<head>
<meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
</head>
<body>
<p>Sacr\xe9 bleu!</p>
</body>
</html>
'''
soup = BeautifulSoup(markup)
print(soup.prettify())
In [278]:
print(soup.prettify("latin-1"))
In [279]:
soup.p.encode("latin-1")
Out[279]:
In [280]:
soup.p.encode("utf-8")
Out[280]:
In [281]:
markup = u"<b>\N{SNOWMAN}</b>"
snowman_soup = BeautifulSoup(markup)
tag = snowman_soup.b
In [282]:
print(tag.encode("utf-8"))
In [283]:
print tag.encode("latin-1")
In [284]:
print tag.encode("ascii")
In [285]:
from bs4 import UnicodeDammit
dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
print(dammit.unicode_markup)
# Sacré bleu!
In [286]:
dammit.original_encoding
Out[286]:
In [287]:
dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
print(dammit.unicode_markup)
# Sacré bleu!
In [288]:
dammit.original_encoding
# 'latin-1'
Out[288]:
In [289]:
markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
# u'<p>I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes</p>'
Out[289]:
In [290]:
UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
# u'<p>I just &#x201C;love&#x201D; Microsoft Word&#x2019;s smart quotes</p>'
Out[290]:
In [291]:
UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup
Out[291]:
In [292]:
UnicodeDammit(markup, ["windows-1252"]).unicode_markup
Out[292]:
In [293]:
snowmen = (u"\N{SNOWMAN}" * 3)
quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
doc = snowmen.encode("utf8") + quote.encode("windows_1252")
In [294]:
print(doc)
In [295]:
print(doc.decode("windows-1252"))
In [296]:
new_doc = UnicodeDammit.detwingle(doc)
print(new_doc.decode("utf8"))
In [297]:
markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>"
soup = BeautifulSoup(markup, 'html.parser')
first_b, second_b = soup.find_all('b')
print(first_b == second_b)
In [298]:
print(first_b.previous_element == second_b.previous_element)
In [299]:
print(first_b is second_b)
In [300]:
import copy
p_copy = copy.copy(soup.p)
print(p_copy)
In [302]:
print(soup.p == p_copy)
print(soup.p is p_copy)
In [303]:
print(p_copy.parent)
In [304]:
from bs4 import SoupStrainer
only_a_tags = SoupStrainer("a")
only_tags_with_id_link2 = SoupStrainer(id="link2")
def is_short_string(string):
    return len(string) < 10
only_short_strings = SoupStrainer(string=is_short_string)
In [305]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
In [306]:
print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
In [308]:
#print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
In [310]:
#soup = BeautifulSoup(html_doc)
#soup.find_all(only_short_strings)