In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
In [2]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
In [3]:
soup.title
# <title>The Dormouse's story</title>
Out[3]:
In [4]:
soup.title.name
# u'title'
Out[4]:
In [5]:
soup.title.string
# u'The Dormouse's story'
Out[5]:
In [6]:
soup.title.parent.name
# u'head'
Out[6]:
In [7]:
soup.p
# <p class="title"><b>The Dormouse's story</b></p>
Out[7]:
In [8]:
soup.p['class']
# ['title']
Out[8]:
In [9]:
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[9]:
In [10]:
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
Out[10]:
In [11]:
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Out[11]:
In [12]:
for link in soup.find_all('a'):
    print(link.get('href'))
In [13]:
print(soup.get_text())
In [20]:
#soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)
# <class 'bs4.element.Tag'>
Out[20]:
In [21]:
tag.name
Out[21]:
In [18]:
tag.name = "blockquote"
tag
Out[18]:
In [25]:
tag['id']
In [26]:
tag.id
In [29]:
tag['class']
Out[29]:
In [27]:
tag.attrs
Out[27]:
In [30]:
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag
# <b another-attribute="1" id="verybold"></b>
Out[30]:
In [31]:
del tag['id']
del tag['another-attribute']
tag
# <b></b>
Out[31]:
In [33]:
tag['id']
# KeyError: 'id'
In [34]:
print(tag.get('id'))
# None
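As with a Python dictionary, .get() also accepts an optional default to return when the attribute is missing; a quick sketch (the 'no-id' value here is just a placeholder):
tag.get('id', 'no-id')
# 'no-id'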
In [35]:
css_soup = BeautifulSoup('<p class="body"></p>')
css_soup.p['class']
# ["body"]
Out[35]:
In [36]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.p['class']
# ["body", "strikeout"]
Out[36]:
In [37]:
id_soup = BeautifulSoup('<p id="my id"></p>')
id_soup.p['id']
Out[37]:
In [38]:
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
rel_soup.a['rel']
# ['index']
Out[38]:
In [39]:
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
# <p>Back to the <a rel="index contents">homepage</a></p>
In [40]:
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']
# u'body strikeout'
Out[40]:
In [42]:
tag.string
# u'Extremely bold'
Out[42]:
In [43]:
type(tag.string)
# <class 'bs4.element.NavigableString'>
Out[43]:
In [44]:
unicode_string = str(tag.string)
unicode_string
# 'Extremely bold'
Out[44]:
In [45]:
type(unicode_string)
# <class 'str'>
Out[45]:
In [46]:
tag.string.replace_with("No longer bold")
tag
# <blockquote>No longer bold</blockquote>
Out[46]:
In [48]:
soup.name
Out[48]:
In [50]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup)
comment = soup.b.string
type(comment)
# <class 'bs4.element.Comment'>
Out[50]:
In [51]:
comment
Out[51]:
In [52]:
print(soup.b.prettify())
In [53]:
from bs4 import CData
cdata = CData("A CDATA block")
comment.replace_with(cdata)
print(soup.b.prettify())
# <b>
# <![CDATA[A CDATA block]]>
# </b>
In [54]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
In [55]:
soup.head
# <head><title>The Dormouse's story</title></head>
Out[55]:
In [56]:
soup.title
# <title>The Dormouse's story</title>
Out[56]:
In [57]:
soup.body.b
# <b>The Dormouse's story</b>
Out[57]:
In [58]:
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[58]:
In [59]:
soup.find_all('a')
Out[59]:
In [60]:
head_tag = soup.head
head_tag
# <head><title>The Dormouse's story</title></head>
Out[60]:
In [62]:
head_tag.contents
#[<title>The Dormouse's story</title>]
Out[62]:
In [63]:
title_tag = head_tag.contents[0]
title_tag
# <title>The Dormouse's story</title>
Out[63]:
In [64]:
title_tag.contents
# [u'The Dormouse's story']
Out[64]:
In [65]:
print(len(soup.contents))
# 1
print(soup.contents[0].name)
# u'html'
A string does not have .contents, because it can't contain anything:
In [66]:
text = title_tag.contents[0]
text.contents
# AttributeError: 'NavigableString' object has no attribute 'contents'
In [67]:
for child in title_tag.children:
    print(child)
In [68]:
head_tag.contents
Out[68]:
In [69]:
for child in head_tag.descendants:
    print(child)
In [70]:
print(len(list(soup.children)))
# 1
print(len(list(soup.descendants)))
# 25
In [71]:
title_tag.string
Out[71]:
If a tag’s only child is another tag, and that tag has a .string, then the parent tag is considered to have the same .string as its child
In [73]:
head_tag.contents
# [<title>The Dormouse's story</title>]
head_tag.string
# u'The Dormouse's story'
Out[73]:
If a tag contains more than one thing, then it’s not clear what .string should refer to, so .string is defined to be None:
In [74]:
print(soup.html.string)
In [75]:
for string in soup.strings:
    print(repr(string))
These strings tend to have a lot of extra whitespace, which you can remove by using the .stripped_strings generator instead:
In [77]:
for string in soup.stripped_strings:
    print(repr(string))
In [78]:
title_tag = soup.title
title_tag
# <title>The Dormouse's story</title>
Out[78]:
In [79]:
title_tag.parent
# <head><title>The Dormouse's story</title></head>
Out[79]:
In [80]:
title_tag.string.parent
Out[80]:
In [81]:
html_tag = soup.html
type(html_tag.parent)
Out[81]:
In [82]:
print(soup.parent)
In [85]:
link = soup.a
link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[85]:
In [86]:
for parent in link.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
In [87]:
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>")
print(sibling_soup.prettify())
In [88]:
sibling_soup.b.next_sibling
# <c>text2</c>
Out[88]:
In [89]:
sibling_soup.c.previous_sibling
# <b>text1</b>
Out[89]:
In [90]:
print(sibling_soup.b.previous_sibling)
# None
In [91]:
print(sibling_soup.c.next_sibling)
# None
In [92]:
sibling_soup.b.string
# u'text1'
Out[92]:
In [93]:
print(sibling_soup.b.string.next_sibling)
# None
In [94]:
link = soup.a
link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[94]:
In [95]:
link.next_sibling
# u',\n'
Out[95]:
In [96]:
link.next_sibling.next_sibling
Out[96]:
In [97]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))
In [98]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))
In [99]:
last_a_tag = soup.find("a", id="link3")
last_a_tag
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Out[99]:
In [100]:
last_a_tag.next_sibling
# '; and they lived at the bottom of a well.'
Out[100]:
In [101]:
last_a_tag.next_element
Out[101]:
In [102]:
last_a_tag.previous_element
# u' and\n'
Out[102]:
In [103]:
last_a_tag.previous_element.next_element
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Out[103]:
In [104]:
for element in last_a_tag.next_elements:
print(repr(element))
In [105]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
In [106]:
soup.find_all('b')
Out[106]:
In [107]:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
In [110]:
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
In [111]:
soup.find_all(["a", "b"])
Out[111]:
In [112]:
for tag in soup.find_all(True):
    print(tag.name)
In [113]:
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
In [116]:
print(soup.find_all(has_class_but_no_id))
In [115]:
def not_lacie(href):
    return href and not re.compile("lacie").search(href)
soup.find_all(href=not_lacie)
Out[115]:
In [124]:
from bs4 import element
In [125]:
def surrounded_by_strings(tag):
    return (isinstance(tag.next_element, element.NavigableString)
            and isinstance(tag.previous_element, element.NavigableString))
for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)
In [126]:
soup.find_all("title")
Out[126]:
In [127]:
soup.find_all("p", "title")
Out[127]:
In [128]:
soup.find_all("a")
Out[128]:
In [129]:
soup.find_all(id="link2")
Out[129]:
In [130]:
import re
soup.find(string=re.compile("sisters"))
# u'Once upon a time there were three little sisters; and their names were\n'
Out[130]:
In [131]:
soup.find_all("title")
Out[131]:
In [132]:
soup.find_all(id='link2')
Out[132]:
In [133]:
soup.find_all(href=re.compile("elsie"))
Out[133]:
You can filter an attribute based on a string, a regular expression, a list, a function, or the value True.
In [134]:
soup.find_all(id=True)
Out[134]:
In [135]:
soup.find_all(href=re.compile("elsie"), id='link1')
Out[135]:
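A list also works as an attribute filter, matching any of its members; a small sketch against the same document:
soup.find_all(id=["link1", "link3"])
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]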
In [136]:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(data-foo="value")
# SyntaxError: keyword can't be an expression
In [138]:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(attrs={"data-foo": "value"})
Out[138]:
In [140]:
soup.find_all("a", class="sister")
In [141]:
soup.find_all("a", class_="sister")
Out[141]:
In [142]:
soup.find_all(class_=re.compile("itl"))
Out[142]:
In [143]:
def has_six_characters(css_class):
    return css_class is not None and len(css_class) == 6
soup.find_all(class_=has_six_characters)
Out[143]:
In [144]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.find_all("p", class_="strikeout")
# [<p class="body strikeout"></p>]
Out[144]:
In [145]:
css_soup.find_all("p", class_="body")
Out[145]:
In [146]:
css_soup.find_all("p", class_="body strikeout")
Out[146]:
In [147]:
css_soup.find_all("p", class_="strikeout body")
Out[147]:
In [148]:
css_soup.select("p.strikeout.body")
Out[148]:
In [149]:
soup.find_all("a", attrs={"class": "sister"})
Out[149]:
In [150]:
soup.find_all(string="Elsie")
# [u'Elsie']
Out[150]:
In [151]:
soup.find_all(string=["Tillie", "Elsie", "Lacie"])
# [u'Elsie', u'Lacie', u'Tillie']
Out[151]:
In [152]:
soup.find_all(string=re.compile("Dormouse"))
[u"The Dormouse's story", u"The Dormouse's story"]
Out[152]:
In [153]:
def is_the_only_string_within_a_tag(s):
"""Return True if this string is the only child of its parent tag."""
return (s == s.parent.string)
soup.find_all(string=is_the_only_string_within_a_tag)
# [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']
Out[153]:
In [154]:
soup.find_all("a", string="Elsie")
Out[154]:
In [155]:
soup.find_all("a", text="Elsie")
Out[155]:
In [156]:
soup.find_all("a", limit=2)
Out[156]:
In [157]:
soup.html.find_all("title")
# [<title>The Dormouse's story</title>]
Out[157]:
In [158]:
soup.html.find_all("title", recursive=False)
# []
Out[158]:
In [159]:
soup.find_all("a")
Out[159]:
In [160]:
soup("a")
Out[160]:
In [161]:
soup.find_all('title', limit=1)
# [<title>The Dormouse's story</title>]
Out[161]:
In [162]:
soup.find('title')
# <title>The Dormouse's story</title>
Out[162]:
In [163]:
print(soup.find("nosuchtag"))
The only difference is that find_all() returns a list containing the single result, and find() just returns the result.
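When nothing matches, find_all() returns an empty list where find() (shown above) returns None; a quick sketch:
soup.find_all("nosuchtag")
# []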
In [164]:
soup.head.title
# <title>The Dormouse's story</title>
Out[164]:
In [165]:
soup.find("head").find("title")
# <title>The Dormouse's story</title>
Out[165]:
In [166]:
a_string = soup.find(string="Lacie")
a_string
# u'Lacie'
Out[166]:
In [167]:
a_string.find_parents("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
Out[167]:
In [168]:
a_string.find_parent("p")
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
Out[168]:
In [170]:
a_string.find_parents("p", class_="title")
# []
Out[170]:
In [171]:
first_link = soup.a
first_link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Out[171]:
In [172]:
first_link.find_next_siblings("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
Out[172]:
In [174]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_next_sibling("p")
Out[174]:
In [175]:
last_link = soup.find("a", id="link3")
last_link
Out[175]:
In [176]:
last_link.find_previous_siblings("a")
Out[176]:
In [177]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")
Out[177]:
In [178]:
first_link = soup.a
first_link
Out[178]:
In [179]:
first_link.find_all_next(string=True)
Out[179]:
In [180]:
first_link.find_next("p")
Out[180]:
In [181]:
first_link = soup.a
first_link
Out[181]:
In [182]:
first_link.find_all_previous("p")
Out[182]:
In [183]:
first_link.find_previous("title")
Out[183]:
In [184]:
soup.select("title")
# [<title>The Dormouse's story</title>]
Out[184]:
In [185]:
soup.select("p:nth-of-type(3)")
Out[185]:
In [186]:
soup.select("body a")
Out[186]:
In [187]:
soup.select("html head title")
Out[187]:
In [188]:
soup.select("head > title")
Out[188]:
In [189]:
soup.select("p > a")
Out[189]:
In [190]:
soup.select("p > a:nth-of-type(2)")
Out[190]:
In [191]:
soup.select("p > #link1")
Out[191]:
In [192]:
soup.select("body > a")
Out[192]:
In [193]:
soup.select("#link1 ~ .sister")
Out[193]:
In [194]:
soup.select("#link1 + .sister")
Out[194]:
In [195]:
soup.select(".sister")
Out[195]:
In [196]:
soup.select("[class~=sister]")
Out[196]:
In [197]:
soup.select("#link1")
Out[197]:
In [198]:
soup.select("a#link2")
Out[198]:
In [199]:
soup.select('a[href]')
Out[199]:
In [200]:
soup.select('a[href="http://example.com/elsie"]')
Out[200]:
In [201]:
soup.select('a[href^="http://example.com/"]')
Out[201]:
In [202]:
soup.select('a[href$="tillie"]')
Out[202]:
In [203]:
soup.select('a[href*=".com/el"]')
Out[203]:
In [204]:
multilingual_markup = """
<p lang="en">Hello</p>
<p lang="en-us">Howdy, y'all</p>
<p lang="en-gb">Pip-pip, old fruit</p>
<p lang="fr">Bonjour mes amis</p>
"""
multilingual_soup = BeautifulSoup(multilingual_markup)
multilingual_soup.select('p[lang|=en]')
# [<p lang="en">Hello</p>,
# <p lang="en-us">Howdy, y'all</p>,
# <p lang="en-gb">Pip-pip, old fruit</p>]
Out[204]:
In [205]:
soup.select_one(".sister")
Out[205]:
In [207]:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
tag.name = "blockquote"
tag['class'] = 'verybold'
tag['id'] = 1
tag
# <blockquote class="verybold" id="1">Extremely bold</blockquote>
Out[207]:
In [208]:
del tag['class']
del tag['id']
tag
Out[208]:
Be careful: if you assign .string to a tag that contained other tags, they and all their contents will be destroyed:
In [209]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.string = "New link text."
tag
# <a href="http://example.com/">New link text.</a>
Out[209]:
In [210]:
soup = BeautifulSoup("<a>Foo</a>")
soup.a.append("Bar")
soup
# <html><head></head><body><a>FooBar</a></body></html>
Out[210]:
In [211]:
soup.a.contents
Out[211]:
In [214]:
soup = BeautifulSoup("<b></b>")
tag = soup.b
tag.append("Hello")
new_string = element.NavigableString(" there")
tag.append(new_string)
tag
# <b>Hello there</b>
Out[214]:
In [215]:
tag.contents
# [u'Hello', u' there']
Out[215]:
In [216]:
from bs4 import Comment
new_comment = Comment("Nice to see you.")
tag.append(new_comment)
tag
# <b>Hello there<!--Nice to see you.--></b>
Out[216]:
In [217]:
tag.contents
# [u'Hello', u' there', u'Nice to see you.']
Out[217]:
In [218]:
soup = BeautifulSoup("<b></b>")
original_tag = soup.b
new_tag = soup.new_tag("a", href="http://www.example.com")
original_tag.append(new_tag)
original_tag
# <b><a href="http://www.example.com"></a></b>
Out[218]:
In [219]:
new_tag.string = "Link text."
original_tag
# <b><a href="http://www.example.com">Link text.</a></b>
Out[219]:
In [220]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.insert(1, "but did not endorse ")
tag
# <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
Out[220]:
In [221]:
tag.contents
# [u'I linked to ', u'but did not endorse', <i>example.com</i>]
Out[221]:
In [222]:
soup = BeautifulSoup("<b>stop</b>")
tag = soup.new_tag("i")
tag.string = "Don't"
soup.b.string.insert_before(tag)
soup.b
# <b><i>Don't</i>stop</b>
Out[222]:
In [223]:
soup.b.i.insert_after(soup.new_string(" ever "))
soup.b
# <b><i>Don't</i> ever stop</b>
Out[223]:
In [224]:
soup.b.contents
# [<i>Don't</i>, u' ever ', u'stop']
Out[224]:
In [225]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.clear()
tag
# <a href="http://example.com/"></a>
Out[225]:
In [226]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
i_tag = soup.i.extract()
a_tag
# <a href="http://example.com/">I linked to</a>
Out[226]:
In [227]:
i_tag
# <i>example.com</i>
Out[227]:
In [228]:
print(i_tag.parent)
# None
In [229]:
my_string = i_tag.string.extract()
my_string
# u'example.com'
Out[229]:
In [230]:
print(my_string.parent)
# None
i_tag
# <i></i>
Out[230]:
In [231]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
soup.i.decompose()
a_tag
# <a href="http://example.com/">I linked to</a>
Out[231]:
In [232]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
new_tag = soup.new_tag("b")
new_tag.string = "example.net"
a_tag.i.replace_with(new_tag)
a_tag
# <a href="http://example.com/">I linked to <b>example.net</b></a>
Out[232]:
In [235]:
soup = BeautifulSoup("<p>I wish I was bold.</p>")
soup.p.string.wrap(soup.new_tag("b"))
# <b>I wish I was bold.</b>
Out[235]:
In [238]:
soup.p.wrap(soup.new_tag("div"))
Out[238]:
In [239]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
a_tag.i.unwrap()
a_tag
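# <a href="http://example.com/">I linked to example.com</a>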
Out[239]:
In [240]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
soup.prettify()
# '<html>\n <head>\n </head>\n <body>\n <a href="http://example.com/">\n...'
print(soup.prettify())
In [241]:
print(soup.a.prettify())
In [242]:
str(soup)
# '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'
Out[242]:
In [243]:
str(soup.a)
Out[243]:
In [244]:
soup = BeautifulSoup("“Dammit!” he said.")
str(soup)
Out[244]:
In [245]:
soup.encode("utf-8")
Out[245]:
In [246]:
soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>")
soup.p
Out[246]:
In [247]:
soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
soup.a
Out[247]:
In [248]:
french = "<p>Il a dit <<Sacré bleu!>></p>"
soup = BeautifulSoup(french)
print(soup.prettify(formatter="minimal"))
In [249]:
print(soup.prettify(formatter="html"))
In [250]:
print(soup.prettify(formatter=None))
In [251]:
link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
print(link_soup.a.encode(formatter=None))
In [252]:
def uppercase(str):
    return str.upper()
print(soup.prettify(formatter=uppercase))
In [253]:
print(link_soup.a.prettify(formatter=uppercase))
In [254]:
from bs4.dammit import EntitySubstitution
def uppercase_and_substitute_html_entities(str):
    return EntitySubstitution.substitute_html(str.upper())
print(soup.prettify(formatter=uppercase_and_substitute_html_entities))
In [255]:
from bs4.element import CData
soup = BeautifulSoup("<a></a>")
soup.a.string = CData("one < three")
print(soup.a.prettify(formatter="xml"))
In [256]:
markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
soup = BeautifulSoup(markup)
soup.get_text()
Out[256]:
In [258]:
soup.i.get_text()
Out[258]:
In [259]:
soup.get_text("|")
Out[259]:
In [260]:
soup.get_text("|", strip=True)
Out[260]:
In [261]:
[text for text in soup.stripped_strings]
Out[261]:
In [262]:
BeautifulSoup("<a><b /></a>")
Out[262]:
In [263]:
BeautifulSoup("<a><b /></a>", "xml")
Out[263]:
In [264]:
BeautifulSoup("<a></p>", "lxml")
Out[264]:
In [266]:
#BeautifulSoup("<a></p>", "html5lib")
In [267]:
BeautifulSoup("<a></p>", "html.parser")
Out[267]:
In [268]:
markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
soup = BeautifulSoup(markup)
soup.h1
# <h1>Sacré bleu!</h1>
Out[268]:
In [269]:
soup.h1.string
Out[269]:
In [270]:
soup.original_encoding
Out[270]:
In [271]:
markup = b"<h1>\xed\xe5\xec\xf9</h1>"
soup = BeautifulSoup(markup)
soup.h1
Out[271]:
In [272]:
soup.original_encoding
Out[272]:
In [273]:
soup = BeautifulSoup(markup, from_encoding="iso-8859-8")
soup.h1
Out[273]:
In [274]:
soup.original_encoding
Out[274]:
In [275]:
soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
soup.h1
Out[275]:
In [276]:
soup.original_encoding
Out[276]:
In [277]:
markup = b'''
<html>
<head>
<meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
</head>
<body>
<p>Sacr\xe9 bleu!</p>
</body>
</html>
'''
soup = BeautifulSoup(markup)
print(soup.prettify())
In [278]:
print(soup.prettify("latin-1"))
In [279]:
soup.p.encode("latin-1")
Out[279]:
In [280]:
soup.p.encode("utf-8")
Out[280]:
In [281]:
markup = u"<b>\N{SNOWMAN}</b>"
snowman_soup = BeautifulSoup(markup)
tag = snowman_soup.b
In [282]:
print(tag.encode("utf-8"))
In [283]:
print tag.encode("latin-1")
In [284]:
print tag.encode("ascii")
In [285]:
from bs4 import UnicodeDammit
dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
print(dammit.unicode_markup)
# Sacré bleu!
In [286]:
dammit.original_encoding
Out[286]:
In [287]:
dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
print(dammit.unicode_markup)
# Sacré bleu!
In [288]:
dammit.original_encoding
# 'latin-1'
Out[288]:
In [289]:
markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
# u'<p>I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes</p>'
Out[289]:
In [290]:
UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
# u'<p>I just &#x201C;love&#x201D; Microsoft Word&#x2019;s smart quotes</p>'
Out[290]:
In [291]:
UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup
Out[291]:
In [292]:
UnicodeDammit(markup, ["windows-1252"]).unicode_markup
Out[292]:
In [293]:
snowmen = (u"\N{SNOWMAN}" * 3)
quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
doc = snowmen.encode("utf8") + quote.encode("windows_1252")
In [294]:
print(doc)
In [295]:
print(doc.decode("windows-1252"))
In [296]:
new_doc = UnicodeDammit.detwingle(doc)
print(new_doc.decode("utf8"))
In [297]:
markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>"
soup = BeautifulSoup(markup, 'html.parser')
first_b, second_b = soup.find_all('b')
print(first_b == second_b)
In [298]:
print(first_b.previous_element == second_b.previous_element)
In [299]:
print(first_b is second_b)
In [300]:
import copy
p_copy = copy.copy(soup.p)
print(p_copy)
In [302]:
print(soup.p == p_copy)
print(soup.p is p_copy)
In [303]:
print(p_copy.parent)
In [304]:
from bs4 import SoupStrainer
only_a_tags = SoupStrainer("a")
only_tags_with_id_link2 = SoupStrainer(id="link2")
def is_short_string(string):
    return len(string) < 10
only_short_strings = SoupStrainer(string=is_short_string)
In [305]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
In [306]:
print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
In [308]:
#print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
In [310]:
#soup = BeautifulSoup(html_doc)
#soup.find_all(only_short_strings)