In [1]:
import feedparser
In [2]:
# Fetch the feed over HTTP and parse it; the cell displays the feed title.
feed_url = 'http://www.oschina.net/news/rss'
d = feedparser.parse(feed_url)
d['feed']['title']
Out[2]:
In [3]:
# Parse a feed from a local file path; the cell displays the feed title.
feed_path = 'oschina_news_rss.xml'
d = feedparser.parse(feed_path)
d['feed']['title']
In [4]:
# Parse a feed supplied directly as an in-memory XML string.
rawdata = """<rss version="2.0">
<channel>
<title>开源中国社区最新新闻</title>
</channel>
</rss>"""
d = feedparser.parse(rawdata)
# Look up the feed-level mapping first, then display its title.
channel = d['feed']
channel['title']
Out[4]:
In [5]:
# RSS 2.0 sample document: one <channel> (title, description, link, pubDate)
# containing a single <item> (title, link, description, pubDate, guid).
# The item description includes a javascript: URL inside a style attribute.
# NOTE(review): both <description> elements embed raw HTML tags rather than
# entity-escaped markup -- confirm this is intended.
rss20="""<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>Sample Feed</title>
<description>For documentation <em>only</em></description>
<link>http://example.org/</link>
<pubDate>Sat, 07 Sep 2002 00:00:01 GMT</pubDate>
<!-- other elements omitted from this example -->
<item>
<title>First entry title</title>
<link>http://example.org/entry/3</link>
<description>Watch out for <span style="background-image:
url(javascript:window.location='http://example.org/')">nasty
tricks</span></description>
<pubDate>Thu, 05 Sep 2002 00:00:01 GMT</pubDate>
<guid>http://example.org/entry/3</guid>
<!-- other elements omitted from this example -->
</item>
</channel>
</rss>"""
In [6]:
# Common elements of an RSS feed are title, link, description,
# publication date (pubDate) and entry ID (guid).
d = feedparser.parse(rss20)
# d.feed corresponds to the <channel> element; print its fields one per line.
for field in ('title', 'link', 'description', 'published', 'published_parsed'):
    print(getattr(d.feed, field))
In [7]:
# d.entries is a list built from the <item> elements, in the same order as
# they appear in the RSS document. Print the first item's fields one per line.
first_entry = d.entries[0]
for field in ('title', 'link', 'description', 'published', 'published_parsed', 'id'):
    print(getattr(first_entry, field))
In [8]:
# Atom 1.0 sample document: feed-level metadata (title, subtitle, links,
# rights, id, generator, updated) plus a single <entry> with several links,
# a plain-text summary and an XHTML content element.
#
# The text constructs declared type="html" (subtitle and rights) carry their
# markup entity-escaped. Raw tags there are not allowed for html text
# constructs, and the rights text previously ended in a bare "<" that made
# the whole document ill-formed XML. The trailing character is kept, escaped
# as "&lt;", so no content is dropped.
atom10 = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"
xml:base="http://example.org/"
xml:lang="en">
<title type="text">Sample Feed</title>
<subtitle type="html">
For documentation &lt;em&gt;only&lt;/em&gt;
</subtitle>
<link rel="alternate" href="/"/>
<link rel="self"
type="application/atom+xml"
href="http://www.example.org/atom10.xml"/>
<rights type="html">
&lt;p&gt;Copyright 2005, Mark Pilgrim&lt;/p&gt;&lt;
</rights>
<id>tag:feedparser.org,2005-11-09:/docs/examples/atom10.xml</id>
<generator
uri="http://example.org/generator/"
version="4.0">
Sample Toolkit
</generator>
<updated>2005-11-09T11:56:34Z</updated>
<entry>
<title>First entry title</title>
<link rel="alternate"
href="/entry/3"/>
<link rel="related"
type="text/html"
href="http://search.example.com/"/>
<link rel="via"
type="text/html"
href="http://toby.example.com/examples/atom10"/>
<link rel="enclosure"
type="video/mpeg4"
href="http://www.example.com/movie.mp4"
length="42301"/>
<id>tag:feedparser.org,2005-11-09:/docs/examples/atom10.xml:3</id>
<published>2005-11-09T00:23:47Z</published>
<updated>2005-11-09T11:56:34Z</updated>
<summary type="text/plain" mode="escaped">Watch out for nasty tricks</summary>
<content type="application/xhtml+xml" mode="xml"
xml:base="http://example.org/entry/3" xml:lang="en-US">
<div xmlns="http://www.w3.org/1999/xhtml">Watch out for
<span style="background: url(javascript:window.location='http://example.org/')">
nasty tricks</span></div>
</content>
</entry>
</feed>"""
In [9]:
# Parse the Atom 1.0 sample and print its feed-level fields one per line.
d = feedparser.parse(atom10)
for field in ('title', 'link', 'subtitle', 'updated', 'updated_parsed', 'id'):
    print(getattr(d.feed, field))
In [10]:
# Print the fields of the first Atom entry, one per line.
first_entry = d.entries[0]
for field in (
    'title',
    'link',
    'id',
    'published',
    'published_parsed',
    'updated',
    'updated_parsed',
    'summary',
    'content',
):
    print(getattr(first_entry, field))
> **Note:** The parsed summary and content are not the same as they appear in the
> original feed. The original elements contained dangerous HTML (HyperText Markup
> Language) markup, which was sanitized. See the "Sanitization" section of the
> Universal Feed Parser documentation for details.
Because Atom entries can have more than one content element, `d.entries[0].content` is a list of dictionaries. Each dictionary contains metadata about a single content element. The two most important values in the dictionary are the content type, in `d.entries[0].content[0].type`, and the actual content value, in `d.entries[0].content[0].value`.
You can get this level of detail on other Atom elements too.
Several Atom elements share the Atom content model: title, subtitle, rights, summary, and of course content. (Atom 0.3 also had an info element which shared this content model.) Universal Feed Parser captures all relevant metadata about these elements, most importantly the content type and the value itself.
In [11]:
# Feed-level text constructs: print the detail record of each.
for field in ('title_detail', 'subtitle_detail', 'rights_detail'):
    print(getattr(d.feed, field))
# Entry-level text constructs of the first entry.
first_entry = d.entries[0]
for field in ('title_detail', 'summary_detail'):
    print(getattr(first_entry, field))
# content is a list: print how many elements it has, then the first one.
contents = first_entry.content
print(len(contents))
print(contents[0])