In [1]:
import urllib, urllib2
import hashlib
import feedparser
from bs4 import BeautifulSoup
In [2]:
def toUtf8(ele):
    """Percent-decode (URL-unquote) ``ele`` and return it as unicode text.

    The original implementation only worked on Python 2
    (``urllib.unquote`` was moved in Python 3); this version keeps the
    exact Python 2 behavior while also running on Python 3.

    :param ele: unicode string possibly containing %XX escapes
    :return: unquoted unicode string (UTF-8 interpretation of the bytes)
    """
    try:
        unquote = urllib.unquote  # Python 2
    except AttributeError:
        # Python 3: operate on bytes so multi-byte UTF-8 escapes decode
        # the same way they did under Python 2.
        from urllib.parse import unquote_to_bytes as unquote
    return unquote(ele.encode("utf8")).decode("utf8")
In [3]:
def getContent(link):
    """Fetch ``link`` over HTTP and return the raw response body (bytes).

    Fixes a resource leak in the original: the response object returned
    by ``urlopen`` was never closed. Also works on both Python 2
    (``urllib2``) and Python 3 (``urllib.request``).

    :param link: URL to download
    :return: raw response body
    """
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3
    response = urlopen(link)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() raises.
        response.close()
In [4]:
def parseCnt(html):
    """Extract the article body text from an article HTML page.

    Looks for the first ``<div itemprop="articleBody">`` element and
    returns its text content; returns an empty string when no such
    element exists.
    """
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find("div", attrs={"itemprop": "articleBody"})
    if body is None:
        return ""
    return body.get_text()
In [5]:
# RSS feed URLs for Yahoo Taiwan News sentiment categories.
# Fix: the "難過" (sad) entry was a copy-paste duplicate of the
# "火大" (angry) URL; it now points at the "sad" sentiment feed.
urls = {
"實用":"https://tw.news.yahoo.com/sentiment/informative/rss",
"感人":"https://tw.news.yahoo.com/sentiment/warm/rss/",
"開心":"https://tw.news.yahoo.com/sentiment/happy/rss/",
"超扯":"https://tw.news.yahoo.com/sentiment/odd/rss/",
"害怕":"https://tw.news.yahoo.com/sentiment/worried/rss/",
"火大":"https://tw.news.yahoo.com/sentiment/angry/rss/",
"難過":"https://tw.news.yahoo.com/sentiment/sad/rss/" }
In [6]:
# Download and parse the "火大" (angry) sentiment RSS feed (network I/O).
feed = feedparser.parse( urls["火大"] )
In [7]:
def getHash(item_id):
    """Return the RIPEMD-160 hex digest of ``item_id`` (UTF-8 encoded).

    Used as a stable fingerprint for feed entry ids.
    NOTE(review): 'ripemd160' availability depends on the OpenSSL build
    backing hashlib — confirm on the deployment environment.
    """
    digest = hashlib.new('ripemd160', item_id.encode("utf8"))
    return digest.hexdigest()
In [8]:
# Preview the first two feed entries: title, guid, guid hash, and metadata.
# Fix: the original printed the same getHash(...) value twice in a row.
# Single-argument print(...) calls behave identically under Python 2
# (parenthesized expression) and Python 3.
for news_item in feed['entries'][:2]:
    print("=" * 10)
    print(toUtf8(news_item['title']))
    print(toUtf8(news_item['id']))
    print(getHash(toUtf8(news_item['id'])))
    # Uncomment to also fetch and print the full article body:
    # cnt = getContent(news_item['link'])
    # print(parseCnt(cnt))
    print(news_item.keys())
    print(news_item['published_parsed'])
In [ ]: