In [1]:
import urllib, urllib2
import hashlib
import feedparser
from bs4 import BeautifulSoup

In [2]:
def toUtf8(ele):
    # despite the name, this URL-unquotes a percent-encoded id and returns it as unicode
    return urllib.unquote(ele.encode("utf8")).decode('utf8')
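
A quick sanity check of toUtf8: it unquotes a percent-encoded string and hands back unicode. The input below is made up purely for illustration.

In [ ]:
# sketch: round-trip a percent-encoded string (sample value is invented)
print toUtf8(u"%E5%8F%B0%E7%81%A3-123456789")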

In [3]:
def getContent(link):
    # fetch the raw HTML of an article page
    return urllib2.urlopen(link).read()

In [4]:
def parseCnt(html):
    # extract the article body text; Yahoo News marks it with itemprop="articleBody"
    soup = BeautifulSoup(html, 'html.parser')
    b_div = soup.find_all("div", attrs={"itemprop":"articleBody"})
    if len(b_div) > 0:
        return b_div[0].get_text()
    return ""

In [5]:
# RSS feeds for each Yahoo News sentiment category
urls = {
    "實用":"https://tw.news.yahoo.com/sentiment/informative/rss",
    "感人":"https://tw.news.yahoo.com/sentiment/warm/rss/",
    "開心":"https://tw.news.yahoo.com/sentiment/happy/rss/",
    "超扯":"https://tw.news.yahoo.com/sentiment/odd/rss/",
    "害怕":"https://tw.news.yahoo.com/sentiment/worried/rss/",
    "火大":"https://tw.news.yahoo.com/sentiment/angry/rss/",
    "難過":"https://tw.news.yahoo.com/sentiment/angry/rss/" }  # note: 難過 currently reuses the angry feed URL, likely a copy-paste slip

In [6]:
feed = feedparser.parse( urls["火大"] )  # parse the "angry" sentiment feed
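
A few hedged sanity checks on the parsed feed; the fields used here are feedparser's standard keys, and the actual values depend on whatever the feed returns at run time.

In [ ]:
# sketch: confirm the feed parsed and has entries before iterating over it
print feed['feed'].get('title', '')
print len(feed['entries'])
print feed['entries'][0]['link']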

In [7]:
def getHash(item_id):
    # RIPEMD-160 digest of the item id, a stable key for each entry
    h = hashlib.new('ripemd160')
    h.update(item_id.encode("utf8"))
    return h.hexdigest()
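
A small check that getHash is deterministic, so the same id always maps to the same 40-character hex key. The id string below is made up, and ripemd160 is assumed to be available in the local OpenSSL build, as in the cells above.

In [ ]:
# sketch: the same (invented) id hashes to the same digest every time
print getHash(u"some-article-id-123")
print getHash(u"some-article-id-123")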

In [8]:
# preview the first two entries: title, decoded id, its hash, and metadata
for news_item in feed['entries'][:2]:
    print "="*10
    print toUtf8(news_item['title']) 
    print toUtf8(news_item['id'])
    print getHash(toUtf8(news_item['id']))
    print getHash(toUtf8(news_item['id']))
#     cnt = getContent(news_item['link'])
#     print parseCnt(cnt)
    print news_item.keys()
    print news_item['published_parsed']


==========
台灣國貼紙 網友譙「死道友不死貧道」 楊志良:林昶佐欠馬政府一個道歉
台灣國貼紙-網友譙-死道友不死貧道-楊志良-林昶佐欠馬政府-個道歉-215007069
587f792a0df572afaba9b2d629e5da8d8c803f52
587f792a0df572afaba9b2d629e5da8d8c803f52
['summary_detail', 'published_parsed', 'links', 'title', 'summary', 'guidislink', 'title_detail', 'link', 'published', 'id']
time.struct_time(tm_year=2016, tm_mon=5, tm_mday=23, tm_hour=21, tm_min=50, tm_sec=7, tm_wday=0, tm_yday=144, tm_isdst=0)
==========
政院撤告太陽花 基層警察:亂了
政院撤告太陽花-基層警察-亂了-193500682
8b584173ba40b0ddd48788e12c03ddbea9b4903a
8b584173ba40b0ddd48788e12c03ddbea9b4903a
['summary_detail', 'published_parsed', 'links', 'title', 'summary', 'guidislink', 'title_detail', 'link', 'published', 'id']
time.struct_time(tm_year=2016, tm_mon=5, tm_mday=24, tm_hour=19, tm_min=35, tm_sec=0, tm_wday=1, tm_yday=145, tm_isdst=0)
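
One plausible next step, sketched under the assumption that the hash is meant as a deduplication key: collect entries into a dict keyed by getHash so repeated fetches of the feed don't store the same story twice. The seen dict and its field names are made up for this sketch.

In [ ]:
# sketch: key each entry by its RIPEMD-160 digest to avoid storing duplicates
seen = {}
for news_item in feed['entries']:
    key = getHash(toUtf8(news_item['id']))
    if key not in seen:
        seen[key] = {
            "title": toUtf8(news_item['title']),
            "link": news_item['link'],
            "published": news_item['published_parsed'],
        }
print len(seen)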

In [ ]: