In [2]:
    
import requests
from bs4 import BeautifulSoup
    
In [3]:
    
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/ width=1000 height=500></iframe>')
# the webpage we would like to crawl
    
    Out[3]:
<td width="274" class="bl">· <a href="./d12qgrdzfbg/201603/t20160318_369509.html" target="_blank" title="2016年政府工作报告">2016年政府工作报告</a></td>
In [15]:
    
# get the link for each year
url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/" 
content = requests.get(url)
content.encoding
    
    Out[15]:
In [16]:
    
# Specify the encoding
content.encoding = 'utf8'  # alternatively 'gb18030'
content = content.text
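
If the declared encoding is uncertain, requests can also guess it from the response body; a minimal sketch (resp is a throwaway variable, not part of the notebook above):
resp = requests.get(url)
print(resp.encoding)            # what the HTTP headers claim
print(resp.apparent_encoding)   # what the body itself looks like
resp.encoding = resp.apparent_encoding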
    
In [17]:
    
soup = BeautifulSoup(content, 'html.parser') 
# links = soup.find_all('td', {'class': 'bl'})
links = soup.select('.bl a')
print(links[0])
    
    
In [18]:
    
len(links)
    
    Out[18]:
In [19]:
    
links[-1]['href']
    
    Out[19]:
In [20]:
    
links[0]['href'].split('./')[1]
    
    Out[20]:
In [21]:
    
url + links[0]['href'].split('./')[1]
    
    Out[21]:
In [22]:
    
hyperlinks = [url + i['href'].split('./')[1] for i in links]
hyperlinks[:5]
    
    Out[22]:
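
The same list can be built with urllib.parse.urljoin, which resolves the relative "./" prefix directly; an alternative sketch:
from urllib.parse import urljoin
# urljoin resolves './d12qgrdzfbg/...' against the base url
hyperlinks_alt = [urljoin(url, i['href']) for i in links]
hyperlinks_alt[:5]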
In [23]:
    
hyperlinks[-5:]
    
    Out[23]:
In [26]:
    
hyperlinks[12] # the 2007 report is split across several pages
    
    Out[26]:
In [30]:
    
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_3955570.html width=1000 height=500></iframe>')
# the 2007 report is split across several pages
    
    Out[30]:
In [39]:
    
url_i = 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_3955570.html'
content = requests.get(url_i)
content.encoding = 'utf8'
content = content.text
#content = content.text.encode(content.encoding).decode('gb18030')
soup = BeautifulSoup(content, 'html.parser') 
#scripts = soup.find_all('script')
#scripts[0]
scripts = soup.select('td script')[0]
    
In [40]:
    
scripts
    
    Out[40]:
In [41]:
    
scripts.text
    
    Out[41]:
In [42]:
    
# countPage = int(''.join(scripts).split('countPage = ')\
#                 [1].split('//')[0])
# countPage
countPage = int(scripts.text.split('countPage = ')[1].split('//')[0])
countPage
    
    Out[42]:
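
Splitting on the literal 'countPage = ' depends on the exact spacing in the page's inline JavaScript; a regular expression is a slightly more tolerant alternative (a sketch under the same assumption that countPage appears in the script):
import re
# look for "countPage = <number>" anywhere in the inline script
m = re.search(r'countPage\s*=\s*(\d+)', scripts.text)
count_page = int(m.group(1)) if m else 1
count_page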
In [43]:
    
import sys
def flushPrint(s):
    # overwrite the same console line to show crawling progress
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
    
def crawler(url_i):
    content = requests.get(url_i)
    content.encoding = 'utf8'  
    content = content.text
    soup = BeautifulSoup(content, 'html.parser') 
    year = soup.find('span', {'class': 'huang16c'}).text[:4]
    year = int(year)
    report = ''.join(s.text for s in soup('p'))
    # find the pagination information embedded in the page's scripts
    scripts = soup.find_all('script')
    countPage = int(''.join(scripts[1]).split('countPage = ')[1].split('//')[0])
    if countPage == 1:
        pass
    else:
        for i in range(1, countPage):
            url_child = url_i.split('.html')[0] +'_'+str(i)+'.html'
            content = requests.get(url_child)
            content.encoding = 'gb18030'
            content = content.text
            soup = BeautifulSoup(content, 'html.parser') 
            report_child = ''.join(s.text for s in soup('p'))
            report = report + report_child
    return year, report
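
A transient network error will abort the whole crawl loop below, so it can help to wrap crawler() in a small retry-and-sleep helper; crawl_with_retry is a hypothetical addition, not part of the original notebook:
import time

def crawl_with_retry(link, retries=3, delay=2):
    # try a few times, pausing between attempts to stay polite to the server
    for attempt in range(retries):
        try:
            return crawler(link)
        except Exception as e:
            flushPrint('retry %d for %s: %s' % (attempt + 1, link, e))
            time.sleep(delay)
    return None, ''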
    
In [44]:
    
# crawl the full text of the government work reports (roughly 50 years' worth)
reports = {}
for link in hyperlinks:
    year, report = crawler(link)
    flushPrint(year)
    reports[year] = report
    
    
In [45]:
    
with open('../data/gov_reports1954-2019.txt', 'w', encoding = 'utf8') as f:
    for r in reports:
        line = str(r)+'\t'+reports[r].replace('\n', '\t') +'\n'
        f.write(line)
    
In [46]:
    
import pandas as pd
df = pd.read_table('../data/gov_reports1954-2019.txt', names = ['year', 'report'])
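
The crawl dictionary is keyed by year but not necessarily in chronological order, so sorting after loading makes the table easier to inspect; an optional step:
# sort chronologically and check the range of years covered
df = df.sort_values('year').reset_index(drop=True)
df['year'].min(), df['year'].max()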
    
In [48]:
    
df[-5:]
    
    Out[48]: