数据抓取:

抓取47年政府工作报告



王成军

wangchengjun@nju.edu.cn

计算传播网 http://computational-communication.com


In [1]:
import re
import urllib2
from urlparse import urljoin

from bs4 import BeautifulSoup

In [111]:
# Embed the index page in the notebook so its structure can be inspected inline.
# NOTE(review): display_html is imported here but not used in this cell.
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/ width=1000 height=500></iframe>')
# The iframe above shows the web page we would like to crawl.


Out[111]:

Inspect

· 2016年政府工作报告

<td width="274" class="bl">· <a href="./d12qgrdzfbg/201603/t20160318_369509.html" target="_blank" title="2016年政府工作报告">2016年政府工作报告</a></td>

In [102]:
# Fetch the index page and collect the table cells holding the link for each year.
url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/"
# The downloaded bytes are decoded as gb18030 before being parsed.
content = urllib2.urlopen(url).read().decode('gb18030')
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(content, 'html.parser')
# attrs is a dict mapping attribute name to the wanted value. (A set literal
# such as {'class', 'bl'} would be treated as a list of alternative class
# names, i.e. "class is 'class' or 'bl'", which only matches by accident.)
links = soup.find_all('td', {'class': 'bl'})

decode：网页内容是中文 GB 系列编码的字节串，需要先用 gb18030 解码为 unicode，再交给解析器：

urllib2.urlopen(url).read().decode('gb18030')

html.parser：显式指定 Python 内置的 HTML 解析器，避免 BeautifulSoup 根据本机已安装的库自动选择不同的解析器：

BeautifulSoup(content, 'html.parser')


In [103]:
# Same query written with the class_ keyword argument of find_all.
links = soup.find_all('td', class_='bl') 
# Count the matched cells.
print len(links)


47

In [104]:
# Look at the first matched <td> cell.
print links[0]


<td class="bl" width="274">· <a href="./d12qgrdzfbg/201603/t20160318_369509.html" target="_blank" title="2016年政府工作报告">2016年政府工作报告</a></td>

In [106]:
# .a gives the first <a> tag inside the cell.
print links[0].a


<a href="./d12qgrdzfbg/201603/t20160318_369509.html" target="_blank" title="2016年政府工作报告">2016年政府工作报告</a>

In [107]:
# The link target is stored in the anchor's href attribute.
print links[0].a['href']


./d12qgrdzfbg/201603/t20160318_369509.html

In [108]:
# The href is relative ('./...'); splitting on './' yields an empty prefix and the path.
print links[0].a['href'].split('./')


[u'', u'd12qgrdzfbg/201603/t20160318_369509.html']

In [109]:
# Keep only the part after './'.
print links[0].a['href'].split('./')[1]


d12qgrdzfbg/201603/t20160318_369509.html

In [110]:
# Prepend the index URL to obtain the absolute address of the report page.
print url + links[0].a['href'].split('./')[1]


http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html

In [94]:
# Build the absolute URL of every report page.
# urljoin resolves each relative './...' href against the index URL and also
# copes with hrefs that do not contain './' (absolute or '../' links), where
# split('./')[1] would raise IndexError.
# Cells without an <a href=...> are skipped instead of raising.
hyperlinks = [urljoin(url, cell.a['href'])
              for cell in links
              if cell.a is not None and cell.a.get('href')]

In [95]:
# Display the full list of report URLs.
hyperlinks


Out[95]:
[u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201503/t20150318_319434.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201403/t20140315_270863.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201402/t20140214_266528.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201402/t20140214_266527.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201103/t20110315_153641.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201003/t20100315_44772.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/200908/t20090817_27504.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/200908/t20090817_27495.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27765.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27757.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27756.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27753.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27744.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27741.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27738.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27737.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27736.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27709.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27708.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27707.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27706.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27705.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27702.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27700.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27699.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27678.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27644.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27642.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27640.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27616.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27615.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27614.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27613.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27612.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27611.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27567.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27566.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_7/200908/t20090818_27565.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_8/200908/t20090818_27564.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_9/200908/t20090818_27562.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_9/200908/t20090818_27563.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27561.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27560.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27559.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27558.html']

In [116]:
hyperlinks[9] # the 2007 report is split across several pages


Out[116]:
u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'

In [113]:
# Embed the first page of the 2007 report to inspect its pagination links.
# NOTE(review): display_html is imported here but not used in this cell.
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html \
width=1000 height=500></iframe>')
# the 2007 report is split across several pages


Out[113]:

Inspect 下一页

下一页

<a href="t20090818_27775_1.html"><span style="color:#0033FF;font-weight:bold">下一页</span></a>

  • a
    • script
      • td

In [119]:
# Download the first page of the paginated 2007 report and list its <script> tags.
url_i = 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'
content = urllib2.urlopen(url_i).read().decode('gb18030')
soup = BeautifulSoup(content, features='html.parser')
# Calling the soup object directly is shorthand for find_all.
scripts = soup('script')
scripts[0]


Out[119]:
<script>\n\nfunction add(){\nwindow.external.AddFavorite("\u4f60\u7684\u7f51\u5740","\u6807\u9898");\n}\n\n\nfunction setHome(obj){\nvar tmp = new Object();\ntmp = obj;\ntmp.style.behavior="url(#default#homepage)";\ntmp.setHomePage("\u4f60\u7684\u7f51\u5740");\n}\n\n</script>

In [142]:
# The second <script> tag contains the pagination logic (it declares countPage).
print scripts[1].text


	var currentPage = 0;//所在页从0开始
	var prevPage = currentPage-1//上一页
	var 下一页Page = currentPage+1//下一页
	var countPage = 4//共多少页
	//document.write("共"+countPage+"页&nbsp;&nbsp;");
	
	//循环
	var num = 17;
	for(var i=0+(currentPage-1-(currentPage-1)%num) ; i<=(num+(currentPage-1-(currentPage-1)%num))&&(i<countPage) ; i++){
		if(countPage >1){
			if(currentPage==i)
				document.write("【<span style=\"color:#FF0000;\" class=\"hui14_30_h\">"+(i+1)+"</span>】&nbsp;");
			else if(i==0)
				document.write("<a href=\"t20090818_27775.html\" class=\"hui14_30_h\">【"+(i+1)+"】</a>&nbsp;");
			else
				document.write("<a href=\"t20090818_27775"+"_" + i + "."+"html\" class=\"hui14_30_h\">【"+(i+1)+"】</a>&nbsp;");
		}	
	}
	
	document.write("<br><br>");
	//设置上一页代码
	if(countPage>1&&currentPage!=0&&currentPage!=1)
		document.write("<a href=\"t20090818_27775"+"_" + prevPage + "."+"html\"><span style=\"color:#0033FF;font-weight:bold\">上一页</span></a>&nbsp;");
	else if(countPage>1&&currentPage!=0&&currentPage==1)
		document.write("<a href=\"t20090818_27775.html\"><span style=\"color:#0033FF;font-weight:bold\">上一页</span></a>&nbsp;");
	//else
	//	document.write("上一页 &nbsp;");
	
	
	//设置下一页代码 
	if(countPage>1&&currentPage!=(countPage-1))
		document.write("<a href=\"t20090818_27775"+"_" + 下一页Page + "."+"html\" ><span style=\"color:#0033FF;font-weight:bold\">下一页</span></a> &nbsp;");
	//else
	//	document.write("下一页 &nbsp;");
					 
	

In [133]:
# Small demonstration: pyjsparser turns JavaScript source into a syntax tree of nested dicts.
from pyjsparser import PyJsParser
p = PyJsParser()
p.parse('var $ = "Hello!"')


Out[133]:
{'body': [{'declarations': [{'id': {'name': u'$', 'type': u'Identifier'},
     'init': {'raw': None, 'type': u'Literal', 'value': u'Hello!'},
     'type': u'VariableDeclarator'}],
   'kind': 'var',
   'type': u'VariableDeclaration'}],
 'type': u'Program'}

In [151]:
# Parse the pagination script into a syntax tree and display it.
jp = p.parse(scripts[1].text)
jp


Out[151]:
{'body': [{'declarations': [{'id': {'name': u'currentPage',
      'type': u'Identifier'},
     'init': {'raw': None, 'type': u'Literal', 'value': 0.0},
     'type': u'VariableDeclarator'}],
   'kind': 'var',
   'type': u'VariableDeclaration'},
  {'declarations': [{'id': {'name': u'prevPage', 'type': u'Identifier'},
     'init': {'left': {'name': u'currentPage', 'type': u'Identifier'},
      'operator': u'-',
      'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
      'type': u'BinaryExpression'},
     'type': u'VariableDeclarator'}],
   'kind': 'var',
   'type': u'VariableDeclaration'},
  {'declarations': [{'id': {'name': u'\u4e0b\u4e00\u9875Page',
      'type': u'Identifier'},
     'init': {'left': {'name': u'currentPage', 'type': u'Identifier'},
      'operator': u'+',
      'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
      'type': u'BinaryExpression'},
     'type': u'VariableDeclarator'}],
   'kind': 'var',
   'type': u'VariableDeclaration'},
  {'declarations': [{'id': {'name': u'countPage', 'type': u'Identifier'},
     'init': {'raw': None, 'type': u'Literal', 'value': 4.0},
     'type': u'VariableDeclarator'}],
   'kind': 'var',
   'type': u'VariableDeclaration'},
  {'declarations': [{'id': {'name': u'num', 'type': u'Identifier'},
     'init': {'raw': None, 'type': u'Literal', 'value': 17.0},
     'type': u'VariableDeclarator'}],
   'kind': 'var',
   'type': u'VariableDeclaration'},
  {'body': {'body': [{'alternate': None,
      'consequent': {'body': [{'alternate': {'alternate': {'expression': {'arguments': [{'left': {'left': {'left': {'left': {'left': {'left': {'raw': None,
                    'type': u'Literal',
                    'value': u'<a href="t20090818_27775'},
                   'operator': u'+',
                   'right': {'raw': None, 'type': u'Literal', 'value': u'_'},
                   'type': u'BinaryExpression'},
                  'operator': u'+',
                  'right': {'name': u'i', 'type': u'Identifier'},
                  'type': u'BinaryExpression'},
                 'operator': u'+',
                 'right': {'raw': None, 'type': u'Literal', 'value': u'.'},
                 'type': u'BinaryExpression'},
                'operator': u'+',
                'right': {'raw': None,
                 'type': u'Literal',
                 'value': u'html" class="hui14_30_h">\u3010'},
                'type': u'BinaryExpression'},
               'operator': u'+',
               'right': {'left': {'name': u'i', 'type': u'Identifier'},
                'operator': u'+',
                'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
                'type': u'BinaryExpression'},
               'type': u'BinaryExpression'},
              'operator': u'+',
              'right': {'raw': None,
               'type': u'Literal',
               'value': u'\u3011</a>&nbsp;'},
              'type': u'BinaryExpression'}],
            'callee': {'computed': False,
             'object': {'name': u'document', 'type': u'Identifier'},
             'property': {'name': u'write', 'type': u'Identifier'},
             'type': u'MemberExpression'},
            'type': u'CallExpression'},
           'type': u'ExpressionStatement'},
          'consequent': {'expression': {'arguments': [{'left': {'left': {'raw': None,
                'type': u'Literal',
                'value': u'<a href="t20090818_27775.html" class="hui14_30_h">\u3010'},
               'operator': u'+',
               'right': {'left': {'name': u'i', 'type': u'Identifier'},
                'operator': u'+',
                'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
                'type': u'BinaryExpression'},
               'type': u'BinaryExpression'},
              'operator': u'+',
              'right': {'raw': None,
               'type': u'Literal',
               'value': u'\u3011</a>&nbsp;'},
              'type': u'BinaryExpression'}],
            'callee': {'computed': False,
             'object': {'name': u'document', 'type': u'Identifier'},
             'property': {'name': u'write', 'type': u'Identifier'},
             'type': u'MemberExpression'},
            'type': u'CallExpression'},
           'type': u'ExpressionStatement'},
          'test': {'left': {'name': u'i', 'type': u'Identifier'},
           'operator': u'==',
           'right': {'raw': None, 'type': u'Literal', 'value': 0.0},
           'type': u'BinaryExpression'},
          'type': u'IfStatement'},
         'consequent': {'expression': {'arguments': [{'left': {'left': {'raw': None,
               'type': u'Literal',
               'value': u'\u3010<span style="color:#FF0000;" class="hui14_30_h">'},
              'operator': u'+',
              'right': {'left': {'name': u'i', 'type': u'Identifier'},
               'operator': u'+',
               'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
               'type': u'BinaryExpression'},
              'type': u'BinaryExpression'},
             'operator': u'+',
             'right': {'raw': None,
              'type': u'Literal',
              'value': u'</span>\u3011&nbsp;'},
             'type': u'BinaryExpression'}],
           'callee': {'computed': False,
            'object': {'name': u'document', 'type': u'Identifier'},
            'property': {'name': u'write', 'type': u'Identifier'},
            'type': u'MemberExpression'},
           'type': u'CallExpression'},
          'type': u'ExpressionStatement'},
         'test': {'left': {'name': u'currentPage', 'type': u'Identifier'},
          'operator': u'==',
          'right': {'name': u'i', 'type': u'Identifier'},
          'type': u'BinaryExpression'},
         'type': u'IfStatement'}],
       'type': u'BlockStatement'},
      'test': {'left': {'name': u'countPage', 'type': u'Identifier'},
       'operator': u'>',
       'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
       'type': u'BinaryExpression'},
      'type': u'IfStatement'}],
    'type': u'BlockStatement'},
   'init': {'declarations': [{'id': {'name': u'i', 'type': u'Identifier'},
      'init': {'left': {'raw': None, 'type': u'Literal', 'value': 0.0},
       'operator': u'+',
       'right': {'left': {'left': {'name': u'currentPage',
          'type': u'Identifier'},
         'operator': u'-',
         'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
         'type': u'BinaryExpression'},
        'operator': u'-',
        'right': {'left': {'left': {'name': u'currentPage',
           'type': u'Identifier'},
          'operator': u'-',
          'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
          'type': u'BinaryExpression'},
         'operator': u'%',
         'right': {'name': u'num', 'type': u'Identifier'},
         'type': u'BinaryExpression'},
        'type': u'BinaryExpression'},
       'type': u'BinaryExpression'},
      'type': u'VariableDeclarator'}],
    'kind': 'var',
    'type': u'VariableDeclaration'},
   'test': {'left': {'left': {'name': u'i', 'type': u'Identifier'},
     'operator': u'<=',
     'right': {'left': {'name': u'num', 'type': u'Identifier'},
      'operator': u'+',
      'right': {'left': {'left': {'name': u'currentPage',
         'type': u'Identifier'},
        'operator': u'-',
        'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
        'type': u'BinaryExpression'},
       'operator': u'-',
       'right': {'left': {'left': {'name': u'currentPage',
          'type': u'Identifier'},
         'operator': u'-',
         'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
         'type': u'BinaryExpression'},
        'operator': u'%',
        'right': {'name': u'num', 'type': u'Identifier'},
        'type': u'BinaryExpression'},
       'type': u'BinaryExpression'},
      'type': u'BinaryExpression'},
     'type': u'BinaryExpression'},
    'operator': u'&&',
    'right': {'left': {'name': u'i', 'type': u'Identifier'},
     'operator': u'<',
     'right': {'name': u'countPage', 'type': u'Identifier'},
     'type': u'BinaryExpression'},
    'type': u'LogicalExpression'},
   'type': u'ForStatement',
   'update': {'argument': {'name': u'i', 'type': u'Identifier'},
    'operator': u'++',
    'prefix': False,
    'type': u'UpdateExpression'}},
  {'expression': {'arguments': [{'raw': None,
      'type': u'Literal',
      'value': u'<br><br>'}],
    'callee': {'computed': False,
     'object': {'name': u'document', 'type': u'Identifier'},
     'property': {'name': u'write', 'type': u'Identifier'},
     'type': u'MemberExpression'},
    'type': u'CallExpression'},
   'type': u'ExpressionStatement'},
  {'alternate': {'alternate': None,
    'consequent': {'expression': {'arguments': [{'raw': None,
        'type': u'Literal',
        'value': u'<a href="t20090818_27775.html"><span style="color:#0033FF;font-weight:bold">\u4e0a\u4e00\u9875</span></a>&nbsp;'}],
      'callee': {'computed': False,
       'object': {'name': u'document', 'type': u'Identifier'},
       'property': {'name': u'write', 'type': u'Identifier'},
       'type': u'MemberExpression'},
      'type': u'CallExpression'},
     'type': u'ExpressionStatement'},
    'test': {'left': {'left': {'left': {'name': u'countPage',
        'type': u'Identifier'},
       'operator': u'>',
       'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
       'type': u'BinaryExpression'},
      'operator': u'&&',
      'right': {'left': {'name': u'currentPage', 'type': u'Identifier'},
       'operator': u'!=',
       'right': {'raw': None, 'type': u'Literal', 'value': 0.0},
       'type': u'BinaryExpression'},
      'type': u'LogicalExpression'},
     'operator': u'&&',
     'right': {'left': {'name': u'currentPage', 'type': u'Identifier'},
      'operator': u'==',
      'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
      'type': u'BinaryExpression'},
     'type': u'LogicalExpression'},
    'type': u'IfStatement'},
   'consequent': {'expression': {'arguments': [{'left': {'left': {'left': {'left': {'raw': None,
           'type': u'Literal',
           'value': u'<a href="t20090818_27775'},
          'operator': u'+',
          'right': {'raw': None, 'type': u'Literal', 'value': u'_'},
          'type': u'BinaryExpression'},
         'operator': u'+',
         'right': {'name': u'prevPage', 'type': u'Identifier'},
         'type': u'BinaryExpression'},
        'operator': u'+',
        'right': {'raw': None, 'type': u'Literal', 'value': u'.'},
        'type': u'BinaryExpression'},
       'operator': u'+',
       'right': {'raw': None,
        'type': u'Literal',
        'value': u'html"><span style="color:#0033FF;font-weight:bold">\u4e0a\u4e00\u9875</span></a>&nbsp;'},
       'type': u'BinaryExpression'}],
     'callee': {'computed': False,
      'object': {'name': u'document', 'type': u'Identifier'},
      'property': {'name': u'write', 'type': u'Identifier'},
      'type': u'MemberExpression'},
     'type': u'CallExpression'},
    'type': u'ExpressionStatement'},
   'test': {'left': {'left': {'left': {'name': u'countPage',
       'type': u'Identifier'},
      'operator': u'>',
      'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
      'type': u'BinaryExpression'},
     'operator': u'&&',
     'right': {'left': {'name': u'currentPage', 'type': u'Identifier'},
      'operator': u'!=',
      'right': {'raw': None, 'type': u'Literal', 'value': 0.0},
      'type': u'BinaryExpression'},
     'type': u'LogicalExpression'},
    'operator': u'&&',
    'right': {'left': {'name': u'currentPage', 'type': u'Identifier'},
     'operator': u'!=',
     'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
     'type': u'BinaryExpression'},
    'type': u'LogicalExpression'},
   'type': u'IfStatement'},
  {'alternate': None,
   'consequent': {'expression': {'arguments': [{'left': {'left': {'left': {'left': {'raw': None,
           'type': u'Literal',
           'value': u'<a href="t20090818_27775'},
          'operator': u'+',
          'right': {'raw': None, 'type': u'Literal', 'value': u'_'},
          'type': u'BinaryExpression'},
         'operator': u'+',
         'right': {'name': u'\u4e0b\u4e00\u9875Page', 'type': u'Identifier'},
         'type': u'BinaryExpression'},
        'operator': u'+',
        'right': {'raw': None, 'type': u'Literal', 'value': u'.'},
        'type': u'BinaryExpression'},
       'operator': u'+',
       'right': {'raw': None,
        'type': u'Literal',
        'value': u'html" ><span style="color:#0033FF;font-weight:bold">\u4e0b\u4e00\u9875</span></a> &nbsp;'},
       'type': u'BinaryExpression'}],
     'callee': {'computed': False,
      'object': {'name': u'document', 'type': u'Identifier'},
      'property': {'name': u'write', 'type': u'Identifier'},
      'type': u'MemberExpression'},
     'type': u'CallExpression'},
    'type': u'ExpressionStatement'},
   'test': {'left': {'left': {'name': u'countPage', 'type': u'Identifier'},
     'operator': u'>',
     'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
     'type': u'BinaryExpression'},
    'operator': u'&&',
    'right': {'left': {'name': u'currentPage', 'type': u'Identifier'},
     'operator': u'!=',
     'right': {'left': {'name': u'countPage', 'type': u'Identifier'},
      'operator': u'-',
      'right': {'raw': None, 'type': u'Literal', 'value': 1.0},
      'type': u'BinaryExpression'},
     'type': u'BinaryExpression'},
    'type': u'LogicalExpression'},
   'type': u'IfStatement'}],
 'type': u'Program'}

In [154]:
# Type of the root node of the parsed script.
jp['type']


Out[154]:
u'Program'

In [155]:
# Number of top-level statements in the parsed script.
len(jp['body'])


Out[155]:
9

In [157]:
# The fourth top-level statement is the declaration of countPage.
jp['body'][3]


Out[157]:
{'declarations': [{'id': {'name': u'countPage', 'type': u'Identifier'},
   'init': {'raw': None, 'type': u'Literal', 'value': 4.0},
   'type': u'VariableDeclarator'}],
 'kind': 'var',
 'type': u'VariableDeclaration'}

In [163]:
# Drill down to the literal assigned to countPage (the parser returns it as a float).
jp['body'][3]['declarations'][0]['init']['value']


Out[163]:
4.0

In [124]:
# Pull the page count straight out of the script text. The regular expression
# tolerates any spacing around '=' and does not need the '//' comment that
# follows the number in this particular page.
page_match = re.search(r'countPage\s*=\s*(\d+)', scripts[1].text)
# Treat a script that does not declare countPage as a single-page report.
countPage = int(page_match.group(1)) if page_match else 1
countPage


Out[124]:
4

In [96]:
def _fetch_soup(page_url):
    """Download ``page_url``, decode it as gb18030 and parse it with html.parser."""
    response = urllib2.urlopen(page_url)
    try:
        html = response.read().decode('gb18030')
    finally:
        # Always release the connection, even if reading or decoding fails.
        response.close()
    # The same explicit parser is used for every page so that first and
    # follow-up pages are parsed consistently.
    return BeautifulSoup(html, 'html.parser')


def _paragraph_text(soup):
    """Concatenate the text of every <p> element in ``soup``."""
    return ''.join(p.text for p in soup('p'))


def _page_count(soup):
    """Return the page count declared as ``countPage`` in a <script>, or 1 if absent."""
    for script in soup.find_all('script'):
        match = re.search(r'countPage\s*=\s*(\d+)', script.text)
        if match:
            return int(match.group(1))
    return 1


def crawler(url_i):
    """Download one report, following its pagination.

    Parameters
    ----------
    url_i : str
        URL of the first page of the report (ending in ``.html``).

    Returns
    -------
    tuple
        ``(year, report)`` where ``year`` is the integer formed by the first
        four characters of the ``span.huang16c`` element and ``report`` is
        the text of all ``<p>`` elements of every page, in page order.

    Raises
    ------
    ValueError
        If the first page has no ``span.huang16c`` element, or its first
        four characters are not a number.
    urllib2.URLError
        If a page cannot be downloaded.
    """
    soup = _fetch_soup(url_i)

    # attrs must be a dict; a set literal would be read as alternative class names.
    heading = soup.find('span', {'class': 'huang16c'})
    if heading is None:
        raise ValueError('no span.huang16c element found on %s' % url_i)
    year = int(heading.text[:4])

    pages = [_paragraph_text(soup)]
    # Follow-up pages are named <first page>_1.html, <first page>_2.html, ...
    base = url_i.split('.html')[0]
    for i in range(1, _page_count(soup)):
        child_soup = _fetch_soup('%s_%d.html' % (base, i))
        pages.append(_paragraph_text(child_soup))
    return year, ''.join(pages)

In [98]:
# Crawl the text of the 47 government work reports, keyed by year.
reports = {}
for link in hyperlinks:
    year, report = crawler(link)
    reports.update({year: report})
    # Print each year as a simple progress indicator.
    print(year)


2016
2015
2014
2013
2012
2011
2010
2009
2008
2007
2006
2005
2004
2003
2002
2001
2000
1999
1998
1997
1996
1995
1994
1993
1992
1991
1990
1989
1988
1987
1986
1985
1984
1983
1982
1981
1980
1979
1978
1975
1964
1959
1960
1957
1956
1955
1954

In [99]:
# Download the 2016 report from Xinhua and join the text of its <p> elements.
# NOTE(review): report2016 is not referenced by any later cell visible here.
url2016 = 'http://news.xinhuanet.com/fortune/2016-03/05/c_128775704.htm'
content = urllib2.urlopen(url2016).read()
soup = BeautifulSoup(content, features='html.parser')
report2016 = ''.join(paragraph.text for paragraph in soup.find_all('p'))

In [12]:
# NOTE(review): absolute, machine-specific path; consider a path relative to
# the project's data directory so the notebook runs on other machines.
output_path = '/Users/chengjun/github/cjc2016/data/gov_reports1954-2016.txt'
with open(output_path, 'wb') as f:
    # Iterate in ascending year order: plain dict iteration order is
    # arbitrary, which would make the row order of the file unpredictable.
    for report_year in sorted(reports):
        # One report per line: "<year>\t<text>", with newlines inside the
        # text flattened to tabs so each report stays on a single line.
        line = str(report_year) + '\t' + reports[report_year].replace('\n', '\t') + '\n'
        f.write(line.encode('utf-8'))

This is the end.

Thank you for your attention.