ece-display: Education Counts

Python script to remix educationcounts govt site.

Display more info in one page.

Scrape links for excel files - download excel files to folder.

Remove excel files and instead display as more friendly json object and pandas dataframe for display.



In [50]:

    
import bs4
import requests
import dominate
from dominate.tags import *
import os
import shutil
import json
from urlparse import urlparse
from bs4 import BeautifulSoup



In [15]:

    
eceall = requests.get('https://www.educationcounts.govt.nz')



In [16]:

    
#eceall.text

Language use in ECE

Statistics on language use in ECE, including tables on number of services, …



In [17]:

    
edcounz = bs4.BeautifulSoup(eceall.text)



In [18]:

    
eduht = edcounz.findAll('a')



In [47]:

    
urldict = dict()



In [48]:

    
urldict.update({'text' : (edu.text)})
urldict.update({'url' : (edu.attrs['href'])})



In [42]:

    
#urldict.update('test':'blah')









    



  File "<ipython-input-42-547404844573>", line 1
    urldict.update('test':'blah')
                         ^
SyntaxError: invalid syntax



In [49]:

    
urldict









    Out[49]:





{'text': u'Know your Region',
 'url': 'https://www.educationcounts.govt.nz/know-your-region'}

Create a RESTful api from the url. json structure that has title, slug, url, details, path

Parse url removing https://www.educationcounts.govt.nz/ - seprate them into folders.

The HTML of every link.

create dict with every url on the page - currently only creating one dict.

What is the unique key for each url?

6198



In [64]:

    
urldict['url']









    Out[64]:





'https://www.educationcounts.govt.nz/know-your-region'



In [69]:

    
reqsou = requests.get(urldict['url'])



In [73]:

    
urldictx = reqsou.text



In [81]:

    
urlx = urldictx.replace('\t', '')
urlfx = urlx.replace('\n', '')



In [83]:

    
soup = BeautifulSoup(urlfx, 'html.parser')



In [95]:

    
htmlre = soup.prettify()



In [96]:

    
urldict.update({'html' : htmlre})



In [97]:

    
o = urlparse(urldict['url'])



In [98]:

    
urldict.update({'path' : o.path})



In [1]:

    
for edu in eduht:
        #print edu.text
        #print edu.attrs['href']
        minpath = urlparse(edu.attrs['href'])
        print minpath.path
        print edu
        #urldict.update({'path' : minpath.path})









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-07b1a326922d> in <module>()
----> 1 for edu in eduht:
      2         #print edu.text
      3         #print edu.attrs['href']
      4         minpath = urlparse(edu.attrs['href'])
      5         print minpath.path

NameError: name 'eduht' is not defined



In [60]:

    
urldict









    Out[60]:





{'path': '',
 'text': u'Know your Region',
 'url': 'https://www.educationcounts.govt.nz/know-your-region'}



In [ ]:



In [20]:

    
doc = dominate.document(title='Education Counts')

with doc.head:
    link(rel='stylesheet', href='style.css')
    script(type='text/javascript', src='script.js')

with doc:
    #with div(id='header').add(ol()):
        #for i in ['home', 'about', 'contact']:
            #li(a(i.title(), href='/%s.html' % i))

    with div():
        for edu in eduht:
            a(dominate.tags.p(edu.text), href = (edu.attrs['href']))
            #p(edu.txt)
            #dominate.tags.p(edu.attrs['href'])
            #a(dominate.tags.p(edu.text), href=dominate.tags.a(edu.attrs['href']))
            
            #print edu.text

#print doc



In [21]:

    
docre = doc.render()
#s = docre.decode('ascii', 'ignore')
yourstring = docre.encode('ascii', 'ignore').decode('ascii')
indfil = ('/home/wcmckee/educount/index.html')
mkind = open(indfil, 'w')
mkind.write(yourstring)
mkind.close()



In [22]:

    
os.chdir('/home/wcmckee/educount/html/')



In [23]:

    
for edu in eduht:
    if 'http' in (edu.attrs['href']):
        print (edu.attrs['href'])
        response = requests.get((edu.attrs['href']), stream=True)
        with open(str(edu.text + '.html'), 'wb') as out_file:
            shutil.copyfileobj((response), out_file)
            del response
        #requests.get((edu.attrs['href']))









    



https://www.educationcounts.govt.nz/know-your-region






    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-23-c3639483d0b8> in <module>()
      4         response = requests.get((edu.attrs['href']), stream=True)
      5         with open(str(edu.text + '.html'), 'wb') as out_file:
----> 6             shutil.copyfileobj((response), out_file)
      7             del response
      8         #requests.get((edu.attrs['href']))

/usr/lib/python2.7/shutil.pyc in copyfileobj(fsrc, fdst, length)
     47     """copy data from file-like object fsrc to file-like object fdst"""
     48     while 1:
---> 49         buf = fsrc.read(length)
     50         if not buf:
     51             break

AttributeError: 'Response' object has no attribute 'read'



In [ ]: