ece-display: Education Counts

Python script to remix educationcounts govt site.

Display more info in one page.

Scrape links for excel files - download excel files to folder.

Remove excel files and instead display as more friendly json object and pandas dataframe for display.


In [50]:
import bs4
import requests
import dominate
from dominate.tags import *
import os
import shutil
import json
from urlparse import urlparse
from bs4 import BeautifulSoup

In [15]:
eceall = requests.get('https://www.educationcounts.govt.nz')

In [16]:
#eceall.text

  • Language use in ECE

    Statistics on language use in ECE, including tables on number of services, …

    
    
    In [17]:
    edcounz = bs4.BeautifulSoup(eceall.text)
    
    
    
    In [18]:
    eduht = edcounz.findAll('a')
    
    
    
    In [47]:
    urldict = dict()
    
    
    
    In [48]:
    urldict.update({'text' : (edu.text)})
    urldict.update({'url' : (edu.attrs['href'])})
    
    
    
    In [42]:
    #urldict.update('test':'blah')
    
    
    
    
      File "<ipython-input-42-547404844573>", line 1
        urldict.update('test':'blah')
                             ^
    SyntaxError: invalid syntax
    
    
    
    In [49]:
    urldict
    
    
    
    
    Out[49]:
    {'text': u'Know your Region',
     'url': 'https://www.educationcounts.govt.nz/know-your-region'}

    Create a RESTful api from the url. json structure that has title, slug, url, details, path

    Parse url removing https://www.educationcounts.govt.nz/ - seprate them into folders.

    The HTML of every link.

    create dict with every url on the page - currently only creating one dict.

    What is the unique key for each url?

    6198

    
    
    In [64]:
    urldict['url']
    
    
    
    
    Out[64]:
    'https://www.educationcounts.govt.nz/know-your-region'
    
    
    In [69]:
    reqsou = requests.get(urldict['url'])
    
    
    
    In [73]:
    urldictx = reqsou.text
    
    
    
    In [81]:
    urlx = urldictx.replace('\t', '')
    urlfx = urlx.replace('\n', '')
    
    
    
    In [83]:
    soup = BeautifulSoup(urlfx, 'html.parser')
    
    
    
    In [95]:
    htmlre = soup.prettify()
    
    
    
    In [96]:
    urldict.update({'html' : htmlre})
    
    
    
    In [97]:
    o = urlparse(urldict['url'])
    
    
    
    In [98]:
    urldict.update({'path' : o.path})
    
    
    
    In [1]:
    for edu in eduht:
            #print edu.text
            #print edu.attrs['href']
            minpath = urlparse(edu.attrs['href'])
            print minpath.path
            print edu
            #urldict.update({'path' : minpath.path})
    
    
    
    
    ---------------------------------------------------------------------------
    NameError                                 Traceback (most recent call last)
    <ipython-input-1-07b1a326922d> in <module>()
    ----> 1 for edu in eduht:
          2         #print edu.text
          3         #print edu.attrs['href']
          4         minpath = urlparse(edu.attrs['href'])
          5         print minpath.path
    
    NameError: name 'eduht' is not defined
    
    
    In [60]:
    urldict
    
    
    
    
    Out[60]:
    {'path': '',
     'text': u'Know your Region',
     'url': 'https://www.educationcounts.govt.nz/know-your-region'}
    
    
    In [ ]:
    
    
    
    
    In [20]:
    doc = dominate.document(title='Education Counts')
    
    with doc.head:
        link(rel='stylesheet', href='style.css')
        script(type='text/javascript', src='script.js')
    
    with doc:
        #with div(id='header').add(ol()):
            #for i in ['home', 'about', 'contact']:
                #li(a(i.title(), href='/%s.html' % i))
    
        with div():
            for edu in eduht:
                a(dominate.tags.p(edu.text), href = (edu.attrs['href']))
                #p(edu.txt)
                #dominate.tags.p(edu.attrs['href'])
                #a(dominate.tags.p(edu.text), href=dominate.tags.a(edu.attrs['href']))
                
                #print edu.text
    
    #print doc
    
    
    
    In [21]:
    docre = doc.render()
    #s = docre.decode('ascii', 'ignore')
    yourstring = docre.encode('ascii', 'ignore').decode('ascii')
    indfil = ('/home/wcmckee/educount/index.html')
    mkind = open(indfil, 'w')
    mkind.write(yourstring)
    mkind.close()
    
    
    
    In [22]:
    os.chdir('/home/wcmckee/educount/html/')
    
    
    
    In [23]:
    for edu in eduht:
        if 'http' in (edu.attrs['href']):
            print (edu.attrs['href'])
            response = requests.get((edu.attrs['href']), stream=True)
            with open(str(edu.text + '.html'), 'wb') as out_file:
                shutil.copyfileobj((response), out_file)
                del response
            #requests.get((edu.attrs['href']))
    
    
    
    
    https://www.educationcounts.govt.nz/know-your-region
    
    ---------------------------------------------------------------------------
    AttributeError                            Traceback (most recent call last)
    <ipython-input-23-c3639483d0b8> in <module>()
          4         response = requests.get((edu.attrs['href']), stream=True)
          5         with open(str(edu.text + '.html'), 'wb') as out_file:
    ----> 6             shutil.copyfileobj((response), out_file)
          7             del response
          8         #requests.get((edu.attrs['href']))
    
    /usr/lib/python2.7/shutil.pyc in copyfileobj(fsrc, fdst, length)
         47     """copy data from file-like object fsrc to file-like object fdst"""
         48     while 1:
    ---> 49         buf = fsrc.read(length)
         50         if not buf:
         51             break
    
    AttributeError: 'Response' object has no attribute 'read'
    
    
    In [ ]: