ece-display: Education Counts

Python script to remix educationcounts govt site.

Display more info in one page.

Scrape links for excel files - download excel files to folder.

Remove excel files and instead display as more friendly json object and pandas dataframe for display.

In [50]:
import bs4
import requests
import dominate
from dominate.tags import *
import os
import shutil
import json
from urlparse import urlparse
from bs4 import BeautifulSoup

In [15]:
eceall = requests.get('')

In [16]:

  • Language use in ECE

    Statistics on language use in ECE, including tables on number of services, …

    In [17]:
    edcounz = bs4.BeautifulSoup(eceall.text)
    In [18]:
    eduht = edcounz.findAll('a')
    In [47]:
    urldict = dict()
    In [48]:
    urldict.update({'text' : (edu.text)})
    urldict.update({'url' : (edu.attrs['href'])})
    In [42]:
      File "<ipython-input-42-547404844573>", line 1
    SyntaxError: invalid syntax
    In [49]:
    {'text': u'Know your Region',
     'url': ''}

    Create a RESTful api from the url. json structure that has title, slug, url, details, path

    Parse url removing - seprate them into folders.

    The HTML of every link.

    create dict with every url on the page - currently only creating one dict.

    What is the unique key for each url?


    In [64]:
    In [69]:
    reqsou = requests.get(urldict['url'])
    In [73]:
    urldictx = reqsou.text
    In [81]:
    urlx = urldictx.replace('\t', '')
    urlfx = urlx.replace('\n', '')
    In [83]:
    soup = BeautifulSoup(urlfx, 'html.parser')
    In [95]:
    htmlre = soup.prettify()
    In [96]:
    urldict.update({'html' : htmlre})
    In [97]:
    o = urlparse(urldict['url'])
    In [98]:
    urldict.update({'path' : o.path})
    In [1]:
    for edu in eduht:
            #print edu.text
            #print edu.attrs['href']
            minpath = urlparse(edu.attrs['href'])
            print minpath.path
            print edu
            #urldict.update({'path' : minpath.path})
    NameError                                 Traceback (most recent call last)
    <ipython-input-1-07b1a326922d> in <module>()
    ----> 1 for edu in eduht:
          2         #print edu.text
          3         #print edu.attrs['href']
          4         minpath = urlparse(edu.attrs['href'])
          5         print minpath.path
    NameError: name 'eduht' is not defined
    In [60]:
    {'path': '',
     'text': u'Know your Region',
     'url': ''}
    In [20]:
    doc = dominate.document(title='Education Counts')
    with doc.head:
        link(rel='stylesheet', href='style.css')
        script(type='text/javascript', src='script.js')
    with doc:
        #with div(id='header').add(ol()):
            #for i in ['home', 'about', 'contact']:
                #li(a(i.title(), href='/%s.html' % i))
        with div():
            for edu in eduht:
                a(dominate.tags.p(edu.text), href = (edu.attrs['href']))
                #a(dominate.tags.p(edu.text), href=dominate.tags.a(edu.attrs['href']))
                #print edu.text
    #print doc
    In [21]:
    docre = doc.render()
    #s = docre.decode('ascii', 'ignore')
    yourstring = docre.encode('ascii', 'ignore').decode('ascii')
    indfil = ('/home/wcmckee/educount/index.html')
    mkind = open(indfil, 'w')
    In [22]:
    In [23]:
    for edu in eduht:
        if 'http' in (edu.attrs['href']):
            print (edu.attrs['href'])
            response = requests.get((edu.attrs['href']), stream=True)
            with open(str(edu.text + '.html'), 'wb') as out_file:
                shutil.copyfileobj((response), out_file)
                del response
    AttributeError                            Traceback (most recent call last)
    <ipython-input-23-c3639483d0b8> in <module>()
          4         response = requests.get((edu.attrs['href']), stream=True)
          5         with open(str(edu.text + '.html'), 'wb') as out_file:
    ----> 6             shutil.copyfileobj((response), out_file)
          7             del response
          8         #requests.get((edu.attrs['href']))
    /usr/lib/python2.7/shutil.pyc in copyfileobj(fsrc, fdst, length)
         47     """copy data from file-like object fsrc to file-like object fdst"""
         48     while 1:
    ---> 49         buf =
         50         if not buf:
         51             break
    AttributeError: 'Response' object has no attribute 'read'
    In [ ]: