ece-display: Education Counts
Python script to remix the Education Counts government site (educationcounts.govt.nz).
Display more information on one page.
Scrape the links to Excel files and download the Excel files to a folder.
Replace the Excel files with a friendlier JSON object and a pandas DataFrame for display.
In [50]:
    
import bs4
import requests
import dominate
from dominate.tags import *
import os
import shutil
import json
from urlparse import urlparse
from bs4 import BeautifulSoup
    
In [15]:
    
# Fetch the Education Counts landing page; the following cells parse this response.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast instead of silently parsing an HTTP error page.
eceall = requests.get('https://www.educationcounts.govt.nz', timeout=30)
eceall.raise_for_status()
    
In [16]:
    
#eceall.text
    
In [17]:
    
# Parse the landing page HTML. The parser is named explicitly (the same
# 'html.parser' that the single-page parse further down uses) so the resulting
# tree does not depend on which optional parsers happen to be installed.
edcounz = bs4.BeautifulSoup(eceall.text, 'html.parser')
    
In [18]:
    
# Collect every anchor tag that actually carries an href. Each consumer of
# `eduht` below reads edu.attrs['href'], which raises KeyError on an anchor
# without one, so those anchors are filtered out at the source.
# find_all is the current name of the legacy findAll alias.
eduht = edcounz.find_all('a', href=True)
    
In [47]:
    
# Start with an empty record; later cells fill in 'text', 'url', 'html' and 'path'.
urldict = {}
    
In [48]:
    
# Record the link text and target of a single anchor.
# `edu` previously existed only as a leftover from a `for edu in eduht` loop run
# in another cell, so this cell failed on a fresh kernel. Binding it explicitly
# to the last anchor reproduces the value such a loop leaves behind.
edu = eduht[-1]
urldict['text'] = edu.text
urldict['url'] = edu.attrs['href']
    
In [42]:
    
#urldict.update('test':'blah')
    
    
In [49]:
    
# Show the record built so far (a bare last expression is rendered as the cell output).
urldict
    
    Out[49]:
Create a RESTful API from the URLs: a JSON structure that has title, slug, url, details, and path.
Parse each URL, removing https://www.educationcounts.govt.nz/, and separate the results into folders.
The HTML of every link.
Create a dict for every URL on the page - currently only one dict is created.
What is the unique key for each URL?
6198
In [64]:
    
# Show the URL stored in the record; this is the address fetched in the next cell.
urldict['url']
    
    Out[64]:
In [69]:
    
# Fetch the page the recorded link points to.
# The timeout keeps a stalled connection from blocking the notebook, and
# raise_for_status() stops an HTTP error page from being stored as the link's HTML.
# NOTE(review): if the stored href is relative (no scheme), requests cannot
# fetch it as-is - confirm whether relative links need resolving against the site root.
reqsou = requests.get(urldict['url'], timeout=30)
reqsou.raise_for_status()
    
In [73]:
    
# Keep the decoded response body (the page's HTML as text) for clean-up below.
urldictx = reqsou.text
    
In [81]:
    
# Strip layout whitespace from the raw HTML in two passes:
# first every tab, then every newline. Splitting on the character and
# re-joining with nothing removes each occurrence.
urlx = ''.join(urldictx.split('\t'))
urlfx = ''.join(urlx.split('\n'))
    
In [83]:
    
# Re-parse the whitespace-stripped HTML with the built-in parser.
soup = BeautifulSoup(markup=urlfx, features='html.parser')
    
In [95]:
    
# Re-serialise the parsed tree as indented, human-readable HTML text.
htmlre = soup.prettify()
    
In [96]:
    
# Attach the prettified page HTML to the record.
urldict['html'] = htmlre
    
In [97]:
    
# Split the recorded URL into its components; only the path is used below.
o = urlparse(urldict['url'])
    
In [98]:
    
# Attach the path component of the URL to the record.
urldict['path'] = o.path
    
In [1]:
    
# Print the path component of every link, followed by the anchor tag itself.
for edu in eduht:
    # .get() avoids a KeyError on anchors that have no href attribute;
    # such anchors are reported with an empty path.
    minpath = urlparse(edu.get('href', ''))
    # Single-argument print(...) outputs the same text as the print statement
    # and matches the call style already used by the download cell.
    print(minpath.path)
    print(edu)
    
    
In [60]:
    
# Show the full record (a bare last expression is rendered as the cell output).
urldict
    
    Out[60]:
In [ ]:
    
    
In [20]:
    
# Build a single HTML page that lists every scraped link as a clickable paragraph.
doc = dominate.document(title='Education Counts')
with doc.head:
    # Static assets are referenced by relative name next to the generated page.
    link(rel='stylesheet', href='style.css')
    script(type='text/javascript', src='script.js')
with doc:
    with div():
        for edu in eduht:
            edu_href = edu.get('href')
            if edu_href is None:
                # An anchor without an href has nothing to link to; indexing
                # attrs['href'] on it would raise KeyError and abort the build.
                continue
            a(dominate.tags.p(edu.text), href=edu_href)
    
In [21]:
    
# Render the generated page and write it to disk as plain ASCII.
docre = doc.render()
# The 'ignore' error handler drops (does not replace) any non-ASCII character.
yourstring = docre.encode('ascii', 'ignore').decode('ascii')
indfil = ('/home/wcmckee/educount/index.html')
# The with-block closes the file even if write() raises, which the previous
# open()/close() pair did not guarantee.
with open(indfil, 'w') as mkind:
    mkind.write(yourstring)
    
In [22]:
    
# Change the process working directory so the relative file names opened by
# the download loop below are created inside this folder.
# NOTE(review): this is an absolute, machine-specific path and the change
# persists for the rest of the kernel session.
os.chdir('/home/wcmckee/educount/html/')
    
In [23]:
    
# Download every absolute (http/https) link into '<link text>.html' in the
# current working directory.
for edu in eduht:
    # .get() tolerates anchors without an href; the scheme check is anchored at
    # the start so a relative link that merely contains "http" is not fetched.
    page_url = edu.get('href', '')
    if not page_url.startswith(('http://', 'https://')):
        continue
    print(page_url)
    # The timeout keeps one stalled server from blocking the whole loop.
    response = requests.get(page_url, stream=True, timeout=30)
    try:
        # A '/' in the link text would be read as a directory separator by open().
        # The name is left as text (no str()) so non-ASCII link text does not
        # raise an encoding error on Python 2.
        out_name = edu.text.replace('/', '_') + '.html'
        with open(out_name, 'wb') as out_file:
            # A Response object has no read() method, so it cannot be handed to
            # shutil.copyfileobj; stream the body in chunks instead.
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
    finally:
        # Release the streamed connection even if writing the file fails.
        response.close()
    
    
    
In [ ]: