notebook.community

Edit and run



In [60]:

    
import bs4
import requests
import dominate
from dominate.tags import *
import os
import shutil
import json
from urlparse import urlparse
from bs4 import BeautifulSoup



In [61]:

    
#eceall = requests.get('')



In [62]:

    
opind = open('/media/removable/WILLIAMS30G/www.educationcounts.govt.nz/index.html', 'r')



In [63]:

    
optfz = opind.read()



In [64]:

    
edcountz = bs4.BeautifulSoup(optfz)



In [65]:

    
afind = edcountz.findAll('a')



In [66]:

    
gidict = dict()



In [67]:

    
for afi in afind:
    gied = afi.text
    gifz = gied.replace(' ', '-')
    giflow = gifz.lower()
    gidict.update({giflow[:12] : afi.attrs['href']})
    
    gihrd = urlparse(afi.attrs['href'])



In [68]:

    
doc = dominate.document(title='educount')

with doc.head:
    link(rel='stylesheet', href='style.css')
    script(type='text/javascript', src='script.js')

with doc:
    #with div(id='header').add(ol()):
        #for i in ['home', 'about', 'contact']:
            #li(a(i.title(), href='/%s.html' % i))

    with div(cls='row'):
        h1('education-counts-url')
        for afi in afind:
            if 'https://' in afi.attrs['href']:
                gied = afi.text
                gifz = gied.replace(' ', '-')
                giflow = gifz.lower()
                a(dominate.tags.li(giflow[:12]), href = (afi.attrs['href']))
                #print afi.attrs['href']
            #p(edu.txt)
            #dominate.tags.p(edu.attrs['href'])
            #a(dominate.tags.p(edu.text), href=dominate.tags.a(edu.attrs['href']))
            
            #print edu.text

#print doc



In [69]:

    
docre = doc.render()
#s = docre.decode('ascii', 'ignore')
yourstring = docre.encode('ascii', 'ignore').decode('ascii')
indfil = ('/media/removable/WILLIAMS30G/education-counts-url/index.json')
mkind = open(indfil, 'w')
mkind.write(yourstring)
mkind.close()



In [70]:

    
dicon = json.dumps(gidict)



In [71]:

    
opeducon = open('/media/removable/WILLIAMS30G/education-counts-url/index.json', 'w')

opeducon.write(dicon)

opeducon.close()



In [72]:

    
rdeducon = open('/media/removable/WILLIAMS30G/education-counts-url/index.json', 'r')

rdqa = rdeducon.read()



In [73]:

    
loadic = json.loads(rdqa)