ece-display: Education Counts
Python script to remix educationcounts govt site.
Display more info in one page.
Scrape links for excel files - download excel files to folder.
Replace the Excel files with a friendlier representation: a JSON object and a pandas DataFrame for display.
In [50]:
import json
import os
import shutil
from urlparse import urljoin, urlparse

import bs4
import dominate
import requests
from bs4 import BeautifulSoup
from dominate.tags import *
In [15]:
# Fetch the Education Counts landing page.
# Without a timeout requests will wait indefinitely on an unresponsive server,
# and without the status check an HTTP error page would be parsed as if it
# were the real content.
eceall = requests.get('https://www.educationcounts.govt.nz', timeout=30)
eceall.raise_for_status()
In [16]:
#eceall.text
In [17]:
# Parse the landing page. The parser is named explicitly: when it is omitted
# Beautiful Soup picks whichever parser happens to be installed (and warns),
# so the resulting tree can differ between machines. 'html.parser' matches the
# parser used for the single-page parse further down.
edcounz = bs4.BeautifulSoup(eceall.text, 'html.parser')
In [18]:
# Collect every anchor that actually carries an href. Later cells index
# edu.attrs['href'] directly, which raises KeyError on anchors without one
# (e.g. named anchors), so those are filtered out here.
eduht = edcounz.find_all('a', href=True)
In [47]:
# Empty record that the following cells fill in for a single link.
urldict = {}
In [48]:
# Record the text and target of one link.
# `edu` used to be whatever a loop cell executed earlier had left behind, so
# this cell failed with NameError on a fresh kernel. The last anchor is chosen
# explicitly here because that is the value a completed `for edu in eduht`
# loop leaves in `edu` -- TODO confirm this is the link that was intended.
edu = eduht[-1]
urldict.update({'text': edu.text, 'url': edu.attrs['href']})
In [42]:
#urldict.update('test':'blah')
In [49]:
# Display the record built so far.
urldict
Out[49]:
Create a RESTful API from the URLs: a JSON structure that has title, slug, url, details, and path.
Parse each URL, removing https://www.educationcounts.govt.nz/, and separate the results into folders.
The HTML of every link.
Create a dict for every URL on the page; currently only one dict is created.
What is the unique key for each url?
6198
In [64]:
# Display the link target stored in the record.
urldict['url']
Out[64]:
In [69]:
# Fetch the page the recorded link points at.
# The href is resolved against the URL of the page it was scraped from, so a
# site-relative link (e.g. '/publications') becomes a fetchable absolute URL;
# an href that is already absolute passes through urljoin unchanged.
reqsou = requests.get(urljoin(eceall.url, urldict['url']), timeout=30)
reqsou.raise_for_status()
In [73]:
# Body of the fetched page as decoded text.
urldictx = reqsou.text
In [81]:
# Strip every tab, then every newline, from the page text before parsing.
urlx = ''.join(urldictx.split('\t'))
urlfx = ''.join(urlx.split('\n'))
In [83]:
# Parse the whitespace-stripped page with the standard-library HTML parser.
soup = BeautifulSoup(urlfx, features='html.parser')
In [95]:
# Re-serialise the parsed tree as indented HTML text.
htmlre = soup.prettify()
In [96]:
# Attach the prettified page HTML to the record.
urldict['html'] = htmlre
In [97]:
# Split the record's URL into its components; the path component is stored
# on the record in the following cell.
o = urlparse(urldict['url'])
In [98]:
# Attach the path component of the URL to the record.
urldict['path'] = o.path
In [1]:
# Print the path component of every scraped link, followed by the anchor tag.
# print is written in call form (as the download cell already does) so the
# cell parses under both Python 2 and Python 3.
for edu in eduht:
    href = edu.attrs.get('href')
    if href is None:
        # An anchor without an href has no URL to take a path from.
        continue
    minpath = urlparse(href)
    print(minpath.path)
    print(edu)
In [60]:
# Display the record again.
urldict
Out[60]:
In [ ]:
In [20]:
# Build a single HTML page that lists every scraped link.
doc = dominate.document(title='Education Counts')

# Stylesheet and script references go in <head>.
with doc.head:
    link(rel='stylesheet', href='style.css')
    script(type='text/javascript', src='script.js')

# One <a><p>link text</p></a> per scraped anchor, all inside a single <div>.
with doc:
    with div():
        for edu in eduht:
            href = edu.attrs.get('href')
            if href is None:
                # Nothing to link to; indexing attrs['href'] would raise KeyError.
                continue
            a(dominate.tags.p(edu.text), href=href)
In [21]:
# Render the generated document and save it as the index page.
docre = doc.render()
# Keep the file pure ASCII, but write non-ASCII characters as numeric
# character references (e.g. '&#257;') instead of silently dropping them, so
# words such as 'Maori' written with a macron still display correctly.
yourstring = docre.encode('ascii', 'xmlcharrefreplace').decode('ascii')
indfil = '/home/wcmckee/educount/index.html'
# The with-block closes the file even if the write fails.
with open(indfil, 'w') as mkind:
    mkind.write(yourstring)
In [22]:
# Switch the working directory to the download folder. The download cell that
# follows opens its output files by bare filename, so they are written here.
# NOTE(review): this changes process-wide state and hardcodes a local absolute
# path; the directory must already exist.
os.chdir('/home/wcmckee/educount/html/')
In [23]:
# Download every absolute http(s) link into '<link text>.html' in the current
# working directory.
for edu in eduht:
    href = edu.attrs.get('href', '')
    # Test the scheme at the start of the href: a substring test would also
    # accept relative links that merely contain 'http' somewhere inside them.
    if not href.startswith(('http://', 'https://')):
        continue
    print(href)
    # Link text may contain path separators or be empty, neither of which
    # makes a usable filename. No str() call: it fails on non-ASCII text
    # under Python 2.
    fname = edu.text.strip().replace(os.sep, '_') or 'untitled'
    response = requests.get(href, stream=True, timeout=30)
    try:
        with open(fname + '.html', 'wb') as out_file:
            # A Response object has no read() method, so it cannot be handed
            # to shutil.copyfileobj; stream the body in chunks instead.
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
    finally:
        # Release the connection; `del response` did not close it.
        response.close()
In [ ]: