A Python script to fetch job listings from the Ministry of Education RSS feed and convert them into a JSON object, fixing format problems around the XML along the way.
TODO
done - Parse the description tag for the keys and values hidden within it.
done - Extract the email from the description; if there is an email, look up the person's phone number and attach it.
Location - look on DigitalNZ for CC-BY images. Search the location and aerial photos of the area that could be used in the job ad.
Title - scrape the URL.
done - Cycle over all job listings and create a JSON object with them all instead of just a random choice.
Shorten the URL (a pyshorteners sketch follows the imports below).
Fix arrow to convert to the time given rather than the time now (see the sketch after this list).
Merge in data from DigitalNZ.
What other APIs could I merge in?
Fix the li that doesn't get included in the description text (p) and make sure it is included.
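A minimal sketch of the arrow fix above, assuming pubDate is the usual RFC 2822 string RSS feeds carry (e.g. 'Tue, 03 Mar 2015 09:00:00 GMT') and reusing the dicjobz dict that gets parsed further down:
import arrow
from email.utils import parsedate_to_datetime

pubdate = dicjobz['rss']['channel']['item'][0]['pubDate']
# Anchor on the feed's own timestamp instead of arrow.now().
artim = arrow.Arrow.fromdatetime(parsedate_to_datetime(pubdate))
# Guess the closing date as two weeks after it was advertised.
jobclose = artim.shift(weeks=+2)  # older arrow releases spell this replace(weeks=+2)
print(artim.date(), jobclose.date())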
In [15]:
import requests
#import untangle
import xmltodict
import json
#import random
import bs4
#import dominate
#from dominate.tags import *
#from pydnz import Dnz
#import arrow
#import bs4
#import pyshorteners
#import tweepy
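For the "Shorten the URL" item on the TODO list, a hedged sketch using the pyshorteners import commented out above; the attribute-style call is the newer pyshorteners API, while older releases spell it Shortener('TinyurlShortener').short(url):
import pyshorteners

shortener = pyshorteners.Shortener()
# TinyURL needs no API key, which keeps the sketch self-contained.
shorturl = shortener.tinyurl.short('https://jobs.minedu.govt.nz/')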
In [16]:
#dnz = Dnz('keyhere')
In [17]:
jobreq = requests.get('https://jobs.minedu.govt.nz/jobtools/job_rss?o1=17584&k2=A52B3674BC046465&source=JobRSS&medium=JobRSS')
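The fetch above is unchecked; a small guard so a bad response fails loudly instead of handing broken XML to the parser:
jobreq.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx response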
In [18]:
jobtxta = jobreq.text
In [19]:
#obj = untangle.parse(jobtxta)
In [20]:
#obj
In [21]:
dicjobz = xmltodict.parse(jobtxta)
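xmltodict turns the RSS into nested dicts keyed by tag name, so the job entries live under rss → channel → item. A quick look at that structure (the fields named in the comment are the standard RSS ones this feed appears to use):
items = dicjobz['rss']['channel']['item']
print(len(items))              # number of job listings in the feed
print(list(items[0].keys()))   # e.g. title, link, description, pubDate
print(items[0]['title'])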
In [22]:
ranldicj = len(dicjobz['rss']['channel']['item'])
In [23]:
#ranldicj
In [24]:
#randicz = random.randint(0, ranldicj)
In [25]:
#randicz
In [26]:
wrapdict = dict()
In [27]:
for dic in range(ranldicj):
    dicrs = dicjobz['rss']['channel']['item'][dic]
    dicrts = dicrs['title']
    dicrtq = dicrs
    #artim = arrow.now(dicrtq['pubDate'])
    #jobclose = artim.replace(weeks=+2)
    #jclodat = jobclose.date()
    msjobdic = dict()
    #msjobdic.update({'date-advertised' : str(artim.date()),
    #                 'time-advertised' : str(artim.time()),
    #                 'title' : dicrts,
    #                 'date-closed' : str(jclodat)})
    # Fetch the full job page so contact details can be scraped.
    requlink = dicrtq['link']
    reqlinkq = requests.get(requlink)
    bsoup = bs4.BeautifulSoup(reqlinkq.text, 'html.parser')
    bfina = bsoup.findAll('a')
    # Contact email: any anchor whose text contains an '@'.
    for bfin in bfina:
        if '@' in bfin.text:
            #print(bfin.text)
            msjobdic.update({'email' : str(bfin.text)})
    for bfin in bfina:
        if 'href' in bfin.text:
            #print(bfin.text)
            msjobdic.update({'href' : str(bfin.text)})
    # Application form: any anchor whose text mentions a .docx file.
    for bfiny in bfina:
        if '.docx' in bfiny.text:
            msjobdic.update({'doc' : bfiny.text})
    msjobdic.update({'link' : dicrtq['link']})
    msjobdic.update({'jsonlen' : str(dic)})
    # The description field is itself HTML; the first eight <li>
    # elements hold "Key: Value" pairs, the rest hold document links.
    bsdescr = bs4.BeautifulSoup(dicrtq['description'], 'html.parser')
    txtspli = [line.text.split(": ") for line in bsdescr.findAll('li')[0:8]]
    docitems = [line.text.split(": ") for line in bsdescr.findAll('li')[9:]]
    findict = dict()
    findict.update({'lidocend' : docitems})
    totlen = len(txtspli)
    for tes in range(totlen):
        # Guard against list items that carry no ": " separator.
        if len(txtspli[tes]) > 1:
            findict.update({txtspli[tes][0] : txtspli[tes][1]})
    msjobz = findict.copy()
    msjobz.update(msjobdic)
    wrapdict.update({dic : msjobz})
# Dump every job into one JSON file once the loop has finished.
jsmsdob = json.dumps(wrapdict)
#opeind = open('/home/wcmckee/minedujob/' + str(dic) + '.json', 'w')
opind = open('/home/wcmckee/github/wcmckee.com/output/minedujobs/index.json', 'w')
opind.write(jsmsdob)
#api.update_status(dicrts)
#opeind.close()
In [28]:
opind.close()
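A quick read-back of the file just written, to confirm the JSON round-trips; only 'link' and 'jsonlen' are guaranteed per job by the loop above, since the rest of the keys come from the feed's description:
with open('/home/wcmckee/github/wcmckee.com/output/minedujobs/index.json') as injob:
    jobsback = json.load(injob)
for jobkey, jobval in jobsback.items():
    print(jobkey, jobval.get('link'))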
In [15]:
#opeind.close()
In [16]:
#dicrs = dicjobz['rss']['channel']['item'][0]
In [17]:
#dicrts = dicrs['title']
#dicrtq = dicrs
In [18]:
#artim = arrow.now(dicrtq['pubDate'])
In [19]:
#jobclose = artim.replace(weeks=+2)
In [20]:
#jclodat = jobclose.date()
In [21]:
#msjobdic = dict()
In [22]:
#msjobdic.update({'date-advertised' : str(artim.date()),
# 'time-advertised' : str(artim.time()),
# 'title' : dicrts,
# 'date-closed' : str(jclodat)})
In [23]:
#msjobdic
In [24]:
#requlink = dicrtq['link']
In [25]:
#reqlinkq = requests.get(requlink)
In [26]:
#bsoup = bs4.BeautifulSoup(reqlinkq.text)
In [27]:
#bfina = bsoup.findAll('a')
In [28]:
#msjobdic.update({'date advertised' : str(artim.date()),
#                 'time advertised' : str(artim.time()),
#                 'title' : dicrts})
#for bfin in bfina:
#    if '@' in bfin.text:
#        #print(bfin.text)
#        msjobdic.update({'email' : str(bfin.text)})
In [29]:
#for bfiny in bfina:
# if '.docx' in bfiny.text:
# print bfiny.text
In [30]:
#Search for this file and render text.
#if jpg/gif render.
In [31]:
#for bfin in bfina:
# if ('href') in bfin.text:
#print bfin.text
# msjobdic.update({('href') : str(bfin.text)})
In [32]:
#msjob
In [33]:
#msjobdic.update({'randnum' : randicz})
In [34]:
#for bfiny in bfina:
# if '.docx' in bfiny.text:
# msjobdic.update({'doc' : bfiny.text})
In [35]:
#msjobdic
In [36]:
#msjobdic.update({'link' : dicrtq['link']})
In [37]:
#msjobdic
In [38]:
#msjobdic.update({'doc' : b
In [39]:
#bsdescr = bs4.BeautifulSoup(dicrtq['description'])
In [40]:
#for iza in bsdescr.findAll('li')[0:8]:
# print iza
In [41]:
#lili = list()
In [42]:
#txtspli = [line.text.split(": ") for line in bsdescr.findAll('li')[0:8]]
In [43]:
#findict = dict()
In [44]:
#totlen = len(txtspli)
In [45]:
#for tes in range(totlen):
# findict.update({txtspli[tes][0] : txtspli[tes][1]})
In [46]:
#findict.update({txtspli[0][0] : txtspli[0][1]})
In [47]:
#msjobz = findict.copy()
#msjobz.update(msjobdic)
In [48]:
#for bsdz in bsdescr.findAll('li'):
# (k,v) = bsdz.text.split(": ")
# print bsdz.text
# print(k,v)
# lili.append(bsdz.text)
In [49]:
#txtlis = list()
In [50]:
#bsp = bsdescr.findAll('p')
In [51]:
#for bs in bsp:
#    print(bs.text)
#    txtlis.append(bs.text)
#    numchc = [int(s) for s in bs.text.split() if s.isdigit()]
#It's skipping items in the li since they're not a p :(
#There are other details in the description that I
#would like to extract - like the closing date.
#Why not, for the closing date, just take the date it was
#advertised and add 2 weeks? (See the li/p sketch below.)
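One way to fix the skipped li items noted above: walk the <p> and <li> tags together in document order rather than only the <p> tags. A sketch against the bsdescr soup built earlier:
# find_all with a list of tag names keeps document order,
# so li text is no longer dropped between the p blocks.
descparts = [tag.text for tag in bsdescr.find_all(['p', 'li'])]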
In [52]:
#dicrts
In [53]:
#debsnz = dnz.search(dicrs)
In [54]:
#randrecord = len(debsnz.records)
In [55]:
#ranitdz = random.randint(0, randrecord)
In [56]:
#ranitdz
In [57]:
#randicz
In [58]:
#debsnz.records
In [59]:
#debrecintz = debsnz.records[ranitdz]
In [60]:
#kederz = debrecintz.keys()
In [61]:
#print debrecintz['category']
#print debrecintz['usage']
In [62]:
#for ked in kederz:
# print ked
# print debrecintz[ked]
#print ked
#print ked
#print debrecintz['category']
In [63]:
#print debrecintz['id']
In [64]:
#getiddnz = ('http://api.digitalnz.org/v3/records/' + str(debrecintz['id']) + '.json?api_key=Ph2LDuyiJmJcQm1S5myy')
In [65]:
#getiddnz
In [66]:
#reqidnz = requests.get(getiddnz)
In [67]:
#json.dumps(reqidnz)
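For the DigitalNZ merge on the TODO list, a hedged sketch against the v3 records endpoint the commented URL above points at; the query parameters and the search/results response shape are assumptions from the public API docs, and the key is a placeholder. (Note that json.dumps(reqidnz) above would fail, since a Response object is not JSON-serializable; reqidnz.json() parses the body instead.)
dnzreq = requests.get('http://api.digitalnz.org/v3/records.json',
                      params={'api_key': 'keyhere',               # placeholder key
                              'text': findict.get('Location', ''),
                              'and[category][]': 'Images'})
dnzrecs = dnzreq.json()['search']['results']  # assumed response shape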
In [68]:
#mylirq = list()
In [69]:
#for reqi in reqidnz:
#print reqi
#print reqi.upper()
#reqi
In [70]:
#my_dict.pop("key", None)
In [71]:
#dicrq = len(dicjobz['rss']['channel']['item'])
In [72]:
#dicrq
In [73]:
#Return a random job.
In [74]:
#ranjoz = random.randint(0, dicrq)
#dicrsch = dicjobz['rss']['channel']['item']
In [75]:
#print dicrsch[ranjoz]['link']
In [76]:
#print dicrsch[ranjoz]['title']
In [77]:
#jobtype
#location
#date advertised
#jobreference
#jobtitle
#should be keys
#Currently they are inside description key
#Create new json file that fixes this.
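For reference, roughly the per-job shape the main loop now produces once those keys are lifted out of the description; every value below is hypothetical:
samplejob = {'JobType' : 'Permanent',          # hypothetical values
             'Location' : 'Wellington',
             'email' : 'someone@example.govt.nz',
             'link' : 'https://jobs.minedu.govt.nz/',
             'jsonlen' : '0'}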
In [78]:
#for dezsr in dicrsch[ranjoz]['description']:
#    if 'JobType' in dezsr:
#        print(dezsr)
In [79]:
#docstart.title = ('ministry-of-education-jobs')
#doc = dominate.document(title='ministry-of-education-jobs')
#with doc.head:
# link(rel='stylesheet', href='style.css')
# script(type='text/javascript', src='script.js')
#with doc:
#with div(id='header').add(ol()):
#for i in ['home', 'about', 'contact']:
#li(a(i.title(), href='/%s.html' % i))
# with div(cls='row'):
# h1('education-counts-jobs')
# h2(dicrsch[ranjoz]['title'])
# p(dicrs)
#p(dicrsch[ranjoz]['description'])
# p(a(dicrs, href= dicrsch[ranjoz]['link']))
#for ked in kederz:
#print ked
# p((kederz[ked]))
#print ked
#print ked
In [80]:
#print doc
#docre = doc.render()
#s = docre.decode('ascii', 'ignore')
#yourstring = docre.encode('ascii', 'ignore').decode('ascii')
#indfil = ('/home/wcmckee/minedujob/index.html')
#mkind = open(indfil, 'w')
#mkind.write(yourstring)
#mkind.close()
In [81]:
#jsmsdob = json.dumps(msjobdic)
In [82]:
#opeind = open('/home/wcmckee/minedujob/minedujobs.json', 'w')
In [83]:
#opeind.write(jsmsdob)
In [84]:
#opeind.close()