minedujob

Python script that fetches job listings from the Ministry of Education RSS feed and converts them into a JSON object, fixing format problems in the XML along the way.
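Under the hood it is a small fetch-parse-dump pipeline: requests pulls the RSS feed, xmltodict turns the XML into nested dicts, and json serialises the result. A minimal sketch of that pipeline (the feed URL is elided here; the full one appears in the cells below):

import requests
import xmltodict
import json

feed = requests.get('https://jobs.minedu.govt.nz/jobtools/job_rss?...')  # full URL below
jobs = xmltodict.parse(feed.text)['rss']['channel']['item']  # list of job dicts
print(json.dumps(jobs[0], indent=2))  # first listing as JSON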


In [14]:

TODO

done - Parse the description tag for the keys and values hidden within it.

done - Extract the email address from the description; if there is an email, look up the person's phone number and attach it.

Location - look on DigitalNZ for CC BY images. Search the location and aerial photos of the area that could be used in the job ad.

Title - scrape the URL.

done - Cycle over all job listings and create a JSON object with them all, instead of just a random choice.

Shorten the URL (a pyshorteners sketch follows the main loop below).

Fix arrow to convert to the time given rather than the time now (see the sketch after this list).

Merge in data from DigitalNZ (a sketch sits with the DigitalNZ cells further down).

What other APIs could I merge in?

Fix the li elements that don't get included in the description text (only p is captured) and make sure they are included (a sketch sits beside the commented p loop further down).
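
On the arrow item, a minimal sketch, assuming Python 3 and that pubDate uses the usual RFC 2822 RSS date format; the sample date string is hypothetical:

import email.utils
import arrow

pubdate = 'Mon, 06 Jul 2015 09:30:00 GMT'  # hypothetical pubDate from the feed
artim = arrow.get(email.utils.parsedate_to_datetime(pubdate))  # the advertised time, not arrow.now()
jobclose = artim.shift(weeks=+2)  # recent arrow versions use shift(); replace() no longer takes plural units
print(artim.date(), jobclose.date())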


In [15]:
import requests
#import untangle
import xmltodict
import json
#import random
import bs4
#import dominate
#from dominate.tags import *
#from pydnz import Dnz
#import arrow
#import pyshorteners
#import tweepy

In [16]:
#dnz = Dnz('keyhere')

In [17]:
jobreq = requests.get('https://jobs.minedu.govt.nz/jobtools/job_rss?o1=17584&k2=A52B3674BC046465&source=JobRSS&medium=JobRSS')

In [18]:
jobtxta = jobreq.text

In [19]:
#obj = untangle.parse(jobtxta)

In [20]:
#obj

In [21]:
dicjobz = xmltodict.parse(jobtxta)  # parse the RSS XML into nested dicts

In [22]:
ranldicj = len(dicjobz['rss']['channel']['item'])  # number of job listings in the feed

In [23]:
#ranldicj

In [24]:
#randicz = random.randint(0, ranldicj)

In [25]:
#randicz

In [26]:
wrapdict = dict()

In [27]:
for dic in range(ranldicj):
    dicrs = dicjobz['rss']['channel']['item'][dic]
    dicrts = dicrs['title']
    #artim = arrow.now(dicrs['pubDate'])
    #jobclose = artim.replace(weeks=+2)
    #jclodat = jobclose.date()
    msjobdic = dict()
    #msjobdic.update({'date-advertised' : str(artim.date()),
    #                'time-advertised' : str(artim.time()),
    #                'title' : dicrts,
    #                'date-closed' : str(jclodat)})
    # fetch the full listing page and pull out its anchors
    requlink = dicrs['link']
    reqlinkq = requests.get(requlink)
    bsoup = bs4.BeautifulSoup(reqlinkq.text, 'html.parser')
    bfina = bsoup.findAll('a')
    for bfin in bfina:
        # anchors whose text contains @ are contact email addresses
        if '@' in bfin.text:
            msjobdic.update({'email' : str(bfin.text)})
    for bfin in bfina:
        # keep the link target itself; the original checked for the literal
        # string 'href' in the anchor text, which never matches
        if bfin.has_attr('href'):
            msjobdic.update({'href' : str(bfin['href'])})
    for bfiny in bfina:
        # job description documents are linked as .docx files
        if '.docx' in bfiny.text:
            msjobdic.update({'doc' : bfiny.text})
    msjobdic.update({'link' : dicrs['link']})
    msjobdic.update({'jsonlen' : str(dic)})
    # the description field hides key/value pairs inside li elements
    bsdescr = bs4.BeautifulSoup(dicrs['description'], 'html.parser')
    txtspli = [line.text.split(": ") for line in bsdescr.findAll('li')[0:8]]
    docitems = [line.text.split(": ") for line in bsdescr.findAll('li')[9:]]
    findict = dict()
    findict.update({'lidocend' : docitems})
    for tes in txtspli:
        if len(tes) == 2:  # skip li items without a 'key: value' pair
            findict.update({tes[0] : tes[1]})
    msjobz = findict.copy()
    msjobz.update(msjobdic)
    wrapdict.update({dic : msjobz})

# dump all the listings once, after the loop, rather than rewriting the
# file on every iteration
jsmsdob = json.dumps(wrapdict)
opind = open('/home/wcmckee/github/wcmckee.com/output/minedujobs/index.json', 'w')
opind.write(jsmsdob)
#api.update_status(dicrts)

In [28]:
opind.close()
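
For the "shorten the URL" TODO item, a minimal sketch using the pyshorteners import commented out above; it assumes a recent pyshorteners release and the TinyURL backend, which needs no API key. The 'link-short' key is a hypothetical addition:

import pyshorteners

shortener = pyshorteners.Shortener()
for msjobz in wrapdict.values():
    if 'link' in msjobz:
        msjobz['link-short'] = shortener.tinyurl.short(msjobz['link'])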

In [15]:
#opeind.close()

In [16]:
#dicrs = dicjobz['rss']['channel']['item'][0]

In [17]:
#dicrts = dicrs['title']
#dicrtq = dicrs

In [18]:
#artim = arrow.now(dicrtq['pubDate'])

In [19]:
#jobclose = artim.replace(weeks=+2)

In [20]:
#jclodat = jobclose.date()

In [21]:
#msjobdic = dict()

In [22]:
#msjobdic.update({'date-advertised' : str(artim.date()), 
#                'time-advertised' : str(artim.time()),
#                'title' : dicrts,
#                'date-closed' : str(jclodat)})

In [23]:
#msjobdic

In [24]:
#requlink = dicrtq['link']

In [25]:
#reqlinkq = requests.get(requlink)

In [26]:
#bsoup = bs4.BeautifulSoup(reqlinkq.text)

In [27]:
#bfina = bsoup.findAll('a')

In [28]:
#for bfin in bfina:
#    if ('@') in bfin.text:
#        #print bfin.text
#        msjobdic.update({('email') : str(bfin.text)})

In [29]:
#for bfiny in bfina:
#    if '.docx' in bfiny.text:
#        print bfiny.text

In [30]:
#Search for this file and render text.
#if jpg/gif render.

In [31]:
#for bfin in bfina:
#    if ('href') in bfin.text:
        #print bfin.text
#        msjobdic.update({('href') : str(bfin.text)})

In [32]:
#msjob

In [33]:
#msjobdic.update({'randnum' : randicz})

In [34]:
#for bfiny in bfina:
#    if '.docx' in bfiny.text:
#        msjobdic.update({'doc' : bfiny.text})

In [35]:
#msjobdic

In [36]:
#msjobdic.update({'link' : dicrtq['link']})

In [37]:
#msjobdic

In [39]:
#bsdescr = bs4.BeautifulSoup(dicrtq['description'])

In [40]:
#for iza in bsdescr.findAll('li')[0:8]:
#    print iza

In [41]:
#lili = list()

In [42]:
#txtspli = [line.text.split(": ") for line in bsdescr.findAll('li')[0:8]]

In [43]:
#findict = dict()

In [44]:
#totlen = len(txtspli)

In [45]:
#for tes in range(totlen):
#    findict.update({txtspli[tes][0] : txtspli[tes][1]})

In [46]:
#findict.update({txtspli[0][0] : txtspli[0][1]})

In [47]:
#msjobz = findict.copy()
#msjobz.update(msjobdic)

In [48]:
#for bsdz in bsdescr.findAll('li'):
#    (k,v) = bsdz.text.split(": ")
#    print bsdz.text
#    print(k,v)
#    lili.append(bsdz.text)

In [49]:
#txtlis = list()

In [50]:
#bsp = bsdescr.findAll('p')

In [51]:
#for bs in bsp:
    #print(bs.text)
    #txtlis.append(bs.text)
    #numchc = [int(s) for s in bs.text.split() if s.isdigit()]
    #This skips items in li elements, since those aren't p.
    #There are other details in the description that I
    #would like to extract - like the closing date.
    #Why not, for the closing date, just take the date it was
    #advertised and add 2 weeks?
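
A minimal sketch for the skipped-li problem noted above: findAll accepts a list of tag names, so p and li text can be collected together in document order (bsdescr as built in the main loop):

desctext = [el.text for el in bsdescr.findAll(['p', 'li'])]  # p and li, in order
fulldescr = ' '.join(desctext)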

In [52]:
#dicrts

In [53]:
#debsnz = dnz.search(dicrts)  # search on the job title string, not the whole dict

In [54]:
#randrecord = len(debsnz.records)

In [55]:
#ranitdz = random.randint(0, randrecord)

In [56]:
#ranitdz

In [57]:
#randicz

In [58]:
#debsnz.records

In [59]:
#debrecintz = debsnz.records[ranitdz]

In [60]:
#kederz = debrecintz.keys()

In [61]:
#print debrecintz['category']
#print debrecintz['usage']

In [62]:
#for ked in kederz:
#    print ked
#    print debrecintz[ked]
    #print ked
    #print ked
    #print debrecintz['category']

In [63]:
#print debrecintz['id']

In [64]:
#getiddnz = ('http://api.digitalnz.org/v3/records/' + str(debrecintz['id']) + '.json?api_key=keyhere')

In [65]:
#getiddnz

In [66]:
#reqidnz = requests.get(getiddnz)

In [67]:
#reqidnz.json()  # json.dumps() can't serialise a Response object; parse it instead

In [68]:
#mylirq = list()

In [69]:
#for reqi in reqidnz:
    #print reqi
    #print reqi.upper()
    #reqi
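
For the "merge in data from DigitalNZ" TODO item, a minimal sketch against the v3 records endpoint used above; 'keyhere' stands in for a real API key, the record id is hypothetical, and 'large_thumbnail_url' is an assumption about the record schema:

import requests

recid = 12345  # hypothetical record id, e.g. debrecintz['id']
dnzurl = 'http://api.digitalnz.org/v3/records/' + str(recid) + '.json?api_key=keyhere'
dnzrec = requests.get(dnzurl).json()['record']
for msjobz in wrapdict.values():
    # attach an image to every job; check the fields the API actually returns
    msjobz['image'] = dnzrec.get('large_thumbnail_url')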

In [70]:
#my_dict.pop("key", None)

In [71]:
#dicrq = len(dicjobz['rss']['channel']['item'])

In [72]:
#dicrq

In [73]:
#Return a random job.

In [74]:
#ranjoz = random.randint(0, dicrq)

#dicrsch = dicjobz['rss']['channel']['item']

In [75]:
#print dicrsch[ranjoz]['link']

In [76]:
#print dicrsch[ranjoz]['title']

In [77]:
#jobtype
#location
#date advertised
#jobreference
#jobtitle
#These should be top-level keys.
#Currently they are inside the description key.
#Create a new json file that fixes this.

In [78]:
#for dezsr in dicrsch[ranjoz]['description']:
#    if 'JobType' in dezsr:
#        print(dezsr)

In [79]:
#docstart.title = ('ministry-of-education-jobs')
#doc = dominate.document(title='ministry-of-education-jobs')

#with doc.head:
#    link(rel='stylesheet', href='style.css')
#    script(type='text/javascript', src='script.js')

#with doc:
    #with div(id='header').add(ol()):
        #for i in ['home', 'about', 'contact']:
            #li(a(i.title(), href='/%s.html' % i))

#    with div(cls='row'):
#        h1('education-counts-jobs')
#        h2(dicrsch[ranjoz]['title'])
#        p(dicrs)
        #p(dicrsch[ranjoz]['description'])
#        p(a(dicrs, href= dicrsch[ranjoz]['link']))
        
        #for ked in kederz:
        #print ked
        #    p((kederz[ked]))
        #print ked
        #print ked

In [80]:
#print doc

#docre = doc.render()
#s = docre.decode('ascii', 'ignore')
#yourstring = docre.encode('ascii', 'ignore').decode('ascii')
#indfil = ('/home/wcmckee/minedujob/index.html')
#mkind = open(indfil, 'w')
#mkind.write(yourstring)
#mkind.close()

In [81]:
#jsmsdob = json.dumps(msjobdic)

In [82]:
#opeind = open('/home/wcmckee/minedujob/minedujobs.json', 'w')

In [83]:
#opeind.write(jsmsdob)

In [84]:
#opeind.close()
