In [23]:
import json
import codecs
import requests
from bs4 import BeautifulSoup
In [30]:
URL = 'http://jobregister.aas.org'
CACHE = '/tmp/rumormill.json'

def get_page(url):
    """Return the HTML text for `url`, memoized in a JSON file cache.

    The cache is a single JSON object mapping url -> page text stored at
    CACHE.  On a hit the cached text is returned without touching the
    network; on a miss the page is fetched with requests and the cache
    file is rewritten before returning.
    """
    # Load the existing cache; fall back to an empty one on first run
    # (file missing) or if the file holds invalid JSON.
    # NOTE: the original used `with json.load(...) as j`, which raised
    # TypeError (dicts are not context managers), so the cache was
    # effectively never read.
    try:
        with open(CACHE, 'r') as fh:
            cache = json.load(fh)
    except (IOError, OSError, ValueError):
        cache = {}
    if url in cache:
        return cache[url]
    page = requests.get(url).text
    cache[url] = page
    with open(CACHE, 'w') as fh:
        json.dump(cache, fh)
    return page
In [31]:
def build_main(url):
    """Scrape the job-register index page into {category: [job dicts]}.

    Each job dict has keys 'ident', 'link', 'title', 'org', 'date'.
    Category header rows (class JR_Table_Row_Category) start a new list;
    column-header rows (class JR_Table_Row_Header) are skipped.
    """
    soup = BeautifulSoup(get_page(url))
    pane = soup.find('div', {'class': 'pane-2'})
    table = pane.find('table')
    out = {}
    header = None
    for tr in table.findAll('tr'):
        # tr.get('class') is None for rows without a class attribute;
        # the original indexed [0] unconditionally and would TypeError.
        classes = tr.get('class') or []
        if 'JR_Table_Row_Category' in classes:
            header = tr.text
            out[header] = []
            continue
        # Skip column-header rows, and any stray row seen before the
        # first category (header would otherwise be unbound).
        if 'JR_Table_Row_Header' in classes or header is None:
            continue
        cells = [td.text for td in tr.findAll('td')]
        link = tr.find('a')
        row = {'ident': link.text.strip(),
               'link': '{}/{}'.format(URL, link.get('href')),
               'title': cells[1],
               'org': cells[2],
               'date': cells[3]}
        out[header].append(row)
    return out

data = build_main(URL)
In [85]:
def _clean(x):
return ' '.join([a.strip() for a in x.splitlines()]).strip()
def build_indiv(url):
    """Scrape a single job-ad page into a {field_name: text} dict.

    Looks up each field's <div class="field-field-NAME"> on the page and
    returns its whitespace-collapsed text; fields absent from the page
    map to None instead of raising AttributeError.
    """
    # NOTE: the original body started with a debugging loop that printed
    # the field-field-* class names and then returned unconditionally,
    # so the dict below was unreachable and the function always
    # returned None.  That leftover has been removed.
    soup = BeautifulSoup(get_page(url))
    items = ['ad-post-date', 'ad-archive-date', 'application-deadline',
             'job-category', 'institution-name', 'institution-classification',
             'attention-to', 'job-related-url', 'job-announcement',
             'included-benefits']
    out = {}
    for item in items:
        div = soup.find('div', {'class': 'field-field-' + item})
        out[item] = _clean(div.text) if div is not None else None
    return out

build_indiv('http://jobregister.aas.org/job_view?JobID=48719')
In [ ]:
In [ ]: