In [2]:
from bs4 import BeautifulSoup
import requests
from spacy.en import English
from tqdm import tqdm_notebook

def StringToFunction(func_name):
    possibles = globals().copy()
    possibles.update(locals())
    func = possibles.get(func_name)
    if not func:
        raise NotImplementedError("Function %s not implemented" % func_name)
    return func

class JobScraper(object):
    def __init__(self, spider, parmdata):
        self.spider = StringToFunction(spider)
        self.parmdata = parmdata
        
    def Spider(self,*argv):
        jobs = []
        nlp = English()
        self.data = list()
        for joburl in self.spider(self,*argv):
            self.data.append(dict())
            r = requests.get(joburl, cookies=self.cookies)
            soup = BeautifulSoup(r.text,"lxml")
            content = soup.body.find('div', attrs={'id':'ctl00_MainContent_PrimaryContent'})
            jobs.append(dict())
            try:
                for span in content.span.find_all('span',recursive=True):
                      if span.has_attr('aria-labelledby'):
                            #print span['id'] + " : " + span.text
                            self.data[-1][span['id']] = span.getText(separator=u' ')
                            #doc = nlp(span.text)
                            #for np in doc.noun_chunks:
                            #    np.text
            except:
                print soup.body

In [3]:
def NullSpider(url):
    yield url
    
def DullSpider(url):
    yield url + 'dull'
    
def BrassRingSpider(self):
    # we need to get the cookies from this "base url" to make subsequent queries
    self.parmdata['baseurl'] = "%s/searchopenings.aspx?partnerid=%s&siteid=%s"%(self.parmdata['site'],self.parmdata['partnerid'],self.parmdata['siteid'])
    baser = requests.get(self.parmdata['baseurl'])
    self.cookies = baser.cookies
    
    # identify the url for performing search queries
    searchpage = dict()
    soup = BeautifulSoup(baser.text,"lxml")
    search = soup.find_all('form',attrs={'id' : 'aspnetForm'})
    
    # loop over search result pages; each page contains 50 listings by default
    startrecord = 1
    while True:
        searchpage['url'] = "%s/%s"%(self.parmdata['site'],search[0]['action'])
        headers = {'recordstart':startrecord}
        searchpage['data'] = requests.post(searchpage['url'],cookies=baser.cookies,data=headers)
        searchpage['soup'] = BeautifulSoup(unicode(searchpage['data'].text),"lxml")
        for maincontent in searchpage['soup'].find_all('input',attrs={'id':'ctl00_MainContent_GridFormatter_json_tabledata'}):
            if maincontent.has_attr('id'):
                subsoup = BeautifulSoup(maincontent['value'],"lxml")
                for job in subsoup.find_all('input',attrs={'name':'chkJobClientIds'}):
                    jobid = job['id']
                    joburl = "%s/jobdetails.aspx?jobId=%s&JobSiteId=%s"%(self.parmdata['site'],jobid,self.parmdata['siteid'])
                    yield joburl
                    
        numjobs = int(searchpage['soup'].find('input',{'name':'totalrecords'})['value'])
        if startrecord == 1:
            bar = tqdm_notebook(total=numjobs)
        bar.update(50)
        if startrecord + 50 >= numjobs:
            break
        else:
            startrecord = startrecord + 50

parmdata={'partnerid':54,'siteid' : 5346, 'site':'https://xjobs.brassring.com/TGWebHost'}

test = JobScraper('BrassRingSpider',parmdata)

In [7]:
mySpider = test.Spider()


<body id="bdyError">
<span role="main">
<table border="0" cellpadding="0" cellspacing="0" id="ErrorTbl" role="presentation" width="100%">
<tr>
<td id="imagecontainer" valign="top"><img alt="Error" src="images/error.gif"/></td>
<td id="messagecontainer" valign="top"><h1 class="PAGEtitle h1Title">Error</h1>You have encountered a system error. We apologize for the inconvenience. Please click the Back button to try again or (in case of continued problems) click the Home button to return to the beginning.</td>
</tr>
<tr>
<td colspan="2"><div class="text" id="errorid">Error ID:<img alt="" src="images/pixel.gif" width="5px"/>69406231-f405-4d42-a420-80601ae80a75</div></td>
</tr>
<tr>
<td colspan="2"><div id="buttons"><input id="backButton" name="backButton" onclick="javascript:history.go(-1);" type="button" value="Back"/><img alt="" src="images/pixel.gif" width="25px"/><input aria-label="home" id="homeButton" name="homeButton" onclick="javascript:window.location.href='home.aspx?partnerid=54&amp;siteid=5346';" role="button" type="button" value="Home"/></div></td>
</tr>
</table></span>
</body>
<body id="bdyError">
<span role="main">
<table border="0" cellpadding="0" cellspacing="0" id="ErrorTbl" role="presentation" width="100%">
<tr>
<td id="imagecontainer" valign="top"><img alt="Error" src="images/error.gif"/></td>
<td id="messagecontainer" valign="top"><h1 class="PAGEtitle h1Title">Error</h1>You have encountered a system error. We apologize for the inconvenience. Please click the Back button to try again or (in case of continued problems) click the Home button to return to the beginning.</td>
</tr>
<tr>
<td colspan="2"><div class="text" id="errorid">Error ID:<img alt="" src="images/pixel.gif" width="5px"/>2152ca46-f453-497f-a5a3-022658621b7c</div></td>
</tr>
<tr>
<td colspan="2"><div id="buttons"><input id="backButton" name="backButton" onclick="javascript:history.go(-1);" type="button" value="Back"/><img alt="" src="images/pixel.gif" width="25px"/><input aria-label="home" id="homeButton" name="homeButton" onclick="javascript:window.location.href='home.aspx?partnerid=54&amp;siteid=5346';" role="button" type="button" value="Home"/></div></td>
</tr>
</table></span>
</body>


In [8]:
test.data[0]


Out[8]:
{'About Us': u'GE is the world\u2019s Digital Industrial Company, transforming industry with software-defined machines and solutions that are connected, responsive and predictive. Through our people, leadership development, services, technology and scale, GE delivers better outcomes for global customers by speaking the language of industry.',
 'Business': u'GE Power',
 'Career Level': u'Experienced',
 'City': u'Tzafit',
 'Desired Characteristics': u'Willingness to work in 12 hour shifts, including nights, weekends and holidays.',
 'Essential Responsibilities': u'As an Operations Technician you will be part of a team working in the Control Room, ensuring a safe and efficient operation of the Power Station, according to the Grid Company demands and the contract with the Customer. Additional function-specific tasks and responsibilities include monitoring the Power Plant production both from the Control Room and from the field, receiving instructions from the grid operator and adjusting the units loads and/or voltage accordingly and issuing work requests whenever a deviation or malfunction is observed.',
 'Function': u'Services',
 'Function Segment': u'Services Project Management',
 'Job Number': u'2541899',
 'Location(s) Where Opening Is Available': u'Israel',
 'Postal Code': u'7987500',
 'Posted Position Title': u'Operations Technician',
 'Qualifications/Requirements': u"Practical Engineer (Electrical or Mechanical) ;\xa0 Three to five years' experience as an Operator in a Control Room, preferably in a Power Station;\xa0 Mother tongue Hebrew, fluent English (both verbal and written);\xa0 Able to learn fast and work independently;\xa0 Excellent interpersonal skills and teamwork;\xa0 Ability to work under pressure and make critical decisions when needed",
 'Relocation Assistance': u'No',
 'Role Summary/Purpose': u"We are looking for an Operations Technician for the Dalia (Tzafit) Power Plant, located 10 minutes' drive from Kfar Menahem. The Dalia (Tzafit) Plant is a 835MW combined-cycle power plant (CCPP), the largest of its kind in Israel, adding close to 8% of the country\u2019s installed power generation capacity . The 2 gas-fired combined-cycle units integrate Alstom\u2019s GT26 gas turbine, steam turbine, generator and heat recovery steam generator."}

In [8]:
for data in test.data:
    print data['Job Title']
    blah = nlp(data['Job Description'])
    for noun in blah.noun_chunks:
        print noun


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-8-6939bd0f352c> in <module>()
      1 for data in test.data:
----> 2     print data['Job Title']
      3     blah = nlp(data['Job Description'])
      4     for noun in blah.noun_chunks:
      5         print noun

KeyError: 'Job Title'

In [93]:
test.data[0]


Out[93]:
{'Business': u'GE Renewable Energy',
 'Business Segment': u'REN-ONW OnShore Wind',
 'Career Level': u'Entry-Level',
 'City': u'Bangalore',
 'Desired Characteristics': u'\u2022Knowledge in the field of Wind Turbine Technology. \u2022Programming skills to develop tools for process automations (Excel VBA, etc.) \u2022Ability to work with and across all global resources (US, Europe, India, China).',
 'Essential Responsibilities': u'\u2022Support NTI/NPI IP clearance programs by collaborating with technical experts, program managers & IP counsel. \u2022Explore IP landscaping opportunities and lead the initiative by performing patent analysis. Work with technical experts on white space  opportunities to come up with Invention Disclosure Letters (IDLs). \u2022Work with domain experts and IP counsel on performing patent invalidity studies.  \u2022Support patent evaluation board on IDL reviews and provide inputs to bring quality inventions. \u2022Periodically summarize granted patents across the globe and send alerts to relevant domain experts to work on IP concerns. \u2022Analyze recently published patents / utility models in various countries and release report monthly to relevant stakeholders. \u2022Support IP culture drive in BEC Renewable engineering team through IP workouts, tech trend sessions etc..  \u2022Support inventors in IDL submissions, prior arts searches & office actions.',
 'Function': u'Engineering/Technology',
 'Function Segment': u'Research',
 'Job Number': u'2406375',
 'Location(s) Where Opening Is Available': u'India',
 'Postal Code': u'560066',
 'Posted Position Title': u'Engineer, Intellectual Property',
 'Qualifications/Requirements': u'\u2022Master\u2019s degree in Electrical, Electrical & Electronics Engineering with \u2022Good knowledge on usage of patent search tools and databases like Thomson Innovation, PatBase, US PTO, Espacenet, JPO, WIPO etc. \u2022Good knowledge on various IP studies like Prior-art searches, Landscape analysis, patent invalidity and IP clearance. \u2022Good knowledge of USA, EU & Indian patent laws. \u2022Broad technical knowledge and understanding, ability to understand technical content quickly and to categorize it. \u2022Should possess good oral and written communication skills, with very good text comprehension skills. Affinity to written explanation of technical matters.',
 'Relocation Assistance': u'No'}

In [ ]: