Analysis of Indeed.com Job Postings

Imports


In [18]:
from time import sleep
from urllib.parse import quote_plus

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Functions to web-scrape Indeed.com using Beautiful Soup


In [241]:
def text_preprocess(text):
    """Turn free text into a value that is safe to splice into a URL query string.

    The text is split on runs of whitespace, each token is percent-encoded,
    and the tokens are re-joined with ``+`` (the query-string encoding of a
    space). Plain alphanumeric input is returned exactly as before, e.g.
    ``"Data Scientist" -> "Data+Scientist"``.

    Parameters
    ----------
    text : str
        Raw search term, e.g. a job title or a city name.

    Returns
    -------
    str
        The encoded term; an empty string when ``text`` contains no tokens.
    """
    # Encode each token so characters such as "&", "+" or "#" in the search
    # term cannot be mistaken for query-string syntax ("R&D" -> "R%26D").
    return "+".join(quote_plus(token) for token in text.split())

In [192]:
def extract_job_title(soup):
    """Collect the ``title`` attribute of every job-title link on a results page.

    Looks inside each ``<div class="row">`` under ``soup`` for ``<a>`` tags
    whose ``data-tn-element`` attribute equals ``"jobTitle"``.

    Parameters
    ----------
    soup : object exposing ``find_all`` (the caller passes a BeautifulSoup document)

    Returns
    -------
    list
        One entry per matching anchor, in document order.
    """
    return [
        anchor["title"]
        for row in soup.find_all(name="div", attrs={"class": "row"})
        for anchor in row.find_all(name="a", attrs={"data-tn-element": "jobTitle"})
    ]

In [194]:
def extract_company(soup):
    """Collect company names from every ``<div class="row">`` under ``soup``.

    For each row, the text of all ``<span class="company">`` tags is used.
    Only when a row has none of those are its
    ``<span class="result-link-source">`` tags used instead. A row with
    neither contributes nothing to the result.

    Parameters
    ----------
    soup : object exposing ``find_all`` (the caller passes a BeautifulSoup document)

    Returns
    -------
    list of str
        Whitespace-stripped span texts, in document order.
    """
    names = []
    for card in soup.find_all(name="div", attrs={"class": "row"}):
        # `or` short-circuits, so the fallback lookup only runs when the
        # primary lookup comes back empty.
        spans = (
            card.find_all(name="span", attrs={"class": "company"})
            or card.find_all(name="span", attrs={"class": "result-link-source"})
        )
        names.extend(span.text.strip() for span in spans)
    return names

In [195]:
def extract_location(soup):
    """Collect the text of every ``<span class="location">`` under ``soup``.

    Parameters
    ----------
    soup : object exposing ``find_all`` (the caller passes a BeautifulSoup document)

    Returns
    -------
    list of str
        Whitespace-stripped location strings, in document order.
    """
    # Use the PEP 8 `find_all` name like the other extractors; `findAll` is
    # only a deprecated alias kept for Beautiful Soup 3 compatibility.
    spans = soup.find_all(name="span", attrs={"class": "location"})
    # Strip surrounding whitespace so locations are cleaned the same way as
    # company names and summaries.
    return [span.text.strip() for span in spans]

In [211]:
def extract_summary(soup):
    """Collect the text of every ``<span class="summary">`` under ``soup``.

    Parameters
    ----------
    soup : object exposing ``find_all`` (the caller passes a BeautifulSoup document)

    Returns
    -------
    list of str
        Whitespace-stripped summary strings, in document order.
    """
    # Use the PEP 8 `find_all` name like the other extractors; `findAll` is
    # only a deprecated alias kept for Beautiful Soup 3 compatibility.
    spans = soup.find_all(name="span", attrs={"class": "summary"})
    return [span.text.strip() for span in spans]

In [261]:
def get_indeed_jobs(city = 'Seattle', job_title = 'Data Scientist', n_jobs = 100):
    """Scrape Indeed.com search results into a DataFrame.

    Results are requested in pages of 50 (``limit=50`` with ``radius=25``
    hard-coded in the query string), pausing one second after each request.

    Parameters
    ----------
    city : str
        Location to search around.
    job_title : str
        Search query.
    n_jobs : int
        Maximum number of postings to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Job Title'``, ``'Company'``, ``'Location'`` and
        ``'Summary'``; at most ``n_jobs`` rows.

    Raises
    ------
    requests.exceptions.HTTPError
        If a results page comes back with a 4xx/5xx status.
    requests.exceptions.Timeout
        If the server does not respond within the timeout.
    ValueError
        If the four field extractors return different numbers of values,
        which means the columns cannot be lined up row by row.
    """
    page_size = 50        # results requested per page
    request_timeout = 10  # seconds; without it a stalled connection hangs forever

    city = text_preprocess(city)
    job_title = text_preprocess(job_title)

    cname = []
    jtitle = []
    location = []
    summary = []

    for start in range(0, n_jobs, page_size):
        URL = (
            "https://www.indeed.com/jobs?q=" + job_title
            + "&l=" + city
            + "&limit=" + str(page_size)
            + "&radius=25&start=" + str(start)
        )
        page = requests.get(URL, timeout=request_timeout)
        # Fail loudly on an error or block page instead of parsing it into
        # silently empty results.
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html.parser")
        cname.extend(extract_company(soup))
        jtitle.extend(extract_job_title(soup))
        location.extend(extract_location(soup))
        summary.extend(extract_summary(soup))
        sleep(1)  # be polite to the server between page requests

    columns = {
        'Job Title': jtitle,
        'Company': cname,
        'Location': location,
        'Summary': summary,
    }
    # Each field is scraped independently, so verify they line up before
    # building the frame; pandas would otherwise raise a far less helpful error.
    lengths = {name: len(values) for name, values in columns.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            "Scraped fields have different lengths and cannot be aligned: "
            + str(lengths)
        )

    # Whole pages are fetched, so trim to the number of postings asked for.
    return pd.DataFrame(columns).iloc[:n_jobs]

Getting Data


In [262]:
# Request a single results page (n_jobs=50) of Data Scientist postings around Seattle and preview the first rows.
get_indeed_jobs(job_title='Data Scientist', city='Seattle', n_jobs=50).head()


Out[262]:
Job Title Company Location Summary
0 Data Scientist Indeed Seattle, WA As a Data Scientist at Indeed your role is to ...
1 Deep Learning Engineer/Scientist EchoNous, Inc Redmond, WA Developing deep learning models with unstructu...
2 Principal Data Scientist Philips Bothell, WA Configure prototypes according to local clinic...
3 Product Scientist Indeed Seattle, WA Relevant experience (including internships and...
4 Machine Learning Engineer AnswerIQ Bellevue, WA Create and manage ML infrastructure and data p...

In [263]:
# Same query for Chemical Engineer postings around Seattle; preview the first rows.
get_indeed_jobs(job_title='Chemical Engineer', city='Seattle', n_jobs=50).head()


Out[263]:
Job Title Company Location Summary
0 Consultant Engineer FM Global Bellevue, WA 98009 When you join our team as a Consultant Enginee...
1 Nanofabrication Engineer Modern Electron Bellevue, WA Modern Electron has an immediate opening for a...
2 Engineer - Environmental Consulting SoundEarth Strategies, Inc. Seattle, WA Professional Engineer (PE). Registration as a ...
3 Methods Process Analyst BOEING Everett, WA 98204 (Holly area) We are engineers and technicians. Participates...
4 Process Engineer BioLife Solutions Bothell, WA 98021 Technical understanding of aqueous chemical fo...