In [1]:
import pandas as pd
import numpy as np
# Let's change how printing the series works. I need to see all elements in the Series
# source: http://stackoverflow.com/questions/19124601/is-there-a-way-to-pretty-print-the-entire-pandas-series-dataframe
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 70)
jobs = pd.read_csv('../data/stackoverflow_jobs.csv')
In [2]:
jobs.columns
Out[2]:
In [3]:
jobs.dtypes
Out[3]:
In [4]:
jobs.describe()
Out[4]:
In [5]:
jobs.head()
Out[5]:
In [6]:
jobs.tail()
Out[6]:
In [7]:
jobs.salary = jobs.salary.fillna('')
In [8]:
jobs['equity'] = jobs['salary'].str.contains('Provides Equity')
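As a quick sanity check, str.contains can be tried on a couple of made-up salary strings; the values below are hypothetical and not rows from the actual CSV:
In [ ]:
# hypothetical salary strings, just to show what the boolean equity flag looks like
sample = pd.Series(['$80,000 - 100,000 / Provides Equity', '£40,000 - 55,000', ''])
sample.str.contains('Provides Equity')
# expected: True, False, False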
We need to extract the currency, salary_low and salary_high from the salary field and copy them into columns of their own.
We use a regex to capture parts of the salary field into three named groups (a small demo of the pattern follows the next cell):
- currency captures zero or more non-digit characters
- number_low captures one or more digits and separators (currently only the comma is used as a separator)
- number_high captures all digits and separators from the dash up to the end of the string
In [9]:
# strip the "Provides Equity" flag and stray slashes so only the currency symbol and the range remain
salary = jobs.salary.map(lambda x: x.replace("Provides Equity", "").replace("/", "").strip())
# capture the currency prefix and the low/high ends of the range into named groups
sal = salary.str.extract(r'(?P<currency>[^\d]*)(?P<number_low>[\d,]+) - (?P<number_high>[\d,]+$)')
sal.number_low = sal.number_low.fillna(0)
sal.number_high = sal.number_high.fillna(0)
sal.currency = sal.currency.fillna('')
# mapping the new columns back
jobs['currency'] = sal.currency
jobs['salary_low'] = sal.number_low
jobs['salary_high'] = sal.number_high
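To make the capture groups concrete, here is a sketch of the pattern applied to one hypothetical, already-cleaned salary string (made up for illustration, not taken from the data):
In [ ]:
# the same named-group pattern on a single example string
example = pd.Series(['$80,000 - 100,000'])
example.str.extract(r'(?P<currency>[^\d]*)(?P<number_low>[\d,]+) - (?P<number_high>[\d,]+$)')
# expected: currency '$', number_low '80,000', number_high '100,000'
Note that the captured values are still strings and keep their commas; if numeric comparisons are needed later, they would have to go through str.replace(',', '') and pd.to_numeric first.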
In [10]:
jobs.location = jobs.location.fillna('') # sometimes we have nothing in the location field.
# split the location field on commas; each part becomes its own column
location_split = lambda x: pd.Series([i for i in x.split(',')])
locations = jobs['location'].apply(location_split)
locations.rename(columns={0: 'city', 1: 'location_1', 2: 'location_2'}, inplace=True)
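Here is a rough sketch of how the split behaves on a few hypothetical location strings (not actual rows from the CSV) with two and three comma-separated parts:
In [ ]:
# hypothetical locations: city + country, city + state, and city + state + country
demo = pd.Series(['Berlin, Deutschland', 'Austin, TX', 'New York, NY, US'])
demo.apply(lambda x: pd.Series(x.split(',')))
# column 0 holds the city; columns 1 and 2 hold the remaining parts (NaN where missing)
# the split parts keep their leading spaces until they are stripped in the next cells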
In [11]:
# Fixing US states: if the middle part of the location is a US state abbreviation, the country part is "US"
us_states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
"HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
"MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
"NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
"SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
locations['location_1'] = locations['location_1'].str.strip()
locations.loc[locations['location_1'].isin(us_states),'location_2'] = "US"
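A minimal check of the state lookup, using hypothetical location_1 values (it reuses the us_states list defined above):
In [ ]:
# only the state abbreviation should get location_2 set to "US"
demo_states = pd.DataFrame({'location_1': ['TX', 'Deutschland', None]})
demo_states.loc[demo_states['location_1'].isin(us_states), 'location_2'] = 'US'
demo_states
# expected: location_2 is 'US' for the TX row and NaN for the others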
In [12]:
# if location_2 is null then location_1 column has the country
# if location_2 is not null then location_2 has the country and location_1 contains the state
jobs['country'] = np.where(locations['location_2'].isnull(), locations['location_1'], locations['location_2'])
jobs['state'] = np.where(locations['location_2'].notnull(), locations['location_1'], '')
jobs['city'] = locations['city']
# filling na for country
jobs.country = jobs.country.fillna('')
# stripping spaces from new columns
jobs['city'] = jobs['city'].str.strip()
jobs['country'] = jobs['country'].str.strip()
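The country/state derivation can be checked on a small hand-built frame that mirrors the locations columns (hypothetical values):
In [ ]:
# one row where location_2 is missing (the country lives in location_1) and one US row
demo_loc = pd.DataFrame({'city': ['Berlin', 'Austin'],
                         'location_1': ['Deutschland', 'TX'],
                         'location_2': [np.nan, 'US']})
np.where(demo_loc['location_2'].isnull(), demo_loc['location_1'], demo_loc['location_2'])
# expected country values: ['Deutschland', 'US']
np.where(demo_loc['location_2'].notnull(), demo_loc['location_1'], '')
# expected state values: ['', 'TX']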
Now we can see which countries are posting the most jobs. It seems that the US, Deutschland, Germany and the UK are the top countries. But wait: aren't Germany and Deutschland the same country? Let's fix this, along with a few other countries that appear under their native names.
In [20]:
# replacing some of the country names with their english version
jobs.loc[jobs['country'].str.contains('Deutschland'),'country'] = 'Germany' # Deutschland -> Germany
jobs.loc[jobs['country'].str.contains('Österreich'),'country'] = 'Austria' # Österreich -> Austria
jobs.loc[jobs['country'].str.contains('Suisse'), 'country'] = 'Switzerland' # Suisse -> Switzerland
jobs.loc[jobs['country'].str.contains('Schweiz'), 'country'] = 'Switzerland' # Schweiz -> Switzerland
jobs.loc[jobs['country'].str.contains('Espagne'), 'country'] = 'Spain' # Espagne -> Spain
jobs.loc[jobs['country'].str.contains('République tchèque'), 'country'] = 'Czech Republic' # République tchèque -> Czech Republic
jobs.loc[jobs['country'].str.contains('Niederlande'), 'country'] = 'Netherlands' # Niederlande -> Netherlands
jobs['country'].value_counts().head()
Out[20]:
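An equivalent way to express the renames (a sketch, not the notebook's original approach) is a single Series.replace call with a mapping dict; note that replace matches whole values exactly, whereas str.contains above also catches partial matches:
In [ ]:
# the same native-name fixes collected in one dict; only the names handled above are listed
native_names = {'Deutschland': 'Germany', 'Österreich': 'Austria',
                'Suisse': 'Switzerland', 'Schweiz': 'Switzerland',
                'Espagne': 'Spain', 'République tchèque': 'Czech Republic',
                'Niederlande': 'Netherlands'}
jobs['country'] = jobs['country'].replace(native_names)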
In [56]:
top_cities = jobs['city'].value_counts().nlargest(100)
In [59]:
top_cities.head(20)
Out[59]:
In [60]:
# saving the result to csv
top_cities.to_csv('../data/top_cities.csv')
In [ ]: