In [1]:
import pandas as pd
import numpy as np
# Let's change how printing the series works. I need to see all elements in the Series
# source: http://stackoverflow.com/questions/19124601/is-there-a-way-to-pretty-print-the-entire-pandas-series-dataframe
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 70)
jobs = pd.read_csv('../data/stackoverflow_jobs.csv')
In [2]:
jobs.columns
Out[2]:
In [3]:
jobs.dtypes
Out[3]:
In [4]:
jobs.describe()
Out[4]:
In [5]:
jobs.head()
Out[5]:
In [6]:
jobs.tail()
Out[6]:
In [7]:
jobs.salary = jobs.salary.fillna('')
In [8]:
jobs['equity'] = jobs['salary'].str.contains('Provides Equity')
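As a quick sanity check, str.contains can be tried on a couple of made-up salary strings; the values below are hypothetical and not rows from the actual CSV:
In [ ]:
# hypothetical salary strings, just to show what the boolean equity flag looks like
sample = pd.Series(['$80,000 - 100,000 / Provides Equity', '£40,000 - 55,000', ''])
sample.str.contains('Provides Equity')
# expected: True, False, False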
We need to extract the currency, salary_low and salary_high from the salary field and copy them into columns of their own.
We use a regex to capture parts of the salary field into three named groups (a small demo of the pattern follows the next cell):
- currency captures zero or more non-digit characters
- number_low captures one or more digits and separators (currently only the comma is used as a separator)
- number_high captures all digits and separators from the dash up to the end of the string
In [9]:
# strip the "Provides Equity" flag and stray slashes so only the currency symbol and the range remain
salary = jobs.salary.map(lambda x: x.replace("Provides Equity", "").replace("/", "").strip())
# capture the currency prefix and the low/high ends of the range into named groups
sal = salary.str.extract(r'(?P<currency>[^\d]*)(?P<number_low>[\d,]+) - (?P<number_high>[\d,]+$)')
sal.number_low = sal.number_low.fillna(0)
sal.number_high = sal.number_high.fillna(0)
sal.currency = sal.currency.fillna('')
# mapping the new columns back
jobs['currency'] = sal.currency
jobs['salary_low'] = sal.number_low
jobs['salary_high'] = sal.number_high
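To make the capture groups concrete, here is a sketch of the pattern applied to one hypothetical, already-cleaned salary string (made up for illustration, not taken from the data):
In [ ]:
# the same named-group pattern on a single example string
example = pd.Series(['$80,000 - 100,000'])
example.str.extract(r'(?P<currency>[^\d]*)(?P<number_low>[\d,]+) - (?P<number_high>[\d,]+$)')
# expected: currency '$', number_low '80,000', number_high '100,000'
Note that the captured values are still strings and keep their commas; if numeric comparisons are needed later, they would have to go through str.replace(',', '') and pd.to_numeric first.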
In [10]:
jobs.location = jobs.location.fillna('') # sometimes we have nothing in the location field.
# split the location field on commas; each part becomes its own column
location_split = lambda x: pd.Series([i for i in x.split(',')])
locations = jobs['location'].apply(location_split)
locations.rename(columns={0: 'city', 1: 'location_1', 2: 'location_2'}, inplace=True)
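Here is a rough sketch of how the split behaves on a few hypothetical location strings (not actual rows from the CSV) with two and three comma-separated parts:
In [ ]:
# hypothetical locations: city + country, city + state, and city + state + country
demo = pd.Series(['Berlin, Deutschland', 'Austin, TX', 'New York, NY, US'])
demo.apply(lambda x: pd.Series(x.split(',')))
# column 0 holds the city; columns 1 and 2 hold the remaining parts (NaN where missing)
# the split parts keep their leading spaces until they are stripped in the next cells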
In [11]:
# Fixing US states: if the middle part of the location is a US state abbreviation, the country part is "US"
us_states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
"HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
"MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
"NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
"SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
locations['location_1'] = locations['location_1'].str.strip()
locations.loc[locations['location_1'].isin(us_states),'location_2'] = "US"
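A minimal check of the state lookup, using hypothetical location_1 values (it reuses the us_states list defined above):
In [ ]:
# only the state abbreviation should get location_2 set to "US"
demo_states = pd.DataFrame({'location_1': ['TX', 'Deutschland', None]})
demo_states.loc[demo_states['location_1'].isin(us_states), 'location_2'] = 'US'
demo_states
# expected: location_2 is 'US' for the TX row and NaN for the others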
In [12]:
# if location_2 is null then location_1 column has the country
# if location_2 is not null then location_2 has the country and location_1 contains the state
jobs['country'] = np.where(locations['location_2'].isnull(), locations['location_1'], locations['location_2'])
jobs['state'] = np.where(locations['location_2'].notnull(), locations['location_1'], '')
jobs['city'] = locations['city']
# filling na for country
jobs.country = jobs.country.fillna('')
# stripping spaces from new columns
jobs['city'] = jobs['city'].str.strip()
jobs['country'] = jobs['country'].str.strip()
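The country/state derivation can be checked on a small hand-built frame that mirrors the locations columns (hypothetical values):
In [ ]:
# one row where location_2 is missing (the country lives in location_1) and one US row
demo_loc = pd.DataFrame({'city': ['Berlin', 'Austin'],
                         'location_1': ['Deutschland', 'TX'],
                         'location_2': [np.nan, 'US']})
np.where(demo_loc['location_2'].isnull(), demo_loc['location_1'], demo_loc['location_2'])
# expected country values: ['Deutschland', 'US']
np.where(demo_loc['location_2'].notnull(), demo_loc['location_1'], '')
# expected state values: ['', 'TX']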
Now we can see which countries are posting the most jobs. It seems that the US, Deutschland, Germany and the UK are the top countries. But wait: aren't Germany and Deutschland the same country? Let's fix this, along with a few other countries that appear under their native names.
In [20]:
# replacing some of the country names with their english version
jobs.loc[jobs['country'].str.contains('Deutschland'),'country'] = 'Germany' # Deutschland -> Germany
jobs.loc[jobs['country'].str.contains('Österreich'),'country'] = 'Austria' # Österreich -> Austria
jobs.loc[jobs['country'].str.contains('Suisse'), 'country'] = 'Switzerland' # Suisse -> Switzerland
jobs.loc[jobs['country'].str.contains('Schweiz'), 'country'] = 'Switzerland' # Schweiz -> Switzerland
jobs.loc[jobs['country'].str.contains('Espagne'), 'country'] = 'Spain' # Espagne -> Spain
jobs.loc[jobs['country'].str.contains('République tchèque'), 'country'] = 'Czech Republic' # République tchèque -> Czech Republic
jobs.loc[jobs['country'].str.contains('Niederlande'), 'country'] = 'Netherlands' # Niederlande -> Netherlands
jobs['country'].value_counts().head()
Out[20]:
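An equivalent way to express the renames (a sketch, not the notebook's original approach) is a single Series.replace call with a mapping dict; note that replace matches whole values exactly, whereas str.contains above also catches partial matches:
In [ ]:
# the same native-name fixes collected in one dict; only the names handled above are listed
native_names = {'Deutschland': 'Germany', 'Österreich': 'Austria',
                'Suisse': 'Switzerland', 'Schweiz': 'Switzerland',
                'Espagne': 'Spain', 'République tchèque': 'Czech Republic',
                'Niederlande': 'Netherlands'}
jobs['country'] = jobs['country'].replace(native_names)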
In [56]:
top_cities = jobs['city'].value_counts().nlargest(100)
In [59]:
top_cities.head(20)
Out[59]:
In [60]:
# saving the result to csv
top_cities.to_csv('../data/top_cities.csv')
In [ ]: