In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
# Load the enhanced Stack Overflow job postings. thousands=',' tells pandas to
# treat commas inside numbers as digit-group separators when parsing.
jobs_csv = '../data/stackoverflow_jobs_enhanced.csv'
jobs = pd.read_csv(jobs_csv, thousands=',')

# Summary statistics of the loaded frame, displayed as the cell output.
jobs.describe()
Out[7]:
In [3]:
# Keep only the columns used by the per-technology analysis below.
tmp = jobs.loc[:, ['jobid', 'city', 'state', 'country', 'tags', 'weeknum',
                   'salary', 'salary_low', 'salary_high', 'currency', 'equity']]


def tag_split(x):
    """Split one raw ``tags`` cell into a Series with one tag per position.

    The first and last characters of the cell are dropped (presumably the
    brackets enclosing a list literal -- TODO confirm against the CSV) and the
    remainder is split on commas.

    A cell that is not a string (e.g. the float NaN pandas uses for a missing
    value) is treated as having no tags, instead of raising ``TypeError`` when
    sliced. Such a cell yields a single empty-string entry, exactly like an
    empty ``[]`` cell does.
    """
    inner = x[1:-1] if isinstance(x, str) else ''
    return pd.Series(inner.split(','))


# One row per job, one column per tag position. Jobs with fewer tags than the
# widest row get NaN from the alignment, which is replaced by '' so that an
# empty string uniformly means "no tag in this position".
tag_splitted = tmp['tags'].apply(tag_split).fillna('')
In [4]:
# Build a long-format table with one row per (job, technology tag) pair.
# Column order matches the order in which the output columns are emitted.
job_columns = ['jobid', 'city', 'state', 'country', 'weeknum',
               'salary_low', 'salary_high', 'equity', 'currency']

t = []
# Each job's fields are pulled out once per job (as a plain dict) rather than
# re-materialising the row with .iloc for every field of every tag.
# tmp and tag_splitted are row-aligned: tag_splitted was derived from tmp['tags'].
for job_fields, job_tags in zip(tmp[job_columns].to_dict('records'),
                                tag_splitted.values):
    for tech in job_tags:
        # '' marks the end of this job's tags (padding or an empty tag cell),
        # so nothing after the first blank is emitted.
        if tech == '':
            break
        t.append(dict(job_fields, tech=tech))

# Passing the columns explicitly keeps the expected schema even when no tags
# were found at all; otherwise the frame would have no 'tech' column and the
# clean-up steps that follow would fail.
technologies = pd.DataFrame(t, columns=job_columns + ['tech'])
In [5]:
# Normalise the tag text: trim surrounding spaces first, then strip any double
# quotes left wrapping the tag.
technologies['tech'] = technologies['tech'].str.strip(' ').str.strip('"')
In [6]:
# Add the row-wise mean of the salary bounds as 'salary_mean', then replace
# every remaining missing value in the frame with an empty string.
# NOTE(review): filling numeric columns with '' turns them into object dtype,
# and missing cities become an '' group in later group-bys -- confirm this is
# intended before relying on these columns for arithmetic.
technologies = (
    technologies
    .assign(salary_mean=lambda frame: frame[['salary_high', 'salary_low']].mean(axis=1))
    .fillna('')
)
In [6]:
# Ten most frequent technologies among rows whose city is London, counted by jobid.
london_rows = technologies[technologies.city == 'London']
london_tech_counts = london_rows.groupby(['city', 'tech'])['jobid'].count()
london_tech_counts.sort_values(ascending=False).nlargest(10)
Out[6]:
In [7]:
# Ten most frequent technologies among rows whose city is Berlin, counted by jobid.
berlin_rows = technologies[technologies.city == 'Berlin']
berlin_tech_counts = berlin_rows.groupby(['city', 'tech'])['jobid'].count()
berlin_tech_counts.sort_values(ascending=False).nlargest(10)
Out[7]:
In [8]:
# Ten most frequent technologies among rows whose state is CA, counted by jobid.
ca_rows = technologies[technologies.state == 'CA']
ca_tech_counts = ca_rows.groupby('tech')['jobid'].count()
ca_tech_counts.sort_values(ascending=False).nlargest(10)
Out[8]:
In [9]:
# Ten most frequent technologies among rows whose state is TX, counted by jobid.
tx_rows = technologies[technologies.state == 'TX']
tx_tech_counts = tx_rows.groupby('tech')['jobid'].count()
tx_tech_counts.sort_values(ascending=False).nlargest(10)
Out[9]:
In [10]:
# Ten cities with the most rows whose tag starts with 'machine-learning'.
# na=False makes rows without a usable tag value count as non-matching.
is_ml = technologies.tech.str.startswith('machine-learning', na=False)
ml_city_counts = technologies[is_ml].groupby('city')['city'].count()
ml_city_counts.sort_values(ascending=False).nlargest(10)
Out[10]:
In [11]:
# Ten cities with the most rows whose tag starts with 'apache-spark'.
# na=False makes rows without a usable tag value count as non-matching.
is_spark = technologies.tech.str.startswith('apache-spark', na=False)
spark_city_counts = technologies[is_spark].groupby('city')['city'].count()
spark_city_counts.sort_values(ascending=False).nlargest(10)
Out[11]:
In [12]:
# Persist the job/technology table; the row index is not written to the file.
technologies_csv = '../data/technologies.csv'
technologies.to_csv(technologies_csv, index=False)
In [13]:
# Count how often each technology tag occurs and keep the 100 most common.
tech_counts = technologies['tech'].value_counts()
top_tech = tech_counts.nlargest(100)
In [14]:
# Persist the top-technology counts (the tag labels are written as the index).
top_tech_csv = '../data/top_technologies.csv'
top_tech.to_csv(top_tech_csv)
In [ ]: