Loading data



In [7]:

    
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import time 

jobs = pd.read_csv('../data/stackoverflow_jobs_enhanced.csv', thousands=',')

jobs.describe()









    Out[7]:






  
    
      
      jobid
      weeknum
      salary_low
      salary_high
    
  
  
    
      count
      22702.000000
      22702.000000
      2.270200e+04
      2.270200e+04
    
    
      mean
      118549.537045
      34.276628
      1.487488e+04
      2.262928e+04
    
    
      std
      11599.841612
      7.442802
      1.226277e+05
      2.265854e+05
    
    
      min
      15018.000000
      1.000000
      0.000000e+00
      0.000000e+00
    
    
      25%
      116655.750000
      30.000000
      0.000000e+00
      0.000000e+00
    
    
      50%
      121380.000000
      34.000000
      0.000000e+00
      0.000000e+00
    
    
      75%
      124591.750000
      40.000000
      0.000000e+00
      0.000000e+00
    
    
      max
      132266.000000
      52.000000
      1.000000e+07
      1.947870e+07

Filtering for important columns & splitting



In [3]:

    
tmp = jobs.loc[:,['jobid', 'city', 'state', 'country', 'tags', 'weeknum', 'salary', 'salary_low', 'salary_high', 'currency', 'equity']]

tag_split = lambda x: pd.Series([i for i in x[1:-1].split(',')])
tag_splitted = tmp['tags'].apply(tag_split)

tag_splitted = tag_splitted.fillna('')



In [4]:

    
t = []

for i in range(0, len(tmp)):    
    num_tags = len(tag_splitted.iloc[i])
    for j in range(0,num_tags):
        tech = tag_splitted.iloc[i][j]        
        if tech == '': break
        # creating new row 
        new_row = {}
        new_row['jobid'] = tmp.iloc[i]['jobid']
        new_row['city'] = tmp.iloc[i]['city']
        new_row['state'] = tmp.iloc[i]['state']
        new_row['country'] = tmp.iloc[i]['country']
        new_row['weeknum'] = tmp.iloc[i]['weeknum']
        new_row['salary_low'] = tmp.iloc[i]['salary_low']
        new_row['salary_high'] = tmp.iloc[i]['salary_high']
        new_row['equity'] = tmp.iloc[i]['equity']
        new_row['currency'] = tmp.iloc[i]['currency']            
        new_row['tech'] = tech
        # adding 
        t.append(new_row)
        
technologies = pd.DataFrame(t)



In [5]:

    
#  removing spaces from the beginning and ending
technologies.tech = technologies.tech.str.lstrip(' ')
technologies.tech = technologies.tech.str.rstrip(' ')

technologies.tech = technologies.tech.str.rstrip('"')
technologies.tech = technologies.tech.str.lstrip('"')



In [6]:

    
# getting the mean figure for salary
technologies['salary_mean'] = technologies[['salary_high','salary_low']].mean(axis=1)

technologies.fillna('', inplace=True)
# technologies.head(355)

Top 10 technologies in London



In [6]:

    
technologies[technologies.city == 'London'].groupby(['city', 'tech'])['jobid'].count().sort_values(ascending=False).nlargest(10)









    Out[6]:





city    tech               
London  javascript             276
        java                   261
        python                 186
        amazon-web-services    137
        c#                     131
        angularjs               99
        linux                   91
        php                     87
        ruby                    81
        html                    81
        node.js                 81
Name: jobid, dtype: int64

... and Berlin



In [7]:

    
technologies[technologies.city == 'Berlin'].groupby(['city', 'tech'])['jobid'].count().sort_values(ascending=False).nlargest(10)









    Out[7]:





city    tech      
Berlin  java          327
        javascript    302
        php           162
        mysql         141
        python        131
        angularjs     115
        sql           110
        css           102
        html           86
        linux          85
Name: jobid, dtype: int64

Top 10 technologies in Silicon Valley



In [8]:

    
technologies[technologies.state == 'CA'].groupby('tech')['jobid'].count().sort_values(ascending=False).nlargest(10)









    Out[8]:





tech
javascript             541
java                   531
python                 529
c++                    278
linux                  230
amazon-web-services    210
sql                    196
angularjs              177
c#                     165
ruby-on-rails          165
Name: jobid, dtype: int64

... and Texas



In [9]:

    
technologies[technologies.state == 'TX'].groupby('tech')['jobid'].count().sort_values(ascending=False).nlargest(10)









    Out[9]:





tech
javascript             143
java                    94
c#                      86
python                  70
angularjs               51
mysql                   45
amazon-web-services     42
linux                   39
sql                     38
.net                    37
Name: jobid, dtype: int64

Best place to do machine learning...



In [10]:

    
technologies[technologies.tech.str.startswith('machine-learning', na=False)].groupby('city')['city'].count().sort_values(ascending=False).nlargest(10)









    Out[10]:





city
San Francisco    17
Rotterdam        15
London           11
Berlin           10
Rotterdam        14
London           10
Mountain View     5
New York          5
Name: city, dtype: int64

... or Apache Spark



In [11]:

    
technologies[technologies.tech.str.startswith('apache-spark', na=False)].groupby('city')['city'].count().sort_values(ascending=False).nlargest(10)









    Out[11]:





city
Berlin           19
London           19
San Francisco    18
New York         12
Seattle           7
Toronto           6
Stockport         6
Hamburg           6
Ottawa            5
Singapore         5
Name: city, dtype: int64



In [12]:

    
# saving the result to csv 
technologies.to_csv('../data/technologies.csv', index = False)



In [13]:

    
top_tech = technologies['tech'].value_counts().nlargest(100)

Top technologies



In [14]:

    
# saving the top technologies
top_tech.to_csv('../data/top_technologies.csv')



In [ ]:

	jobid	weeknum	salary_low	salary_high
count	22702.000000	22702.000000	2.270200e+04	2.270200e+04
mean	118549.537045	34.276628	1.487488e+04	2.262928e+04
std	11599.841612	7.442802	1.226277e+05	2.265854e+05
min	15018.000000	1.000000	0.000000e+00	0.000000e+00
25%	116655.750000	30.000000	0.000000e+00	0.000000e+00
50%	121380.000000	34.000000	0.000000e+00	0.000000e+00
75%	124591.750000	40.000000	0.000000e+00	0.000000e+00
max	132266.000000	52.000000	1.000000e+07	1.947870e+07