Loading data


In [7]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import time 

jobs = pd.read_csv('../data/stackoverflow_jobs_enhanced.csv', thousands=',')

jobs.describe()


Out[7]:
jobid weeknum salary_low salary_high
count 22702.000000 22702.000000 2.270200e+04 2.270200e+04
mean 118549.537045 34.276628 1.487488e+04 2.262928e+04
std 11599.841612 7.442802 1.226277e+05 2.265854e+05
min 15018.000000 1.000000 0.000000e+00 0.000000e+00
25% 116655.750000 30.000000 0.000000e+00 0.000000e+00
50% 121380.000000 34.000000 0.000000e+00 0.000000e+00
75% 124591.750000 40.000000 0.000000e+00 0.000000e+00
max 132266.000000 52.000000 1.000000e+07 1.947870e+07

Filtering for important columns & splitting


In [3]:
tmp = jobs.loc[:,['jobid', 'city', 'state', 'country', 'tags', 'weeknum', 'salary', 'salary_low', 'salary_high', 'currency', 'equity']]

tag_split = lambda x: pd.Series([i for i in x[1:-1].split(',')])
tag_splitted = tmp['tags'].apply(tag_split)

tag_splitted = tag_splitted.fillna('')

In [4]:
t = []

for i in range(0, len(tmp)):    
    num_tags = len(tag_splitted.iloc[i])
    for j in range(0,num_tags):
        tech = tag_splitted.iloc[i][j]        
        if tech == '': break
        # creating new row 
        new_row = {}
        new_row['jobid'] = tmp.iloc[i]['jobid']
        new_row['city'] = tmp.iloc[i]['city']
        new_row['state'] = tmp.iloc[i]['state']
        new_row['country'] = tmp.iloc[i]['country']
        new_row['weeknum'] = tmp.iloc[i]['weeknum']
        new_row['salary_low'] = tmp.iloc[i]['salary_low']
        new_row['salary_high'] = tmp.iloc[i]['salary_high']
        new_row['equity'] = tmp.iloc[i]['equity']
        new_row['currency'] = tmp.iloc[i]['currency']            
        new_row['tech'] = tech
        # adding 
        t.append(new_row)
        
technologies = pd.DataFrame(t)

In [5]:
#  removing spaces from the beginning and ending
technologies.tech = technologies.tech.str.lstrip(' ')
technologies.tech = technologies.tech.str.rstrip(' ')

technologies.tech = technologies.tech.str.rstrip('"')
technologies.tech = technologies.tech.str.lstrip('"')

In [6]:
# getting the mean figure for salary
technologies['salary_mean'] = technologies[['salary_high','salary_low']].mean(axis=1)

technologies.fillna('', inplace=True)
# technologies.head(355)

Top 10 technologies in London


In [6]:
technologies[technologies.city == 'London'].groupby(['city', 'tech'])['jobid'].count().sort_values(ascending=False).nlargest(10)


Out[6]:
city    tech               
London  javascript             276
        java                   261
        python                 186
        amazon-web-services    137
        c#                     131
        angularjs               99
        linux                   91
        php                     87
        ruby                    81
        html                    81
        node.js                 81
Name: jobid, dtype: int64

... and Berlin


In [7]:
technologies[technologies.city == 'Berlin'].groupby(['city', 'tech'])['jobid'].count().sort_values(ascending=False).nlargest(10)


Out[7]:
city    tech      
Berlin  java          327
        javascript    302
        php           162
        mysql         141
        python        131
        angularjs     115
        sql           110
        css           102
        html           86
        linux          85
Name: jobid, dtype: int64

Top 10 technologies in Silicon Valley


In [8]:
technologies[technologies.state == 'CA'].groupby('tech')['jobid'].count().sort_values(ascending=False).nlargest(10)


Out[8]:
tech
javascript             541
java                   531
python                 529
c++                    278
linux                  230
amazon-web-services    210
sql                    196
angularjs              177
c#                     165
ruby-on-rails          165
Name: jobid, dtype: int64

... and Texas


In [9]:
technologies[technologies.state == 'TX'].groupby('tech')['jobid'].count().sort_values(ascending=False).nlargest(10)


Out[9]:
tech
javascript             143
java                    94
c#                      86
python                  70
angularjs               51
mysql                   45
amazon-web-services     42
linux                   39
sql                     38
.net                    37
Name: jobid, dtype: int64

Best place to do machine learning...


In [10]:
technologies[technologies.tech.str.startswith('machine-learning', na=False)].groupby('city')['city'].count().sort_values(ascending=False).nlargest(10)


Out[10]:
city
San Francisco    17
Rotterdam        15
London           11
Berlin           10
Rotterdam        14
London           10
Mountain View     5
New York          5
Name: city, dtype: int64

... or Apache Spark


In [11]:
technologies[technologies.tech.str.startswith('apache-spark', na=False)].groupby('city')['city'].count().sort_values(ascending=False).nlargest(10)


Out[11]:
city
Berlin           19
London           19
San Francisco    18
New York         12
Seattle           7
Toronto           6
Stockport         6
Hamburg           6
Ottawa            5
Singapore         5
Name: city, dtype: int64

In [12]:
# saving the result to csv 
technologies.to_csv('../data/technologies.csv', index = False)

In [13]:
top_tech = technologies['tech'].value_counts().nlargest(100)

Top technologies


In [14]:
# saving the top technologies
top_tech.to_csv('../data/top_technologies.csv')

In [ ]: