In [1]:
import pandas as pd
import numpy as np
import re
from bokeh.plotting import *
output_notebook()


Loading BokehJS ...

In [2]:
def plen(x): print(len(x))

def format_column(c):
    return c.replace(' ', '_').lower()

df = pd.read_csv('salaries.csv')
df.columns = [format_column(c) for c in df.columns]
df.head()


Out[2]:
timestamp employer location job_title years_at_employer years_of_experience annual_base_pay signing_bonus annual_bonus annual_stock_value/bonus gender additional_comments
0 3/21/2016 12:54:49 NaN Raleigh, NC Software Developer NaN 18 122000 NaN NaN NaN Male NaN
1 3/21/2016 12:58:52 Opower San Francisco, CA Systems Engineer 2 13 125000 5000 0 5000 shares Male Don't work here.
2 3/21/2016 12:58:57 NaN San Diego CA Systems engineer 1 1 of employment 83,000 0 $50.00 0 Male NaN
3 3/21/2016 12:58:58 Walmart Bentonville, AR Senior Developer 8 15 65,000 NaN 5,000 3,000 Male NaN
4 3/21/2016 12:59:11 Vertical Knowledge Cleveland, OH Software Engineer 1 4 86000 5000 6000 0 Male NaN

In [3]:
def create_standard_location(df):
    
    def standardize(loc):
        if pd.isnull(loc):
            return ''
        return loc.split(',')[0].lower().strip()
    
    df['std_location'] = df.location.map(standardize)
    
    return df

df = create_standard_location(df)

In [4]:
def create_clean_salaries(df):

    def clean_individual_salary(x):    
        """
        Clean the salary data
        1. reject some malformed/difficult to parse data
        2. try to detect currency and return the type
        """    
        final_salary = np.nan
        salary_type = np.nan
        if pd.notnull(x):        
            salary_type = 'USD' # default
            x = x.lower()        
            x = re.sub("[\s,]", "", x)    
            x = re.sub("\.[0-9]*", "", x)
            if '€' in x or 'eur' in x:
                salary_type = 'Euro'
                x = re.sub(r'[€(eur)(euro)]', '', x)
            if '$' in x or 'usd' in x:            
                x = re.sub(r'[\$(usd)]', '', x)
            if '£' in x or 'gbp' in x:
                salary_type = 'GBP'
                x = x.replace('[£(gbp)]', '')                                
            try:
                final_salary = int(x)        

            except:
                #print("convert to numeric failure: '{}'".format(x))
                final_salary = np.nan
                salary_type = np.nan
        return (final_salary, salary_type)

    clean_salary_ufunc = np.frompyfunc(clean_individual_salary, 1, 2)
    final_salary, salary_type = clean_salary_ufunc(df.annual_base_pay)
    df['salary'] = final_salary
    df['salary_type'] = salary_type
    return df


def drop_unreasonable_salaries(df):    
    reasonable = df['salary'].map(lambda x: x > 20000 and x < 360000)
    print("resonable salaries:")
    print(reasonable.value_counts())    
    df = df[reasonable]
    return df

df = create_clean_salaries(df)
plen(df)
df = drop_unreasonable_salaries(df)
plen(df)


3583
resonable salaries:
True     3070
False     513
Name: salary, dtype: int64
3070

In [5]:
df.loc


Out[5]:
<pandas.core.indexing._LocIndexer at 0x7f4b60ef6828>

In [6]:
def create_sex(df):
    
    def convert_gender(x):
        if pd.isnull(x):
            return x
        elif 'female' in x.lower():
            return 'F'
        elif 'male' in x.lower():
            return 'M'
        else:
            return 'other'
        
    df['sex'] = df.gender.map(convert_gender)
    return df

df = create_sex(df)
df = df.dropna(subset='sex'.split())
df.sex.value_counts()


Out[6]:
M        2806
F         134
other      14
Name: sex, dtype: int64

In [7]:
cols = 'location job_title salary years_at_employer years_of_experience sex'.split()
df = df.sort_values('salary', ascending=False)

In [8]:
def drop_unreasonable_years(df):
    plen(df)
    df = df.dropna(subset='salary years_job years_experience'.split())
    df = df[df.years_experience > 0]
    df = df[df.years_experience < 50]
    df = df[df.years_experience < 50]
    df = df[df.years_job >= 0]
    df = df[df.years_job <= 50]
    plen(df)
    return df

df['years_job'] = pd.to_numeric(df.years_at_employer, errors='coerce')
df['years_experience'] = pd.to_numeric(df.years_of_experience, errors='coerce')
df = drop_unreasonable_years(df)


2954
2632

In [9]:
df[cols].head(5)


Out[9]:
location job_title salary years_at_employer years_of_experience sex
677 san francisco CTO Google rail product 356000 5 12 M
488 Los Gatos Senior Software Engineer 350000 1 10 M
1524 NaN General Manager 350000 0.5 20 M
3099 Menlo Park Strategic Value Executive 350000 5 10 M
2494 Los Gatos, CA Senior Software Engineer 330000 2 10 M

In [10]:
df[cols].tail(5)


Out[10]:
location job_title salary years_at_employer years_of_experience sex
1148 Cluj-Napoca, Romania Software Engineer 21500 2 2 M
577 Bangalore, India Developer 21496 0.5 3 M
2488 Mumbai DevOps Engineer 21027 1 2.5 M
1216 Nottingham UK Jnr Software Developer 20500 1 1 M
841 Paris, France Research Engineer 20400 0 1 M

In [11]:
df = df[df.salary_type == 'USD']
loc = df.location.fillna('')
sv_set = {'bay area', 'san jose', 'san francisco', 'mountain view'}
is_silicon = loc.str.lower().isin(sv_set)
sv_df = df[is_silicon]
other_df = df[~is_silicon]  

print("silicon valley:", len(sv_df))
print("other:", len(other_df))


silicon valley: 246
other: 2321

Basic Salary Info


In [12]:
def show_salary_info():    
    dlist = []
    def add(df, name):
        mean = df.salary.mean()
        median = df.salary.median()
        quants = list(df.salary.quantile([0.01, 0.1, 0.25, 0.75, 0.9, 0.99]))
        d = {
            'name' : name,
            'mean' : mean,
            'median': median,
            '1%' : quants[0],
            '10%' : quants[1],
            '25%' : quants[2],
            '75%' : quants[3],
            '90%' : quants[4],
            '99%' : quants[5],
        }
        dlist.append(d)
        
    sv_male_df = sv_df[sv_df.sex == 'M']
    sv_female_df = sv_df[sv_df.sex == 'F']
    other_male_df = other_df[other_df.sex == 'M']
    other_female_df = other_df[other_df.sex == 'F']
        
    add(sv_df, 'Silicon Valley')
    add(other_df, 'Non-Silicon Valley')
    add(sv_male_df, 'SV_Males')
    add(other_male_df, 'Non_SV_Males')
    add(sv_female_df, 'SV_Females')    
    add(other_female_df, 'Non_SV_Females')
    df = pd.DataFrame(dlist)
    df = df['name mean median 1% 10% 25% 75% 90% 99%'.split()]
    for c in ['mean', 'median'] + [c for c in df.columns if '%' in c]:
        df[c] = df[c].astype(int)        
    s = df.style
    s.background_gradient(cmap='coolwarm')    
        
    return s

# print("SV Salary Quantiles:")
# print(sv_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

# print("Non_SV Salary Quantiles:")
# print(other_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

show_salary_info()


/home/jrenner/miniconda3/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/home/jrenner/miniconda3/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
Out[12]:
name mean median 1% 10% 25% 75% 90% 99%
0 Silicon Valley 135514 130250 61350 99000 110000 155000 175000 259350
1 Non-Silicon Valley 97325 93500 25000 44000 65000 125000 150000 235800
2 SV_Males 136526 132000 65100 100000 113250 155000 175000 246999
3 Non_SV_Males 97725 94000 24437 44000 65000 125000 150000 235929
4 SV_Females 119924 110000 57800 79000 100000 122755 160000 253419
5 Non_SV_Females 90345 90000 30035 54100 60875 110000 130000 160000

Visualizations


In [68]:
F_ALPHA = 0.3
CIRCLE_SIZE=7

def graph_salary_vs_experience():
    p = figure(title="Salary vs Years of Experience", y_axis_label='Salary (1k)', x_axis_label='Years Experience', \
              y_axis_type='log', y_range=[18, 400])                      

    x = sv_df.years_experience
    y = sv_df.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_experience
    y = other_df.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_experience()



In [78]:
def graph_salary_vs_years_at_job():
    p = figure(title="Salary vs Years at Job", y_axis_label='Salary (1k)', x_axis_label='Years at Job', \
               y_axis_type='log', y_range=[20, 400])                      

    x = sv_df.years_job
    y = sv_df.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_job
    y = other_df.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_years_at_job()



In [80]:
def graph_salary_vs_gender(df):
    
    male = df[df.sex == 'M']
    female = df[df.sex == 'F']
    other = df[df.sex == 'other']
    
    p = figure(title="Salary vs Years Experience (by Gender)", y_axis_label='Salary (1k)', x_axis_label='Years Experience', \
              y_axis_type='log', y_range=[20, 400])                      

    x = male.years_experience
    y = male.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Male')
    
    x = female.years_experience
    y = female.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='blue', legend='Female')
    
    x = other.years_experience
    y = other.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Other')
    
    show(p)
    
graph_salary_vs_gender(df)


Let's try to see salary by common geographic locations


In [82]:
def regional_salaries():
    res_list = []
    for name, grp in df.groupby('std_location'):
        if len(grp) < 4 or name == '':
            continue
        res = (name, grp.salary.mean() / 1000)
        res_list.append(res)
    #print(res_list)
    
    p = figure()
    p.plot_width = 1000
    p.plot_height = 800
    
    def sort_func(tup):
        return tup[1]
    
    res_list.sort(key=sort_func)
    
    x = [i for i in range(len(res_list))]
    y = [r[1] for r in res_list]
    
    p.scatter(x, y)
    texts = [r[0] for r in res_list]
    p.text(x, y, texts, angle=-0.8)
    
    for name, mean in res_list:
        pass
        
    show(p)
    
regional_salaries()



In [ ]: