notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import re
from bokeh.plotting import *
output_notebook()









    





    
        
        Loading BokehJS ...



In [2]:

    
def plen(x): print(len(x))

def format_column(c):
    return c.replace(' ', '_').lower()

df = pd.read_csv('salaries.csv')
df.columns = [format_column(c) for c in df.columns]
df.head()









    Out[2]:






  
    
      
      timestamp
      employer
      location
      job_title
      years_at_employer
      years_of_experience
      annual_base_pay
      signing_bonus
      annual_bonus
      annual_stock_value/bonus
      gender
      additional_comments
    
  
  
    
      0
      3/21/2016 12:54:49
      NaN
      Raleigh, NC
      Software Developer
      NaN
      18
      122000
      NaN
      NaN
      NaN
      Male
      NaN
    
    
      1
      3/21/2016 12:58:52
      Opower
      San Francisco, CA
      Systems Engineer
      2
      13
      125000
      5000
      0
      5000 shares
      Male
      Don't work here.
    
    
      2
      3/21/2016 12:58:57
      NaN
      San Diego CA
      Systems engineer
      1
      1 of employment
      83,000
      0
      $50.00
      0
      Male
      NaN
    
    
      3
      3/21/2016 12:58:58
      Walmart
      Bentonville, AR
      Senior Developer
      8
      15
      65,000
      NaN
      5,000
      3,000
      Male
      NaN
    
    
      4
      3/21/2016 12:59:11
      Vertical Knowledge
      Cleveland, OH
      Software Engineer
      1
      4
      86000
      5000
      6000
      0
      Male
      NaN



In [3]:

    
def create_standard_location(df):
    
    def standardize(loc):
        if pd.isnull(loc):
            return ''
        return loc.split(',')[0].lower().strip()
    
    df['std_location'] = df.location.map(standardize)
    
    return df

df = create_standard_location(df)



In [4]:

    
def create_clean_salaries(df):

    def clean_individual_salary(x):    
        """
        Clean the salary data
        1. reject some malformed/difficult to parse data
        2. try to detect currency and return the type
        """    
        final_salary = np.nan
        salary_type = np.nan
        if pd.notnull(x):        
            salary_type = 'USD' # default
            x = x.lower()        
            x = re.sub("[\s,]", "", x)    
            x = re.sub("\.[0-9]*", "", x)
            if '€' in x or 'eur' in x:
                salary_type = 'Euro'
                x = re.sub(r'[€(eur)(euro)]', '', x)
            if '$' in x or 'usd' in x:            
                x = re.sub(r'[\$(usd)]', '', x)
            if '£' in x or 'gbp' in x:
                salary_type = 'GBP'
                x = x.replace('[£(gbp)]', '')                                
            try:
                final_salary = int(x)        

            except:
                #print("convert to numeric failure: '{}'".format(x))
                final_salary = np.nan
                salary_type = np.nan
        return (final_salary, salary_type)

    clean_salary_ufunc = np.frompyfunc(clean_individual_salary, 1, 2)
    final_salary, salary_type = clean_salary_ufunc(df.annual_base_pay)
    df['salary'] = final_salary
    df['salary_type'] = salary_type
    return df


def drop_unreasonable_salaries(df):    
    reasonable = df['salary'].map(lambda x: x > 20000 and x < 360000)
    print("resonable salaries:")
    print(reasonable.value_counts())    
    df = df[reasonable]
    return df

df = create_clean_salaries(df)
plen(df)
df = drop_unreasonable_salaries(df)
plen(df)









    



3583
resonable salaries:
True     3070
False     513
Name: salary, dtype: int64
3070



In [5]:

    
df.loc









    Out[5]:





<pandas.core.indexing._LocIndexer at 0x7f4b60ef6828>



In [6]:

    
def create_sex(df):
    
    def convert_gender(x):
        if pd.isnull(x):
            return x
        elif 'female' in x.lower():
            return 'F'
        elif 'male' in x.lower():
            return 'M'
        else:
            return 'other'
        
    df['sex'] = df.gender.map(convert_gender)
    return df

df = create_sex(df)
df = df.dropna(subset='sex'.split())
df.sex.value_counts()









    Out[6]:





M        2806
F         134
other      14
Name: sex, dtype: int64



In [7]:

    
cols = 'location job_title salary years_at_employer years_of_experience sex'.split()
df = df.sort_values('salary', ascending=False)



In [8]:

    
def drop_unreasonable_years(df):
    plen(df)
    df = df.dropna(subset='salary years_job years_experience'.split())
    df = df[df.years_experience > 0]
    df = df[df.years_experience < 50]
    df = df[df.years_experience < 50]
    df = df[df.years_job >= 0]
    df = df[df.years_job <= 50]
    plen(df)
    return df

df['years_job'] = pd.to_numeric(df.years_at_employer, errors='coerce')
df['years_experience'] = pd.to_numeric(df.years_of_experience, errors='coerce')
df = drop_unreasonable_years(df)



In [9]:

    
df[cols].head(5)









    Out[9]:






  
    
      
      location
      job_title
      salary
      years_at_employer
      years_of_experience
      sex
    
  
  
    
      677
      san francisco
      CTO Google rail product
      356000
      5
      12
      M
    
    
      488
      Los Gatos
      Senior Software Engineer
      350000
      1
      10
      M
    
    
      1524
      NaN
      General Manager
      350000
      0.5
      20
      M
    
    
      3099
      Menlo Park
      Strategic Value Executive
      350000
      5
      10
      M
    
    
      2494
      Los Gatos, CA
      Senior Software Engineer
      330000
      2
      10
      M



In [10]:

    
df[cols].tail(5)









    Out[10]:






  
    
      
      location
      job_title
      salary
      years_at_employer
      years_of_experience
      sex
    
  
  
    
      1148
      Cluj-Napoca, Romania
      Software Engineer
      21500
      2
      2
      M
    
    
      577
      Bangalore, India
      Developer
      21496
      0.5
      3
      M
    
    
      2488
      Mumbai
      DevOps Engineer
      21027
      1
      2.5
      M
    
    
      1216
      Nottingham UK
      Jnr Software Developer
      20500
      1
      1
      M
    
    
      841
      Paris, France
      Research Engineer
      20400
      0
      1
      M



In [11]:

    
df = df[df.salary_type == 'USD']
loc = df.location.fillna('')
sv_set = {'bay area', 'san jose', 'san francisco', 'mountain view'}
is_silicon = loc.str.lower().isin(sv_set)
sv_df = df[is_silicon]
other_df = df[~is_silicon]  

print("silicon valley:", len(sv_df))
print("other:", len(other_df))









    



silicon valley: 246
other: 2321

Basic Salary Info



In [12]:

    
def show_salary_info():    
    dlist = []
    def add(df, name):
        mean = df.salary.mean()
        median = df.salary.median()
        quants = list(df.salary.quantile([0.01, 0.1, 0.25, 0.75, 0.9, 0.99]))
        d = {
            'name' : name,
            'mean' : mean,
            'median': median,
            '1%' : quants[0],
            '10%' : quants[1],
            '25%' : quants[2],
            '75%' : quants[3],
            '90%' : quants[4],
            '99%' : quants[5],
        }
        dlist.append(d)
        
    sv_male_df = sv_df[sv_df.sex == 'M']
    sv_female_df = sv_df[sv_df.sex == 'F']
    other_male_df = other_df[other_df.sex == 'M']
    other_female_df = other_df[other_df.sex == 'F']
        
    add(sv_df, 'Silicon Valley')
    add(other_df, 'Non-Silicon Valley')
    add(sv_male_df, 'SV_Males')
    add(other_male_df, 'Non_SV_Males')
    add(sv_female_df, 'SV_Females')    
    add(other_female_df, 'Non_SV_Females')
    df = pd.DataFrame(dlist)
    df = df['name mean median 1% 10% 25% 75% 90% 99%'.split()]
    for c in ['mean', 'median'] + [c for c in df.columns if '%' in c]:
        df[c] = df[c].astype(int)        
    s = df.style
    s.background_gradient(cmap='coolwarm')    
        
    return s

# print("SV Salary Quantiles:")
# print(sv_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

# print("Non_SV Salary Quantiles:")
# print(other_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

show_salary_info()









    



/home/jrenner/miniconda3/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/home/jrenner/miniconda3/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')






    Out[12]:





        

        
        

        
            
            
                
                
                
                name
                
                mean
                
                median
                
                1%
                
                10%
                
                25%
                
                75%
                
                90%
                
                99%
                
            
            
        
        
            
            
                
                
                    0
                
                
                    Silicon Valley
                
                
                    135514
                
                
                    130250
                
                
                    61350
                
                
                    99000
                
                
                    110000
                
                
                    155000
                
                
                    175000
                
                
                    259350
                
            
            
            
                
                
                    1
                
                
                    Non-Silicon Valley
                
                
                    97325
                
                
                    93500
                
                
                    25000
                
                
                    44000
                
                
                    65000
                
                
                    125000
                
                
                    150000
                
                
                    235800
                
            
            
            
                
                
                    2
                
                
                    SV_Males
                
                
                    136526
                
                
                    132000
                
                
                    65100
                
                
                    100000
                
                
                    113250
                
                
                    155000
                
                
                    175000
                
                
                    246999
                
            
            
            
                
                
                    3
                
                
                    Non_SV_Males
                
                
                    97725
                
                
                    94000
                
                
                    24437
                
                
                    44000
                
                
                    65000
                
                
                    125000
                
                
                    150000
                
                
                    235929
                
            
            
            
                
                
                    4
                
                
                    SV_Females
                
                
                    119924
                
                
                    110000
                
                
                    57800
                
                
                    79000
                
                
                    100000
                
                
                    122755
                
                
                    160000
                
                
                    253419
                
            
            
            
                
                
                    5
                
                
                    Non_SV_Females
                
                
                    90345
                
                
                    90000
                
                
                    30035
                
                
                    54100
                
                
                    60875
                
                
                    110000
                
                
                    130000
                
                
                    160000

Visualizations



In [68]:

    
F_ALPHA = 0.3
CIRCLE_SIZE=7

def graph_salary_vs_experience():
    p = figure(title="Salary vs Years of Experience", y_axis_label='Salary (1k)', x_axis_label='Years Experience', \
              y_axis_type='log', y_range=[18, 400])                      

    x = sv_df.years_experience
    y = sv_df.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_experience
    y = other_df.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_experience()



In [78]:

    
def graph_salary_vs_years_at_job():
    p = figure(title="Salary vs Years at Job", y_axis_label='Salary (1k)', x_axis_label='Years at Job', \
               y_axis_type='log', y_range=[20, 400])                      

    x = sv_df.years_job
    y = sv_df.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_job
    y = other_df.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_years_at_job()



In [80]:

    
def graph_salary_vs_gender(df):
    
    male = df[df.sex == 'M']
    female = df[df.sex == 'F']
    other = df[df.sex == 'other']
    
    p = figure(title="Salary vs Years Experience (by Gender)", y_axis_label='Salary (1k)', x_axis_label='Years Experience', \
              y_axis_type='log', y_range=[20, 400])                      

    x = male.years_experience
    y = male.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Male')
    
    x = female.years_experience
    y = female.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='blue', legend='Female')
    
    x = other.years_experience
    y = other.salary / 1000
    p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Other')
    
    show(p)
    
graph_salary_vs_gender(df)

Let's try to see salary by common geographic locations



In [82]:

    
def regional_salaries():
    res_list = []
    for name, grp in df.groupby('std_location'):
        if len(grp) < 4 or name == '':
            continue
        res = (name, grp.salary.mean() / 1000)
        res_list.append(res)
    #print(res_list)
    
    p = figure()
    p.plot_width = 1000
    p.plot_height = 800
    
    def sort_func(tup):
        return tup[1]
    
    res_list.sort(key=sort_func)
    
    x = [i for i in range(len(res_list))]
    y = [r[1] for r in res_list]
    
    p.scatter(x, y)
    texts = [r[0] for r in res_list]
    p.text(x, y, texts, angle=-0.8)
    
    for name, mean in res_list:
        pass
        
    show(p)
    
regional_salaries()



In [ ]:

	timestamp	employer	location	job_title	years_at_employer	years_of_experience	annual_base_pay	signing_bonus	annual_bonus	annual_stock_value/bonus	gender	additional_comments
0	3/21/2016 12:54:49	NaN	Raleigh, NC	Software Developer	NaN	18	122000	NaN	NaN	NaN	Male	NaN
1	3/21/2016 12:58:52	Opower	San Francisco, CA	Systems Engineer	2	13	125000	5000	0	5000 shares	Male	Don't work here.
2	3/21/2016 12:58:57	NaN	San Diego CA	Systems engineer	1	1 of employment	83,000	0	$50.00	0	Male	NaN
3	3/21/2016 12:58:58	Walmart	Bentonville, AR	Senior Developer	8	15	65,000	NaN	5,000	3,000	Male	NaN
4	3/21/2016 12:59:11	Vertical Knowledge	Cleveland, OH	Software Engineer	1	4	86000	5000	6000	0	Male	NaN

	location	job_title	salary	years_at_employer	years_of_experience	sex
677	san francisco	CTO Google rail product	356000	5	12	M
488	Los Gatos	Senior Software Engineer	350000	1	10	M
1524	NaN	General Manager	350000	0.5	20	M
3099	Menlo Park	Strategic Value Executive	350000	5	10	M
2494	Los Gatos, CA	Senior Software Engineer	330000	2	10	M

	location	job_title	salary	years_at_employer	years_of_experience	sex
1148	Cluj-Napoca, Romania	Software Engineer	21500	2	2	M
577	Bangalore, India	Developer	21496	0.5	3	M
2488	Mumbai	DevOps Engineer	21027	1	2.5	M
1216	Nottingham UK	Jnr Software Developer	20500	1	1	M
841	Paris, France	Research Engineer	20400	0	1	M

	name	mean	median	1%	10%	25%	75%	90%	99%
0	Silicon Valley	135514	130250	61350	99000	110000	155000	175000	259350
1	Non-Silicon Valley	97325	93500	25000	44000	65000	125000	150000	235800
2	SV_Males	136526	132000	65100	100000	113250	155000	175000	246999
3	Non_SV_Males	97725	94000	24437	44000	65000	125000	150000	235929
4	SV_Females	119924	110000	57800	79000	100000	122755	160000	253419
5	Non_SV_Females	90345	90000	30035	54100	60875	110000	130000	160000