In [1]:
import pandas as pd
import numpy as np
import re
from bokeh.plotting import *
output_notebook()
In [2]:
def plen(x): print(len(x))
def format_column(c):
return c.replace(' ', '_').lower()
df = pd.read_csv('salaries.csv')
df.columns = [format_column(c) for c in df.columns]
df.head()
Out[2]:
In [3]:
def create_standard_location(df):
def standardize(loc):
if pd.isnull(loc):
return ''
return loc.split(',')[0].lower().strip()
df['std_location'] = df.location.map(standardize)
return df
df = create_standard_location(df)
In [4]:
def create_clean_salaries(df):
def clean_individual_salary(x):
"""
Clean the salary data
1. reject some malformed/difficult to parse data
2. try to detect currency and return the type
"""
final_salary = np.nan
salary_type = np.nan
if pd.notnull(x):
salary_type = 'USD' # default
x = x.lower()
x = re.sub("[\s,]", "", x)
x = re.sub("\.[0-9]*", "", x)
if '€' in x or 'eur' in x:
salary_type = 'Euro'
x = re.sub(r'[€(eur)(euro)]', '', x)
if '$' in x or 'usd' in x:
x = re.sub(r'[\$(usd)]', '', x)
if '£' in x or 'gbp' in x:
salary_type = 'GBP'
x = x.replace('[£(gbp)]', '')
try:
final_salary = int(x)
except:
#print("convert to numeric failure: '{}'".format(x))
final_salary = np.nan
salary_type = np.nan
return (final_salary, salary_type)
clean_salary_ufunc = np.frompyfunc(clean_individual_salary, 1, 2)
final_salary, salary_type = clean_salary_ufunc(df.annual_base_pay)
df['salary'] = final_salary
df['salary_type'] = salary_type
return df
def drop_unreasonable_salaries(df):
reasonable = df['salary'].map(lambda x: x > 20000 and x < 360000)
print("resonable salaries:")
print(reasonable.value_counts())
df = df[reasonable]
return df
df = create_clean_salaries(df)
plen(df)
df = drop_unreasonable_salaries(df)
plen(df)
In [5]:
df.loc
Out[5]:
In [6]:
def create_sex(df):
def convert_gender(x):
if pd.isnull(x):
return x
elif 'female' in x.lower():
return 'F'
elif 'male' in x.lower():
return 'M'
else:
return 'other'
df['sex'] = df.gender.map(convert_gender)
return df
df = create_sex(df)
df = df.dropna(subset='sex'.split())
df.sex.value_counts()
Out[6]:
In [7]:
cols = 'location job_title salary years_at_employer years_of_experience sex'.split()
df = df.sort_values('salary', ascending=False)
In [8]:
def drop_unreasonable_years(df):
plen(df)
df = df.dropna(subset='salary years_job years_experience'.split())
df = df[df.years_experience > 0]
df = df[df.years_experience < 50]
df = df[df.years_experience < 50]
df = df[df.years_job >= 0]
df = df[df.years_job <= 50]
plen(df)
return df
df['years_job'] = pd.to_numeric(df.years_at_employer, errors='coerce')
df['years_experience'] = pd.to_numeric(df.years_of_experience, errors='coerce')
df = drop_unreasonable_years(df)
In [9]:
df[cols].head(5)
Out[9]:
In [10]:
df[cols].tail(5)
Out[10]:
In [11]:
df = df[df.salary_type == 'USD']
loc = df.location.fillna('')
sv_set = {'bay area', 'san jose', 'san francisco', 'mountain view'}
is_silicon = loc.str.lower().isin(sv_set)
sv_df = df[is_silicon]
other_df = df[~is_silicon]
print("silicon valley:", len(sv_df))
print("other:", len(other_df))
In [12]:
def show_salary_info():
dlist = []
def add(df, name):
mean = df.salary.mean()
median = df.salary.median()
quants = list(df.salary.quantile([0.01, 0.1, 0.25, 0.75, 0.9, 0.99]))
d = {
'name' : name,
'mean' : mean,
'median': median,
'1%' : quants[0],
'10%' : quants[1],
'25%' : quants[2],
'75%' : quants[3],
'90%' : quants[4],
'99%' : quants[5],
}
dlist.append(d)
sv_male_df = sv_df[sv_df.sex == 'M']
sv_female_df = sv_df[sv_df.sex == 'F']
other_male_df = other_df[other_df.sex == 'M']
other_female_df = other_df[other_df.sex == 'F']
add(sv_df, 'Silicon Valley')
add(other_df, 'Non-Silicon Valley')
add(sv_male_df, 'SV_Males')
add(other_male_df, 'Non_SV_Males')
add(sv_female_df, 'SV_Females')
add(other_female_df, 'Non_SV_Females')
df = pd.DataFrame(dlist)
df = df['name mean median 1% 10% 25% 75% 90% 99%'.split()]
for c in ['mean', 'median'] + [c for c in df.columns if '%' in c]:
df[c] = df[c].astype(int)
s = df.style
s.background_gradient(cmap='coolwarm')
return s
# print("SV Salary Quantiles:")
# print(sv_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
# print("Non_SV Salary Quantiles:")
# print(other_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
show_salary_info()
Out[12]:
In [68]:
F_ALPHA = 0.3
CIRCLE_SIZE=7
def graph_salary_vs_experience():
p = figure(title="Salary vs Years of Experience", y_axis_label='Salary (1k)', x_axis_label='Years Experience', \
y_axis_type='log', y_range=[18, 400])
x = sv_df.years_experience
y = sv_df.salary / 1000
p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
x = other_df.years_experience
y = other_df.salary / 1000
p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
show(p)
graph_salary_vs_experience()
In [78]:
def graph_salary_vs_years_at_job():
p = figure(title="Salary vs Years at Job", y_axis_label='Salary (1k)', x_axis_label='Years at Job', \
y_axis_type='log', y_range=[20, 400])
x = sv_df.years_job
y = sv_df.salary / 1000
p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
x = other_df.years_job
y = other_df.salary / 1000
p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
show(p)
graph_salary_vs_years_at_job()
In [80]:
def graph_salary_vs_gender(df):
male = df[df.sex == 'M']
female = df[df.sex == 'F']
other = df[df.sex == 'other']
p = figure(title="Salary vs Years Experience (by Gender)", y_axis_label='Salary (1k)', x_axis_label='Years Experience', \
y_axis_type='log', y_range=[20, 400])
x = male.years_experience
y = male.salary / 1000
p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Male')
x = female.years_experience
y = female.salary / 1000
p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='blue', legend='Female')
x = other.years_experience
y = other.salary / 1000
p.scatter(x, y, size=CIRCLE_SIZE, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Other')
show(p)
graph_salary_vs_gender(df)
In [82]:
def regional_salaries():
res_list = []
for name, grp in df.groupby('std_location'):
if len(grp) < 4 or name == '':
continue
res = (name, grp.salary.mean() / 1000)
res_list.append(res)
#print(res_list)
p = figure()
p.plot_width = 1000
p.plot_height = 800
def sort_func(tup):
return tup[1]
res_list.sort(key=sort_func)
x = [i for i in range(len(res_list))]
y = [r[1] for r in res_list]
p.scatter(x, y)
texts = [r[0] for r in res_list]
p.text(x, y, texts, angle=-0.8)
for name, mean in res_list:
pass
show(p)
regional_salaries()
In [ ]: