In [1]:
%matplotlib inline
In [2]:
import os
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from sqlalchemy import create_engine
In [3]:
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['figure.figsize'] = [8, 6]
%config InlineBackend.figure_format = 'retina'
In [4]:
db_pass = os.getenv('PASSWORD')
In [5]:
# Server
if db_pass:
engine = create_engine("postgresql://tm470_owner:{}@172.17.0.1/tm470".format(db_pass))
conn = engine.connect()
else:
# Local environment
engine = create_engine("postgresql://Joachim:@localhost/tm470")
conn = engine.connect()
In [6]:
pay_periods_df = pd.read_sql_table('pay_periods', conn)
pay_periods_df
Out[6]:
In [7]:
pay_options_df = pd.read_sql_table('pay_options_v', conn)
pay_options_df
Out[7]:
In [8]:
pay_ranges_df = pd.read_sql_table('pay_ranges_v', conn, index_col='payband')
pay_ranges_df
Out[8]:
In [9]:
options_query = '''
SELECT key,
payband_id AS payband,
value
FROM pay_options_data
WHERE pay_option_id = 1
'''
In [10]:
option_df = pd.read_sql(options_query, conn)
In [11]:
option_df = option_df.pivot(index='payband', columns='key', values='value')
option_df
Out[11]:
In [12]:
bonus_query = '''
SELECT key,
payband_id AS payband,
value
FROM pay_options_data
WHERE pay_option_id = 2
'''
In [13]:
bonus_df = pd.read_sql(bonus_query, conn)
bonus_df = bonus_df.pivot(index='payband', columns='key', values='value')
bonus_df
Out[13]:
In [14]:
def load_data_from_file():
return pd.read_csv('data/model_data.csv', parse_dates=['hire_date'],
thousands=',', index_col='emp_id')
def load_data_from_db():
return pd.read_sql_table('model_data', conn, index_col='emp_id')
In [15]:
model_data = load_data_from_db()
df = model_data.copy()
df['headcount'] = 1
df['fte_salary'] = df['salary'] * df['fte']
df[:5]
Out[15]:
In [16]:
df.info()
In [17]:
conn.close()
In [18]:
payband_stats = df.groupby(['payband'])['salary','fte'].aggregate({'salary': [np.sum, np.min, np.max, np.median],
'fte': [np.sum, 'count']})
payband_stats
Out[18]:
In [19]:
paybill = 13811459 # for testing
print('Est Paybill: £{:,}'.format(paybill))
In [20]:
g = sns.boxplot(x='payband', y='salary', data=df)
#g.set_ylim(55000, 210000)
In [21]:
# Generate random number according to the specified probability distribution
def _generate_random(n=1):
a = [1, 2, 3]
p = [.25, .65, .10]
r_n = npr.choice(a, n, p=p)
return r_n
In [22]:
# Assign random group and show distribution
n_recs = len(df)
df['random_group'] = _generate_random(n_recs)
group_totals = (df.groupby('random_group')['headcount'].agg([np.size])).rename(columns={'size': 'total'})
group_totals['percent'] = group_totals.total/group_totals.total.sum()
group_totals
Out[22]:
In [23]:
pay_ranges_df # from database
Out[23]:
In [24]:
def _apply_pay_uplift(row):
pb = row.payband
new_min = pay_ranges_df.loc[pb, 'min']
if new_min > row.salary:
return new_min - row.salary
else:
return 0
In [25]:
# Create new salary column
temp_df = df.copy()
# Apply the pay uplift function column-wise
temp_df['uplift'] = temp_df.apply(_apply_pay_uplift, axis=1)
df = temp_df
In [26]:
uplift = df.uplift.sum()
'Cost of uplift: £{:,}'.format(uplift)
Out[26]:
In [27]:
'Cost of uplift: {:0.00}%'.format(uplift / paybill * 100)
Out[27]:
In [28]:
df.groupby(['payband'])['salary','fte','headcount'].aggregate({
'salary': ['min', 'max', np.mean, np.median],
'fte': [np.sum],
'headcount': ['count']
}).round(1).style.format('{:,}')
Out[28]:
In [29]:
eligibility_date = pay_periods_df.get_value(0, 'cutoff_date')
In [30]:
option_df # from pay_options_data table
Out[30]:
In [31]:
def get_special_cases(df, option_df):
'''
Returns lists of lists of employee numbers of records randomly selected as
special cases, based on the parameters specified in the pay model options.
'''
pb1_bpoint = option_df.loc[1, 'breakpoint']
pb2_bpoint = option_df.loc[2, 'breakpoint']
pb1_cases = option_df.loc[1, 'special_cases']
pb2_cases = option_df.loc[2, 'special_cases']
pb1_special = df[(df['payband'] == 1) &
(df['salary'] < pb1_bpoint) &
(df['random_group'] < 3)].reset_index().sample(int(pb1_cases))['emp_id']
pb2_special = df[(df['payband'] == 2) &
(df['salary'] < pb2_bpoint) &
(df['random_group'] < 3)].reset_index().sample(int(pb2_cases))['emp_id']
return pb1_special.tolist() + pb2_special.tolist()
In [32]:
special_cases = get_special_cases(df, option_df)
print(sorted(special_cases))
print('Total: {}'.format(len(special_cases)))
In [33]:
def _pay_award(row):
pb = row.payband
pay_award = 0
# Apply the pay award for eligible employees
if ((row.hire_date < pd.to_datetime(eligibility_date)) & (row.random_group < 3)):
if row.salary < option_df.loc[pb, 'breakpoint']:
pay_award = option_df.loc[pb, 'pay_award_tier1']
else:
pay_award = option_df.loc[pb, 'pay_award_tier2']
# If the employee is a special case, apply additional award
if row.emp_id in special_cases:
if row.payband == 1:
pay_award = option_df.loc[1, 'special_award']
if row.payband == 2:
pay_award = option_df.loc[2, 'special_award']
return pay_award
In [34]:
award_df = df.reset_index()
award_df['pay_award'] = award_df.apply(_pay_award, axis=1)
In [35]:
# Fill N/A's with 0 to enable totals
# award_df = award_df.fillna(value=0)
award_df['new_salary'] = award_df['salary'] + award_df['pay_award'] + award_df['uplift']
award_df['pro_rata_cost'] = (award_df['pay_award'] + award_df['uplift']) * award_df['fte']
In [36]:
'Cost of award: £{:,}'.format(award_df['pro_rata_cost'].sum())
Out[36]:
In [37]:
'Cost of award: {:}%'.format(award_df['pro_rata_cost'].sum() / paybill * 100)
Out[37]:
In [38]:
f, (ax1, ax2) = plt.subplots(1, 2, sharex='all', sharey='all', figsize=(12,5))
sns.boxplot(x='payband', y='salary', data=df, ax=ax1)
ax1.set_title('Before')
sns.boxplot(x='payband', y='new_salary', data=award_df, ax=ax2)
ax2.set_title('After')
ax2.set_ylabel('')
plt.show()
In [39]:
def _bonus_award(row):
pb = row.payband
# Apply the pay award for eligible employees
if ((row.hire_date < pd.to_datetime(eligibility_date)) & (row.random_group == 1)):
return bonus_df.loc[pb, 'bonus']
else:
return 0
In [40]:
award_df['bonus'] = award_df.apply(_bonus_award, axis=1)
award_df['pro_rata_bonus'] = award_df['bonus'] * award_df['fte']
In [41]:
# Show a sample of current data
award_df[award_df['bonus']>0].sample(5)
Out[41]:
In [42]:
bonus_total = award_df['pro_rata_bonus'].sum()
'Cost of bonus: £{:,}'.format(bonus_total)
Out[42]:
In [43]:
'Cost of bonus: {:}%'.format(award_df['pro_rata_bonus'].sum() / paybill * 100)
Out[43]:
In [44]:
def run_simulation(df, paybill=None, N=100):
df['headcount'] = 1
# Calculate pro-rata salary
df['fte_salary'] = df['salary'] * df['fte']
# Check number of records to determine how many random numbers to generate
n_recs = len(df)
# If paybill amount not supplied, estimate it for calculations
if paybill:
paybill = paybill
else:
paybill = df['fte_salary'].sum()
results = []
for i in range(N):
df['random_group'] = _generate_random(n_recs)
model_distribution = df.groupby('random_group')['headcount'].agg([np.size]).rename(columns={'size':'total'})
model_distribution['percent'] = model_distribution.total / model_distribution.total.sum()
#############################
# Apply uplift to new mins
#############################
temp_df = df.copy()
temp_df['uplift'] = temp_df.apply(_apply_pay_uplift, axis=1)
df = temp_df
############################
# Apply pay awards
############################
award_df = df.reset_index()
df['pay_award'] = award_df.apply(_pay_award, axis=1)
############################
# Apply bonus award
############################
df['bonus'] = df.apply(_bonus_award, axis=1)
df['pro_rata_bonus'] = df['bonus'] * df['fte']
############################
# Analyse results
############################
# Fill N/A's with 0 to enable totals
df = df.fillna(value=0)
df['pro_rata_cost'] = (df['pay_award'] + df['uplift']) * df['fte']
uplift_costs = (df['uplift'] * df['fte']).sum()
costs = df['pro_rata_cost'].sum()
bonus_total = df['pro_rata_bonus'].sum()
results.append({
'group1': model_distribution.loc[1, 'percent'],
'group2': model_distribution.loc[2, 'percent'],
'group3': model_distribution.loc[3, 'percent'],
'pay_costs': costs,
'pay_percent': (costs / paybill) * 100,
'uplift_costs': uplift_costs,
'uplift_percent': (uplift_costs / paybill) * 100,
'bonus_costs': bonus_total,
'bonus_percent': (bonus_total / paybill) * 100
})
return results
In [45]:
%%time
# source_df = load_data(n_recs=110)
source_df = model_data
runs = 500
paybill = 13811459
output = run_simulation(source_df, paybill=paybill, N=runs)
In [46]:
results_df = pd.DataFrame(output)
results_df.mean().round(2)
Out[46]:
In [47]:
results_df.describe().round(2)
Out[47]:
In [48]:
if db_pass:
# Server
engine = create_engine("postgresql://tm470_owner:{}@172.17.0.1/tm470".format(db_pass))
conn = engine.connect()
else:
# Local environment
engine = create_engine("postgresql://Joachim:@localhost/tm470")
conn = engine.connect()
In [49]:
# Results of the consolidated option
pay_option_id = 1
period_id = 1
model_params = option_df.to_json()
pay_costs = float(results_df[['pay_costs']].mean().round(2))
pay_percent = float(results_df[['pay_percent']].mean().round(2))
uplift_costs = float(results_df[['uplift_costs']].mean().round(2))
uplift_percent = float(results_df[['uplift_percent']].mean().round(2))
In [50]:
sql = '''
INSERT INTO model_results("option_id", "period_id", "model_params", "consolidated_costs", "consolidated_percent",
"paybill", "runs", "uplift_costs", "uplift_percent")
VALUES({option_id}, {period_id}, '{model_params}', {consolidated_costs}, {consolidated_percent}, {paybill}, {runs},
{uplift_costs}, {uplift_percent});
'''.format(option_id=pay_option_id, period_id=period_id, model_params=model_params,
consolidated_costs=pay_costs, consolidated_percent=pay_percent, paybill=paybill, runs=runs,
uplift_costs=uplift_costs, uplift_percent=uplift_percent)
In [51]:
res = conn.execute(sql)
In [52]:
# Results of the bonus option
pay_option_id = 2
period_id = 1
model_params = bonus_df.to_json()
costs = float(results_df[['bonus_costs']].mean().round(2))
percent = float(results_df[['bonus_percent']].mean().round(2))
In [53]:
sql = '''
INSERT INTO model_results("option_id", "period_id", "model_params", "bonus_costs", "bonus_percent",
"paybill", "runs")
VALUES({option_id}, {period_id}, '{model_params}', {bonus_costs}, {bonus_percent}, {paybill}, {runs});
'''.format(option_id=pay_option_id, period_id=period_id, model_params=model_params,
bonus_costs=costs, bonus_percent=percent, paybill=paybill, runs=runs)
In [54]:
res = conn.execute(sql)
In [55]:
conn.close()
In [56]:
def generate_best_distribution():
def generate_random():
df = model_data
#df = source_df.drop('random_group', 1)
n_recs = len(df)
df['random_group'] = _generate_random(n_recs)
group_totals = (df.groupby('random_group')['headcount'].agg([np.size])).rename(columns={'size': 'total'})
group_totals['percent'] = group_totals.total/group_totals.total.sum()
return group_totals, df
group_totals, df = generate_random()
while not ((group_totals.round(2).loc[1, 'percent'] > 0.245 and
group_totals.round(2).loc[1, 'percent'] <= 0.255) and
(group_totals.round(2).loc[3, 'percent'] > 0.095 and
group_totals.round(2).loc[3, 'percent'] <= 0.105)):
group_totals, df = generate_random()
return group_totals, df
In [57]:
# Check distribution by random group
group_totals, df = generate_best_distribution()
group_totals
Out[57]:
In [58]:
# Check distribution within paybands
df.pivot_table(index='payband', columns='random_group', values='headcount', aggfunc=np.size, margins=True)
Out[58]:
In [ ]:
In [ ]: