Monte-Carlo simulation pay model



In [1]:

    
%matplotlib inline



In [2]:

    
import os
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from sqlalchemy import create_engine



In [3]:

    
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['figure.figsize'] = [8, 6]
%config InlineBackend.figure_format = 'retina'

Database connection



In [4]:

    
db_pass = os.getenv('PASSWORD')



In [5]:

    
# Server
if db_pass:
    engine = create_engine("postgresql://tm470_owner:{}@172.17.0.1/tm470".format(db_pass))
    conn = engine.connect()
else:
    # Local environment
    engine = create_engine("postgresql://Joachim:@localhost/tm470")
    conn = engine.connect()

Load reference tables

Pay periods



In [6]:

    
pay_periods_df = pd.read_sql_table('pay_periods', conn)
pay_periods_df









    Out[6]:







  
    
      
      id
      period_name
      start_date
      end_date
      cutoff_date
    
  
  
    
      0
      1
      2016/17
      2016-04-01
      2017-03-31
      2017-01-01
    
    
      1
      2
      2017/18
      2017-04-01
      2018-03-31
      2018-01-01



In [7]:

    
pay_options_df = pd.read_sql_table('pay_options_v', conn)
pay_options_df









    Out[7]:







  
    
      
      id
      name
      type
      period_name
      cutoff_date
    
  
  
    
      0
      1
      Pay award 16/17 - £1k (consolidated)
      Consolidated
      2016/17
      2017-01-01
    
    
      1
      2
      Pay award 16/17 - Bonuses
      Non-consolidated
      2016/17
      2017-01-01

Pay ranges



In [8]:

    
pay_ranges_df = pd.read_sql_table('pay_ranges_v', conn, index_col='payband')
pay_ranges_df









    Out[8]:







  
    
      
      min
      max
      period_name
    
    
      payband
      
      
      
    
  
  
    
      1
      64000
      117800
      2016/17
    
    
      2
      87000
      162500
      2016/17
    
    
      3
      106000
      208100
      2016/17

Consolidated pay option data



In [9]:

    
options_query = '''
SELECT key,
       payband_id AS payband,
       value
  FROM pay_options_data
 WHERE pay_option_id = 1
'''



In [10]:

    
option_df = pd.read_sql(options_query, conn)



In [11]:

    
option_df = option_df.pivot(index='payband', columns='key', values='value')
option_df









    Out[11]:







  
    
      key
      breakpoint
      pay_award_tier1
      pay_award_tier2
      special_award
      special_cases
    
    
      payband
      
      
      
      
      
    
  
  
    
      1
      80000.0
      1000.0
      250.0
      2000.0
      10.0
    
    
      2
      110000.0
      1000.0
      250.0
      2000.0
      5.0
    
    
      3
      135000.0
      1000.0
      250.0
      NaN
      NaN

Non-consolidated (bonus) option data



In [12]:

    
bonus_query = '''
SELECT key,
       payband_id AS payband,
       value
  FROM pay_options_data
 WHERE pay_option_id = 2
'''



In [13]:

    
bonus_df = pd.read_sql(bonus_query, conn)
bonus_df = bonus_df.pivot(index='payband', columns='key', values='value')
bonus_df

Load model data



In [14]:

    
def load_data_from_file():
    return pd.read_csv('data/model_data.csv', parse_dates=['hire_date'], 
                       thousands=',', index_col='emp_id')

def load_data_from_db():
    return pd.read_sql_table('model_data', conn, index_col='emp_id')



In [15]:

    
model_data = load_data_from_db()
df = model_data.copy()
df['headcount'] = 1
df['fte_salary'] = df['salary'] * df['fte']
df[:5]



In [16]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 166 entries, 1 to 166
Data columns (total 6 columns):
payband       166 non-null int64
salary        166 non-null int64
fte           166 non-null float64
hire_date     166 non-null datetime64[ns]
headcount     166 non-null int64
fte_salary    166 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 9.1 KB



In [17]:

    
conn.close()

Overview analysis



In [18]:

    
payband_stats = df.groupby(['payband'])['salary','fte'].aggregate({'salary': [np.sum, np.min, np.max, np.median],
                                                                   'fte': [np.sum, 'count']})
payband_stats



In [19]:

    
paybill = 13811459 # for testing
print('Est Paybill: £{:,}'.format(paybill))









    



Est Paybill: £13,811,459



In [20]:

    
g = sns.boxplot(x='payband', y='salary', data=df)
#g.set_ylim(55000, 210000)

Random number generation



In [21]:

    
# Generate random number according to the specified probability distribution
def _generate_random(n=1):
    a = [1, 2, 3]
    p = [.25, .65, .10]
    r_n = npr.choice(a, n, p=p)
    return r_n



In [22]:

    
# Assign random group and show distribution
n_recs = len(df)
df['random_group'] = _generate_random(n_recs)

group_totals = (df.groupby('random_group')['headcount'].agg([np.size])).rename(columns={'size': 'total'})
group_totals['percent'] = group_totals.total/group_totals.total.sum()
group_totals









    Out[22]:







  
    
      
      total
      percent
    
    
      random_group
      
      
    
  
  
    
      1
      37
      0.222892
    
    
      2
      112
      0.674699
    
    
      3
      17
      0.102410

Pay uplift

New pay ranges



In [23]:

    
pay_ranges_df # from database









    Out[23]:







  
    
      
      min
      max
      period_name
    
    
      payband
      
      
      
    
  
  
    
      1
      64000
      117800
      2016/17
    
    
      2
      87000
      162500
      2016/17
    
    
      3
      106000
      208100
      2016/17

Uplift to new min

Everyone is eligible



In [24]:

    
def _apply_pay_uplift(row):
    pb = row.payband
    new_min = pay_ranges_df.loc[pb, 'min']
    if new_min > row.salary:
        return new_min - row.salary
    else:
        return 0



In [25]:

    
# Create new salary column 
temp_df = df.copy()
# Apply the pay uplift function column-wise
temp_df['uplift'] = temp_df.apply(_apply_pay_uplift, axis=1)
df = temp_df

Calculate cost of uplift



In [26]:

    
uplift = df.uplift.sum()
'Cost of uplift: £{:,}'.format(uplift)









    Out[26]:





'Cost of uplift: £10,419'



In [27]:

    
'Cost of uplift: {:0.00}%'.format(uplift / paybill * 100)









    Out[27]:





'Cost of uplift: 0.08%'

Analyse model population



In [28]:

    
df.groupby(['payband'])['salary','fte','headcount'].aggregate({
    'salary': ['min', 'max', np.mean, np.median],
    'fte': [np.sum],
    'headcount': ['count']
}).round(1).style.format('{:,}')

Pay options



In [29]:

    
eligibility_date = pay_periods_df.get_value(0, 'cutoff_date')



In [30]:

    
option_df # from pay_options_data table









    Out[30]:







  
    
      key
      breakpoint
      pay_award_tier1
      pay_award_tier2
      special_award
      special_cases
    
    
      payband
      
      
      
      
      
    
  
  
    
      1
      80000.0
      1000.0
      250.0
      2000.0
      10.0
    
    
      2
      110000.0
      1000.0
      250.0
      2000.0
      5.0
    
    
      3
      135000.0
      1000.0
      250.0
      NaN
      NaN



In [31]:

    
def get_special_cases(df, option_df):
    '''
    Returns lists of lists of employee numbers of records randomly selected as
    special cases, based on the parameters specified in the pay model options.
    '''
    pb1_bpoint = option_df.loc[1, 'breakpoint']
    pb2_bpoint = option_df.loc[2, 'breakpoint']
    pb1_cases = option_df.loc[1, 'special_cases']
    pb2_cases = option_df.loc[2, 'special_cases']
    pb1_special = df[(df['payband'] == 1) & 
                     (df['salary'] < pb1_bpoint) &
                     (df['random_group'] < 3)].reset_index().sample(int(pb1_cases))['emp_id']
    pb2_special = df[(df['payband'] == 2) & 
                     (df['salary'] < pb2_bpoint) &
                     (df['random_group'] < 3)].reset_index().sample(int(pb2_cases))['emp_id']
    return pb1_special.tolist() + pb2_special.tolist()



In [32]:

    
special_cases = get_special_cases(df, option_df)
print(sorted(special_cases))
print('Total: {}'.format(len(special_cases)))









    



[6, 12, 15, 26, 33, 46, 71, 78, 80, 84, 131, 133, 134, 138, 146]
Total: 15



In [33]:

    
def _pay_award(row):
    pb = row.payband
    pay_award = 0
    # Apply the pay award for eligible employees
    if ((row.hire_date < pd.to_datetime(eligibility_date)) & (row.random_group < 3)):
        if row.salary < option_df.loc[pb, 'breakpoint']:
            pay_award = option_df.loc[pb, 'pay_award_tier1']
        else:
            pay_award = option_df.loc[pb, 'pay_award_tier2']
    
    # If the employee is a special case, apply additional award
    if row.emp_id in special_cases:
        if row.payband == 1:
            pay_award = option_df.loc[1, 'special_award']
        if row.payband == 2:
            pay_award = option_df.loc[2, 'special_award']
    
    return pay_award



In [34]:

    
award_df = df.reset_index()
award_df['pay_award'] = award_df.apply(_pay_award, axis=1)



In [35]:

    
# Fill N/A's with 0 to enable totals
# award_df = award_df.fillna(value=0)
award_df['new_salary'] = award_df['salary'] + award_df['pay_award'] + award_df['uplift']
award_df['pro_rata_cost'] = (award_df['pay_award'] + award_df['uplift']) * award_df['fte']



In [36]:

    
'Cost of award: £{:,}'.format(award_df['pro_rata_cost'].sum())









    Out[36]:





'Cost of award: £134,064.0'



In [37]:

    
'Cost of award: {:}%'.format(award_df['pro_rata_cost'].sum() / paybill * 100)









    Out[37]:





'Cost of award: 0.9706722512082178%'



In [38]:

    
f, (ax1, ax2) = plt.subplots(1, 2, sharex='all', sharey='all', figsize=(12,5))
sns.boxplot(x='payband', y='salary', data=df, ax=ax1)
ax1.set_title('Before')
sns.boxplot(x='payband', y='new_salary', data=award_df, ax=ax2)
ax2.set_title('After')
ax2.set_ylabel('')
plt.show()

Non-consolidated pay award



In [39]:

    
def _bonus_award(row):
    pb = row.payband
    # Apply the pay award for eligible employees
    if ((row.hire_date < pd.to_datetime(eligibility_date)) & (row.random_group == 1)):
        return bonus_df.loc[pb, 'bonus']
    else:
        return 0



In [40]:

    
award_df['bonus'] = award_df.apply(_bonus_award, axis=1)
award_df['pro_rata_bonus'] = award_df['bonus'] * award_df['fte']



In [41]:

    
# Show a sample of current data
award_df[award_df['bonus']>0].sample(5)









    Out[41]:







  
    
      
      emp_id
      payband
      salary
      fte
      hire_date
      headcount
      fte_salary
      random_group
      uplift
      pay_award
      new_salary
      pro_rata_cost
      bonus
      pro_rata_bonus
    
  
  
    
      147
      148
      2
      116150
      1.0
      2011-05-12
      1
      116150.0
      1
      0
      250.0
      116400.0
      250.0
      10750.0
      10750.0
    
    
      120
      121
      1
      87776
      1.0
      1992-01-04
      1
      87776.0
      1
      0
      250.0
      88026.0
      250.0
      10750.0
      10750.0
    
    
      98
      99
      1
      80475
      1.0
      2007-01-06
      1
      80475.0
      1
      0
      250.0
      80725.0
      250.0
      10750.0
      10750.0
    
    
      78
      79
      1
      76247
      1.0
      2013-01-06
      1
      76247.0
      1
      0
      1000.0
      77247.0
      1000.0
      10750.0
      10750.0
    
    
      102
      103
      1
      81132
      0.5
      1992-01-04
      1
      40566.0
      1
      0
      250.0
      81382.0
      125.0
      10750.0
      5375.0



In [42]:

    
bonus_total = award_df['pro_rata_bonus'].sum()
'Cost of bonus: £{:,}'.format(bonus_total)









    Out[42]:





'Cost of bonus: £377,755.0'



In [43]:

    
'Cost of bonus: {:}%'.format(award_df['pro_rata_bonus'].sum() / paybill * 100)









    Out[43]:





'Cost of bonus: 2.7350839618030216%'

Monte-Carlo Simulation



In [44]:

    
def run_simulation(df, paybill=None, N=100):
    df['headcount'] = 1
    
    # Calculate pro-rata salary
    df['fte_salary'] = df['salary'] * df['fte']
    
    # Check number of records to determine how many random numbers to generate
    n_recs = len(df)
    
    # If paybill amount not supplied, estimate it for calculations
    if paybill:
        paybill = paybill
    else:
        paybill = df['fte_salary'].sum()
    
    results = []
    
    for i in range(N):
        df['random_group'] = _generate_random(n_recs)
        
        model_distribution = df.groupby('random_group')['headcount'].agg([np.size]).rename(columns={'size':'total'})
        model_distribution['percent'] = model_distribution.total / model_distribution.total.sum()
        
        #############################
        # Apply uplift to new mins
        #############################
        temp_df = df.copy()
        temp_df['uplift'] = temp_df.apply(_apply_pay_uplift, axis=1)
        df = temp_df
        
        ############################
        # Apply pay awards
        ############################
        award_df = df.reset_index()
        df['pay_award'] = award_df.apply(_pay_award, axis=1)
        
        ############################
        # Apply bonus award
        ############################
        df['bonus'] = df.apply(_bonus_award, axis=1)
        df['pro_rata_bonus'] = df['bonus'] * df['fte']
        
        ############################
        # Analyse results 
        ############################
        # Fill N/A's with 0 to enable totals
        df = df.fillna(value=0)
        df['pro_rata_cost'] = (df['pay_award'] + df['uplift']) * df['fte']
        uplift_costs = (df['uplift'] * df['fte']).sum()
        costs = df['pro_rata_cost'].sum()
        bonus_total = df['pro_rata_bonus'].sum()
        
        results.append({
            'group1': model_distribution.loc[1, 'percent'],
            'group2': model_distribution.loc[2, 'percent'],
            'group3': model_distribution.loc[3, 'percent'],
            'pay_costs': costs,
            'pay_percent': (costs / paybill) * 100,
            'uplift_costs': uplift_costs,
            'uplift_percent': (uplift_costs / paybill) * 100,
            'bonus_costs': bonus_total,
            'bonus_percent': (bonus_total / paybill) * 100
        })
        
    return results



In [45]:

    
%%time
# source_df = load_data(n_recs=110)
source_df = model_data
runs = 500
paybill = 13811459
output = run_simulation(source_df, paybill=paybill, N=runs)









    



CPU times: user 44.2 s, sys: 457 ms, total: 44.7 s
Wall time: 46.5 s



In [46]:

    
results_df = pd.DataFrame(output)
results_df.mean().round(2)









    Out[46]:





bonus_costs       427019.46
bonus_percent          3.09
group1                 0.25
group2                 0.65
group3                 0.10
pay_costs         135573.54
pay_percent            0.98
uplift_costs       10419.00
uplift_percent         0.08
dtype: float64



In [47]:

    
results_df.describe().round(2)









    Out[47]:







  
    
      
      bonus_costs
      bonus_percent
      group1
      group2
      group3
      pay_costs
      pay_percent
      uplift_costs
      uplift_percent
    
  
  
    
      count
      500.00
      500.00
      500.00
      500.00
      500.00
      500.00
      500.00
      500.0
      500.00
    
    
      mean
      427019.46
      3.09
      0.25
      0.65
      0.10
      135573.54
      0.98
      10419.0
      0.08
    
    
      std
      58178.25
      0.42
      0.03
      0.04
      0.02
      2952.64
      0.02
      0.0
      0.00
    
    
      min
      230695.00
      1.67
      0.14
      0.53
      0.04
      125326.50
      0.91
      10419.0
      0.08
    
    
      25%
      386623.75
      2.80
      0.23
      0.63
      0.08
      133585.25
      0.97
      10419.0
      0.08
    
    
      50%
      425323.75
      3.08
      0.25
      0.65
      0.10
      135669.00
      0.98
      10419.0
      0.08
    
    
      75%
      464158.12
      3.36
      0.27
      0.67
      0.11
      137798.38
      1.00
      10419.0
      0.08
    
    
      max
      595980.00
      4.32
      0.36
      0.74
      0.17
      143254.00
      1.04
      10419.0
      0.08

Saving results in the database

Open database connection



In [48]:

    
if db_pass:
    # Server
    engine = create_engine("postgresql://tm470_owner:{}@172.17.0.1/tm470".format(db_pass))
    conn = engine.connect()
else:
    # Local environment
    engine = create_engine("postgresql://Joachim:@localhost/tm470")
    conn = engine.connect()

Store consolidated option results



In [49]:

    
# Results of the consolidated option
pay_option_id = 1
period_id = 1
model_params = option_df.to_json()
pay_costs = float(results_df[['pay_costs']].mean().round(2))
pay_percent = float(results_df[['pay_percent']].mean().round(2))
uplift_costs = float(results_df[['uplift_costs']].mean().round(2))
uplift_percent = float(results_df[['uplift_percent']].mean().round(2))



In [50]:

    
sql = '''
INSERT INTO model_results("option_id", "period_id", "model_params", "consolidated_costs", "consolidated_percent",
                          "paybill", "runs", "uplift_costs", "uplift_percent") 
VALUES({option_id}, {period_id}, '{model_params}', {consolidated_costs}, {consolidated_percent}, {paybill}, {runs},
       {uplift_costs}, {uplift_percent});
'''.format(option_id=pay_option_id, period_id=period_id, model_params=model_params, 
           consolidated_costs=pay_costs, consolidated_percent=pay_percent, paybill=paybill, runs=runs,
           uplift_costs=uplift_costs, uplift_percent=uplift_percent)



In [51]:

    
res = conn.execute(sql)

Store bonus option results



In [52]:

    
# Results of the bonus option
pay_option_id = 2
period_id = 1
model_params = bonus_df.to_json()
costs = float(results_df[['bonus_costs']].mean().round(2))
percent = float(results_df[['bonus_percent']].mean().round(2))



In [53]:

    
sql = '''
INSERT INTO model_results("option_id", "period_id", "model_params", "bonus_costs", "bonus_percent",
                          "paybill", "runs") 
VALUES({option_id}, {period_id}, '{model_params}', {bonus_costs}, {bonus_percent}, {paybill}, {runs});
'''.format(option_id=pay_option_id, period_id=period_id, model_params=model_params, 
           bonus_costs=costs, bonus_percent=percent, paybill=paybill, runs=runs)



In [54]:

    
res = conn.execute(sql)



In [55]:

    
conn.close()

Sample data for further analysis

Generate random groups corresponding exactly with the guided distribution, i.e. 25%, 65% and 10%



In [56]:

    
def generate_best_distribution():
    def generate_random():
        df = model_data
        #df = source_df.drop('random_group', 1)
        n_recs = len(df)
        df['random_group'] = _generate_random(n_recs)

        group_totals = (df.groupby('random_group')['headcount'].agg([np.size])).rename(columns={'size': 'total'})
        group_totals['percent'] = group_totals.total/group_totals.total.sum()
        return group_totals, df
    
    group_totals, df = generate_random()
    
    while not ((group_totals.round(2).loc[1, 'percent'] > 0.245 and 
        group_totals.round(2).loc[1, 'percent'] <= 0.255) and
        (group_totals.round(2).loc[3, 'percent'] > 0.095 and 
            group_totals.round(2).loc[3, 'percent'] <= 0.105)):
        group_totals, df = generate_random()
    return group_totals, df



In [57]:

    
# Check distribution by random group
group_totals, df = generate_best_distribution()
group_totals









    Out[57]:







  
    
      
      total
      percent
    
    
      random_group
      
      
    
  
  
    
      1
      42
      0.253012
    
    
      2
      107
      0.644578
    
    
      3
      17
      0.102410



In [58]:

    
# Check distribution within paybands
df.pivot_table(index='payband', columns='random_group', values='headcount', aggfunc=np.size, margins=True)









    Out[58]:







  
    
      random_group
      1
      2
      3
      All
    
    
      payband
      
      
      
      
    
  
  
    
      1
      31.0
      89.0
      10.0
      130.0
    
    
      2
      10.0
      14.0
      5.0
      29.0
    
    
      3
      1.0
      4.0
      2.0
      7.0
    
    
      All
      42.0
      107.0
      17.0
      166.0



In [ ]:



In [ ]:

	payband	salary	fte	hire_date	headcount	fte_salary
emp_id
1	1	62000	1.0	1992-01-04	1	62000.0
2	1	63000	1.0	2007-10-29	1	63000.0
3	1	63000	1.0	2015-06-10	1	63000.0
4	1	63000	1.0	2015-10-26	1	63000.0
5	1	63066	1.0	2015-01-09	1	63066.0

	salary				fte
	sum	amin	amax	median	sum	count
payband
1	9738634	62000	98453	73267	124.51	130
2	3133471	86000	155250	105607	27.14	29
3	939355	122000	143350	135833	7.00	7

	salary				fte	headcount
	min	max	mean	median	sum	count
payband
1	62,000	98,453	74,912.6	73,267	124.5	130
2	86,000	155,250	108,050.7	105,607	27.1	29
3	122,000	143,350	134,193.6	135,833	7.0	7

	id	period_name	start_date	end_date	cutoff_date
0	1	2016/17	2016-04-01	2017-03-31	2017-01-01
1	2	2017/18	2017-04-01	2018-03-31	2018-01-01

	id	name	type	period_name	cutoff_date
0	1	Pay award 16/17 - £1k (consolidated)	Consolidated	2016/17	2017-01-01
1	2	Pay award 16/17 - Bonuses	Non-consolidated	2016/17	2017-01-01

	min	max	period_name
payband
1	64000	117800	2016/17
2	87000	162500	2016/17
3	106000	208100	2016/17

key	breakpoint	pay_award_tier1	pay_award_tier2	special_award	special_cases
payband
1	80000.0	1000.0	250.0	2000.0	10.0
2	110000.0	1000.0	250.0	2000.0	5.0
3	135000.0	1000.0	250.0	NaN	NaN

	emp_id	payband	salary	fte	hire_date	headcount	fte_salary	random_group	pay_award	new_salary	pro_rata_cost	bonus	pro_rata_bonus
147	148	2	116150	1.0	2011-05-12	1	116150.0	1	250.0	116400.0	250.0	10750.0	10750.0
120	121	1	87776	1.0	1992-01-04	1	87776.0	1	250.0	88026.0	250.0	10750.0	10750.0
98	99	1	80475	1.0	2007-01-06	1	80475.0	1	250.0	80725.0	250.0	10750.0	10750.0
78	79	1	76247	1.0	2013-01-06	1	76247.0	1	1000.0	77247.0	1000.0	10750.0	10750.0
102	103	1	81132	0.5	1992-01-04	1	40566.0	1	250.0	81382.0	125.0	10750.0	5375.0

	bonus_costs	bonus_percent	group1	group2	group3	pay_costs	pay_percent	uplift_costs	uplift_percent
count	500.00	500.00	500.00	500.00	500.00	500.00	500.00	500.0	500.00
mean	427019.46	3.09	0.25	0.65	0.10	135573.54	0.98	10419.0	0.08
std	58178.25	0.42	0.03	0.04	0.02	2952.64	0.02	0.0	0.00
min	230695.00	1.67	0.14	0.53	0.04	125326.50	0.91	10419.0	0.08
25%	386623.75	2.80	0.23	0.63	0.08	133585.25	0.97	10419.0	0.08
50%	425323.75	3.08	0.25	0.65	0.10	135669.00	0.98	10419.0	0.08
75%	464158.12	3.36	0.27	0.67	0.11	137798.38	1.00	10419.0	0.08
max	595980.00	4.32	0.36	0.74	0.17	143254.00	1.04	10419.0	0.08

random_group	1	2	3	All
payband
1	31.0	89.0	10.0	130.0
2	10.0	14.0	5.0	29.0
3	1.0	4.0	2.0	7.0
All	42.0	107.0	17.0	166.0