Monte-Carlo simulation pay model


In [1]:
%matplotlib inline

In [2]:
import os
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from sqlalchemy import create_engine

In [3]:
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['figure.figsize'] = [8, 6]
%config InlineBackend.figure_format = 'retina'

Database connection


In [4]:
db_pass = os.getenv('PASSWORD')

In [5]:
# Server
if db_pass:
    engine = create_engine("postgresql://tm470_owner:{}@172.17.0.1/tm470".format(db_pass))
    conn = engine.connect()
else:
    # Local environment
    engine = create_engine("postgresql://Joachim:@localhost/tm470")
    conn = engine.connect()

Load reference tables

Pay periods


In [6]:
pay_periods_df = pd.read_sql_table('pay_periods', conn)
pay_periods_df


Out[6]:
id period_name start_date end_date cutoff_date
0 1 2016/17 2016-04-01 2017-03-31 2017-01-01
1 2 2017/18 2017-04-01 2018-03-31 2018-01-01

In [7]:
pay_options_df = pd.read_sql_table('pay_options_v', conn)
pay_options_df


Out[7]:
id name type period_name cutoff_date
0 1 Pay award 16/17 - £1k (consolidated) Consolidated 2016/17 2017-01-01
1 2 Pay award 16/17 - Bonuses Non-consolidated 2016/17 2017-01-01

Pay ranges


In [8]:
pay_ranges_df = pd.read_sql_table('pay_ranges_v', conn, index_col='payband')
pay_ranges_df


Out[8]:
min max period_name
payband
1 64000 117800 2016/17
2 87000 162500 2016/17
3 106000 208100 2016/17

Consolidated pay option data


In [9]:
options_query = '''
SELECT key,
       payband_id AS payband,
       value
  FROM pay_options_data
 WHERE pay_option_id = 1
'''

In [10]:
option_df = pd.read_sql(options_query, conn)

In [11]:
option_df = option_df.pivot(index='payband', columns='key', values='value')
option_df


Out[11]:
key breakpoint pay_award_tier1 pay_award_tier2 special_award special_cases
payband
1 80000.0 1000.0 250.0 2000.0 10.0
2 110000.0 1000.0 250.0 2000.0 5.0
3 135000.0 1000.0 250.0 NaN NaN

Non-consolidated (bonus) option data


In [12]:
bonus_query = '''
SELECT key,
       payband_id AS payband,
       value
  FROM pay_options_data
 WHERE pay_option_id = 2
'''

In [13]:
bonus_df = pd.read_sql(bonus_query, conn)
bonus_df = bonus_df.pivot(index='payband', columns='key', values='value')
bonus_df


Out[13]:
key bonus
payband
1 10750.0
2 10750.0
3 10750.0

Load model data


In [14]:
def load_data_from_file():
    return pd.read_csv('data/model_data.csv', parse_dates=['hire_date'], 
                       thousands=',', index_col='emp_id')

def load_data_from_db():
    return pd.read_sql_table('model_data', conn, index_col='emp_id')

In [15]:
model_data = load_data_from_db()
df = model_data.copy()
df['headcount'] = 1
df['fte_salary'] = df['salary'] * df['fte']
df[:5]


Out[15]:
payband salary fte hire_date headcount fte_salary
emp_id
1 1 62000 1.0 1992-01-04 1 62000.0
2 1 63000 1.0 2007-10-29 1 63000.0
3 1 63000 1.0 2015-06-10 1 63000.0
4 1 63000 1.0 2015-10-26 1 63000.0
5 1 63066 1.0 2015-01-09 1 63066.0

In [16]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 166 entries, 1 to 166
Data columns (total 6 columns):
payband       166 non-null int64
salary        166 non-null int64
fte           166 non-null float64
hire_date     166 non-null datetime64[ns]
headcount     166 non-null int64
fte_salary    166 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 9.1 KB

In [17]:
conn.close()

Overview analysis


In [18]:
payband_stats = df.groupby(['payband'])['salary','fte'].aggregate({'salary': [np.sum, np.min, np.max, np.median],
                                                                   'fte': [np.sum, 'count']})
payband_stats


Out[18]:
salary fte
sum amin amax median sum count
payband
1 9738634 62000 98453 73267 124.51 130
2 3133471 86000 155250 105607 27.14 29
3 939355 122000 143350 135833 7.00 7

In [19]:
paybill = 13811459 # for testing
print('Est Paybill: £{:,}'.format(paybill))


Est Paybill: £13,811,459

In [20]:
g = sns.boxplot(x='payband', y='salary', data=df)
#g.set_ylim(55000, 210000)


Random number generation


In [21]:
# Generate random number according to the specified probability distribution
def _generate_random(n=1):
    a = [1, 2, 3]
    p = [.25, .65, .10]
    r_n = npr.choice(a, n, p=p)
    return r_n

In [22]:
# Assign random group and show distribution
n_recs = len(df)
df['random_group'] = _generate_random(n_recs)

group_totals = (df.groupby('random_group')['headcount'].agg([np.size])).rename(columns={'size': 'total'})
group_totals['percent'] = group_totals.total/group_totals.total.sum()
group_totals


Out[22]:
total percent
random_group
1 37 0.222892
2 112 0.674699
3 17 0.102410

Pay uplift

New pay ranges


In [23]:
pay_ranges_df # from database


Out[23]:
min max period_name
payband
1 64000 117800 2016/17
2 87000 162500 2016/17
3 106000 208100 2016/17

Uplift to new min

Everyone is eligible


In [24]:
def _apply_pay_uplift(row):
    pb = row.payband
    new_min = pay_ranges_df.loc[pb, 'min']
    if new_min > row.salary:
        return new_min - row.salary
    else:
        return 0

In [25]:
# Create new salary column 
temp_df = df.copy()
# Apply the pay uplift function column-wise
temp_df['uplift'] = temp_df.apply(_apply_pay_uplift, axis=1)
df = temp_df

Calculate cost of uplift


In [26]:
uplift = df.uplift.sum()
'Cost of uplift: £{:,}'.format(uplift)


Out[26]:
'Cost of uplift: £10,419'

In [27]:
'Cost of uplift: {:0.00}%'.format(uplift / paybill * 100)


Out[27]:
'Cost of uplift: 0.08%'

Analyse model population


In [28]:
df.groupby(['payband'])['salary','fte','headcount'].aggregate({
    'salary': ['min', 'max', np.mean, np.median],
    'fte': [np.sum],
    'headcount': ['count']
}).round(1).style.format('{:,}')


Out[28]:
salary fte headcount
min max mean median sum count
payband
1 62,000 98,453 74,912.6 73,267 124.5 130
2 86,000 155,250 108,050.7 105,607 27.1 29
3 122,000 143,350 134,193.6 135,833 7.0 7

Pay options


In [29]:
eligibility_date = pay_periods_df.get_value(0, 'cutoff_date')

In [30]:
option_df # from pay_options_data table


Out[30]:
key breakpoint pay_award_tier1 pay_award_tier2 special_award special_cases
payband
1 80000.0 1000.0 250.0 2000.0 10.0
2 110000.0 1000.0 250.0 2000.0 5.0
3 135000.0 1000.0 250.0 NaN NaN

In [31]:
def get_special_cases(df, option_df):
    '''
    Returns lists of lists of employee numbers of records randomly selected as
    special cases, based on the parameters specified in the pay model options.
    '''
    pb1_bpoint = option_df.loc[1, 'breakpoint']
    pb2_bpoint = option_df.loc[2, 'breakpoint']
    pb1_cases = option_df.loc[1, 'special_cases']
    pb2_cases = option_df.loc[2, 'special_cases']
    pb1_special = df[(df['payband'] == 1) & 
                     (df['salary'] < pb1_bpoint) &
                     (df['random_group'] < 3)].reset_index().sample(int(pb1_cases))['emp_id']
    pb2_special = df[(df['payband'] == 2) & 
                     (df['salary'] < pb2_bpoint) &
                     (df['random_group'] < 3)].reset_index().sample(int(pb2_cases))['emp_id']
    return pb1_special.tolist() + pb2_special.tolist()

In [32]:
special_cases = get_special_cases(df, option_df)
print(sorted(special_cases))
print('Total: {}'.format(len(special_cases)))


[6, 12, 15, 26, 33, 46, 71, 78, 80, 84, 131, 133, 134, 138, 146]
Total: 15

In [33]:
def _pay_award(row):
    pb = row.payband
    pay_award = 0
    # Apply the pay award for eligible employees
    if ((row.hire_date < pd.to_datetime(eligibility_date)) & (row.random_group < 3)):
        if row.salary < option_df.loc[pb, 'breakpoint']:
            pay_award = option_df.loc[pb, 'pay_award_tier1']
        else:
            pay_award = option_df.loc[pb, 'pay_award_tier2']
    
    # If the employee is a special case, apply additional award
    if row.emp_id in special_cases:
        if row.payband == 1:
            pay_award = option_df.loc[1, 'special_award']
        if row.payband == 2:
            pay_award = option_df.loc[2, 'special_award']
    
    return pay_award

In [34]:
award_df = df.reset_index()
award_df['pay_award'] = award_df.apply(_pay_award, axis=1)

In [35]:
# Fill N/A's with 0 to enable totals
# award_df = award_df.fillna(value=0)
award_df['new_salary'] = award_df['salary'] + award_df['pay_award'] + award_df['uplift']
award_df['pro_rata_cost'] = (award_df['pay_award'] + award_df['uplift']) * award_df['fte']

In [36]:
'Cost of award: £{:,}'.format(award_df['pro_rata_cost'].sum())


Out[36]:
'Cost of award: £134,064.0'

In [37]:
'Cost of award: {:}%'.format(award_df['pro_rata_cost'].sum() / paybill * 100)


Out[37]:
'Cost of award: 0.9706722512082178%'

In [38]:
f, (ax1, ax2) = plt.subplots(1, 2, sharex='all', sharey='all', figsize=(12,5))
sns.boxplot(x='payband', y='salary', data=df, ax=ax1)
ax1.set_title('Before')
sns.boxplot(x='payband', y='new_salary', data=award_df, ax=ax2)
ax2.set_title('After')
ax2.set_ylabel('')
plt.show()


Non-consolidated pay award


In [39]:
def _bonus_award(row):
    pb = row.payband
    # Apply the pay award for eligible employees
    if ((row.hire_date < pd.to_datetime(eligibility_date)) & (row.random_group == 1)):
        return bonus_df.loc[pb, 'bonus']
    else:
        return 0

In [40]:
award_df['bonus'] = award_df.apply(_bonus_award, axis=1)
award_df['pro_rata_bonus'] = award_df['bonus'] * award_df['fte']

In [41]:
# Show a sample of current data
award_df[award_df['bonus']>0].sample(5)


Out[41]:
emp_id payband salary fte hire_date headcount fte_salary random_group uplift pay_award new_salary pro_rata_cost bonus pro_rata_bonus
147 148 2 116150 1.0 2011-05-12 1 116150.0 1 0 250.0 116400.0 250.0 10750.0 10750.0
120 121 1 87776 1.0 1992-01-04 1 87776.0 1 0 250.0 88026.0 250.0 10750.0 10750.0
98 99 1 80475 1.0 2007-01-06 1 80475.0 1 0 250.0 80725.0 250.0 10750.0 10750.0
78 79 1 76247 1.0 2013-01-06 1 76247.0 1 0 1000.0 77247.0 1000.0 10750.0 10750.0
102 103 1 81132 0.5 1992-01-04 1 40566.0 1 0 250.0 81382.0 125.0 10750.0 5375.0

In [42]:
bonus_total = award_df['pro_rata_bonus'].sum()
'Cost of bonus: £{:,}'.format(bonus_total)


Out[42]:
'Cost of bonus: £377,755.0'

In [43]:
'Cost of bonus: {:}%'.format(award_df['pro_rata_bonus'].sum() / paybill * 100)


Out[43]:
'Cost of bonus: 2.7350839618030216%'

Monte-Carlo Simulation


In [44]:
def run_simulation(df, paybill=None, N=100):
    df['headcount'] = 1
    
    # Calculate pro-rata salary
    df['fte_salary'] = df['salary'] * df['fte']
    
    # Check number of records to determine how many random numbers to generate
    n_recs = len(df)
    
    # If paybill amount not supplied, estimate it for calculations
    if paybill:
        paybill = paybill
    else:
        paybill = df['fte_salary'].sum()
    
    results = []
    
    for i in range(N):
        df['random_group'] = _generate_random(n_recs)
        
        model_distribution = df.groupby('random_group')['headcount'].agg([np.size]).rename(columns={'size':'total'})
        model_distribution['percent'] = model_distribution.total / model_distribution.total.sum()
        
        #############################
        # Apply uplift to new mins
        #############################
        temp_df = df.copy()
        temp_df['uplift'] = temp_df.apply(_apply_pay_uplift, axis=1)
        df = temp_df
        
        ############################
        # Apply pay awards
        ############################
        award_df = df.reset_index()
        df['pay_award'] = award_df.apply(_pay_award, axis=1)
        
        ############################
        # Apply bonus award
        ############################
        df['bonus'] = df.apply(_bonus_award, axis=1)
        df['pro_rata_bonus'] = df['bonus'] * df['fte']
        
        ############################
        # Analyse results 
        ############################
        # Fill N/A's with 0 to enable totals
        df = df.fillna(value=0)
        df['pro_rata_cost'] = (df['pay_award'] + df['uplift']) * df['fte']
        uplift_costs = (df['uplift'] * df['fte']).sum()
        costs = df['pro_rata_cost'].sum()
        bonus_total = df['pro_rata_bonus'].sum()
        
        results.append({
            'group1': model_distribution.loc[1, 'percent'],
            'group2': model_distribution.loc[2, 'percent'],
            'group3': model_distribution.loc[3, 'percent'],
            'pay_costs': costs,
            'pay_percent': (costs / paybill) * 100,
            'uplift_costs': uplift_costs,
            'uplift_percent': (uplift_costs / paybill) * 100,
            'bonus_costs': bonus_total,
            'bonus_percent': (bonus_total / paybill) * 100
        })
        
    return results

In [45]:
%%time
# source_df = load_data(n_recs=110)
source_df = model_data
runs = 500
paybill = 13811459
output = run_simulation(source_df, paybill=paybill, N=runs)


CPU times: user 44.2 s, sys: 457 ms, total: 44.7 s
Wall time: 46.5 s

In [46]:
results_df = pd.DataFrame(output)
results_df.mean().round(2)


Out[46]:
bonus_costs       427019.46
bonus_percent          3.09
group1                 0.25
group2                 0.65
group3                 0.10
pay_costs         135573.54
pay_percent            0.98
uplift_costs       10419.00
uplift_percent         0.08
dtype: float64

In [47]:
results_df.describe().round(2)


Out[47]:
bonus_costs bonus_percent group1 group2 group3 pay_costs pay_percent uplift_costs uplift_percent
count 500.00 500.00 500.00 500.00 500.00 500.00 500.00 500.0 500.00
mean 427019.46 3.09 0.25 0.65 0.10 135573.54 0.98 10419.0 0.08
std 58178.25 0.42 0.03 0.04 0.02 2952.64 0.02 0.0 0.00
min 230695.00 1.67 0.14 0.53 0.04 125326.50 0.91 10419.0 0.08
25% 386623.75 2.80 0.23 0.63 0.08 133585.25 0.97 10419.0 0.08
50% 425323.75 3.08 0.25 0.65 0.10 135669.00 0.98 10419.0 0.08
75% 464158.12 3.36 0.27 0.67 0.11 137798.38 1.00 10419.0 0.08
max 595980.00 4.32 0.36 0.74 0.17 143254.00 1.04 10419.0 0.08

Saving results in the database

Open database connection


In [48]:
if db_pass:
    # Server
    engine = create_engine("postgresql://tm470_owner:{}@172.17.0.1/tm470".format(db_pass))
    conn = engine.connect()
else:
    # Local environment
    engine = create_engine("postgresql://Joachim:@localhost/tm470")
    conn = engine.connect()

Store consolidated option results


In [49]:
# Results of the consolidated option
pay_option_id = 1
period_id = 1
model_params = option_df.to_json()
pay_costs = float(results_df[['pay_costs']].mean().round(2))
pay_percent = float(results_df[['pay_percent']].mean().round(2))
uplift_costs = float(results_df[['uplift_costs']].mean().round(2))
uplift_percent = float(results_df[['uplift_percent']].mean().round(2))

In [50]:
sql = '''
INSERT INTO model_results("option_id", "period_id", "model_params", "consolidated_costs", "consolidated_percent",
                          "paybill", "runs", "uplift_costs", "uplift_percent") 
VALUES({option_id}, {period_id}, '{model_params}', {consolidated_costs}, {consolidated_percent}, {paybill}, {runs},
       {uplift_costs}, {uplift_percent});
'''.format(option_id=pay_option_id, period_id=period_id, model_params=model_params, 
           consolidated_costs=pay_costs, consolidated_percent=pay_percent, paybill=paybill, runs=runs,
           uplift_costs=uplift_costs, uplift_percent=uplift_percent)

In [51]:
res = conn.execute(sql)

Store bonus option results


In [52]:
# Results of the bonus option
pay_option_id = 2
period_id = 1
model_params = bonus_df.to_json()
costs = float(results_df[['bonus_costs']].mean().round(2))
percent = float(results_df[['bonus_percent']].mean().round(2))

In [53]:
sql = '''
INSERT INTO model_results("option_id", "period_id", "model_params", "bonus_costs", "bonus_percent",
                          "paybill", "runs") 
VALUES({option_id}, {period_id}, '{model_params}', {bonus_costs}, {bonus_percent}, {paybill}, {runs});
'''.format(option_id=pay_option_id, period_id=period_id, model_params=model_params, 
           bonus_costs=costs, bonus_percent=percent, paybill=paybill, runs=runs)

In [54]:
res = conn.execute(sql)

In [55]:
conn.close()

Sample data for further analysis

Generate random groups corresponding exactly with the guided distribution, i.e. 25%, 65% and 10%


In [56]:
def generate_best_distribution():
    def generate_random():
        df = model_data
        #df = source_df.drop('random_group', 1)
        n_recs = len(df)
        df['random_group'] = _generate_random(n_recs)

        group_totals = (df.groupby('random_group')['headcount'].agg([np.size])).rename(columns={'size': 'total'})
        group_totals['percent'] = group_totals.total/group_totals.total.sum()
        return group_totals, df
    
    group_totals, df = generate_random()
    
    while not ((group_totals.round(2).loc[1, 'percent'] > 0.245 and 
        group_totals.round(2).loc[1, 'percent'] <= 0.255) and
        (group_totals.round(2).loc[3, 'percent'] > 0.095 and 
            group_totals.round(2).loc[3, 'percent'] <= 0.105)):
        group_totals, df = generate_random()
    return group_totals, df

In [57]:
# Check distribution by random group
group_totals, df = generate_best_distribution()
group_totals


Out[57]:
total percent
random_group
1 42 0.253012
2 107 0.644578
3 17 0.102410

In [58]:
# Check distribution within paybands
df.pivot_table(index='payband', columns='random_group', values='headcount', aggfunc=np.size, margins=True)


Out[58]:
random_group 1 2 3 All
payband
1 31.0 89.0 10.0 130.0
2 10.0 14.0 5.0 29.0
3 1.0 4.0 2.0 7.0
All 42.0 107.0 17.0 166.0

In [ ]:


In [ ]: