choice_with_sampling_testing


The objective of this notebook is to test the ability of the sampling weighted choice to reproduce probabilities under different conditions.

Vary the inputs for the sample size, the number of agents, the number of alternatives, or the probability function itself to see how this performs.


In [1]:
import numpy as np
import pandas as pd

from smartpy_core.sampling import *
from smartpy_core.choice import *


time: 445 ms

In [2]:
# simulation settings -- tweak these to stress the sampling choice code
num_tests = 1000      # number of Monte Carlo repetitions
sample_size = 50      # alternatives sampled per chooser
num_choosers = 500    # number of choosing agents
num_alts = 100        # size of the alternative set
cap_per_alt = 10      # capacity available at each alternative


time: 3 ms

In [3]:
# create our choosers: one attribute running 0.0, 0.1, 0.2, ...
chooser_vals = np.arange(num_choosers) / 10.0
choosers = pd.DataFrame({'col': chooser_vals})
choosers.head(10)


Out[3]:
col
0 0.0
1 0.1
2 0.2
3 0.3
4 0.4
5 0.5
6 0.6
7 0.7
8 0.8
9 0.9

10 rows × 1 columns

time: 11 ms

In [4]:
# create alternatives: an attribute column, a fixed capacity, and the
# "true" probability implied by an exponentiated-utility weighting
alt_vals = np.arange(num_alts) / 10.0
alts = pd.DataFrame({'alt_col': alt_vals})
alts['cap'] = cap_per_alt
alts['p'] = get_probs(np.exp(alts['alt_col']))

alts.head(10)


Out[4]:
alt_col cap p
0 0.0 10 0.000005
1 0.1 10 0.000005
2 0.2 10 0.000006
3 0.3 10 0.000006
4 0.4 10 0.000007
5 0.5 10 0.000008
6 0.6 10 0.000009
7 0.7 10 0.000010
8 0.8 10 0.000011
9 0.9 10 0.000012

10 rows × 3 columns

time: 16 ms

test 1 - just alternatives

define a simple probability function that just uses alternative values, and see how well the results match the alternatives' probabilities


In [5]:
# create a simple probability function that just uses the alternative values as utils
def simple_probs(interaction_data, num_choosers, sample_size):
    """
    Row-normalized exponentiated utilities from the sampled alternatives'
    'alt_col' values. Returns a (num_choosers, sample_size) array whose
    rows each sum to 1.
    """
    utils = np.exp(interaction_data['alt_col'].values)
    utils = utils.reshape(num_choosers, sample_size)
    row_totals = utils.sum(axis=1, keepdims=True)
    return utils / row_totals


time: 2 ms

In [6]:
# run the tests: repeatedly make choices with and without capacities and
# accumulate how often each alternative is picked, so the empirical choice
# shares can be compared against the alternatives' probabilities
res = alts.copy()
res['choices'] = 0
res['choices_w_cap'] = 0

for i in range(num_tests):
    # 1st run w/out capacities
    choices = choice_with_sampling(
        choosers,
        alts,
        simple_probs,
        sample_size=sample_size
    )

    if i % 100 == 0:
        # parenthesized single-argument print works in both Python 2 and 3
        print('on test: {}'.format(i))

    # tally choices per alternative, filling alternatives never chosen with 0
    choice_sums = choices.groupby('alternative_id').size().reindex(res.index).fillna(0)
    res['choices'] += choice_sums

    # next run with capacities
    choices, capacities = capacity_choice_with_sampling(
        choosers,
        alts,
        'cap',
        simple_probs,
        sample_size=sample_size
    )
    choice_sums = choices.value_counts().reindex(res.index).fillna(0)
    res['choices_w_cap'] += choice_sums
    if (choice_sums > cap_per_alt).any():
        # sanity check -- no alternative should ever exceed its capacity
        print('we blew capacity!')
        print(choice_sums.max())
        print(capacities.min())

# summarize: average choices per test and the implied probabilities
res['avg_choices'] = res['choices'] / num_tests
res['avg_choices_w_cap'] = res['choices_w_cap'] / num_tests
res['choices_p'] = get_probs(res['choices'])
res['choices_w_cap_p'] = get_probs(res['choices_w_cap'])


on test: 0
on test: 100
on test: 200
on test: 300
on test: 400
on test: 500
on test: 600
on test: 700
on test: 800
on test: 900
time: 1min 42s

In [7]:
res.head(10)


Out[7]:
alt_col cap p choices choices_w_cap avg_choices avg_choices_w_cap choices_p choices_w_cap_p
0 0.0 10 0.000005 3 33 0.003 0.033 0.000006 0.000066
1 0.1 10 0.000005 8 38 0.008 0.038 0.000016 0.000076
2 0.2 10 0.000006 3 35 0.003 0.035 0.000006 0.000070
3 0.3 10 0.000006 1 40 0.001 0.040 0.000002 0.000080
4 0.4 10 0.000007 4 50 0.004 0.050 0.000008 0.000100
5 0.5 10 0.000008 2 39 0.002 0.039 0.000004 0.000078
6 0.6 10 0.000009 6 47 0.006 0.047 0.000012 0.000094
7 0.7 10 0.000010 7 51 0.007 0.051 0.000014 0.000102
8 0.8 10 0.000011 3 75 0.003 0.075 0.000006 0.000150
9 0.9 10 0.000012 7 79 0.007 0.079 0.000014 0.000158

10 rows × 9 columns

time: 27 ms

In [8]:
res.tail(10)


Out[8]:
alt_col cap p choices choices_w_cap avg_choices avg_choices_w_cap choices_p choices_w_cap_p
90 9.0 10 0.038692 19672 10000 19.672 10 0.039344 0.02
91 9.1 10 0.042761 21856 10000 21.856 10 0.043712 0.02
92 9.2 10 0.047258 23411 10000 23.411 10 0.046822 0.02
93 9.3 10 0.052229 25844 10000 25.844 10 0.051688 0.02
94 9.4 10 0.057722 28797 10000 28.797 10 0.057594 0.02
95 9.5 10 0.063792 31434 10000 31.434 10 0.062868 0.02
96 9.6 10 0.070501 34604 10000 34.604 10 0.069208 0.02
97 9.7 10 0.077916 37975 10000 37.975 10 0.075950 0.02
98 9.8 10 0.086111 41382 10000 41.382 10 0.082764 0.02
99 9.9 10 0.095167 44929 10000 44.929 10 0.089858 0.02

10 rows × 9 columns

time: 28 ms

test 2

use a social distance-esque approach to test matching agents with alternatives based on the characteristics of both


In [9]:
choosers['c_grp'] = np.random.randint(0, 4, len(choosers))

choosers.head(10)


Out[9]:
col c_grp
0 0.0 0
1 0.1 1
2 0.2 3
3 0.3 2
4 0.4 1
5 0.5 3
6 0.6 3
7 0.7 3
8 0.8 2
9 0.9 1

10 rows × 2 columns

time: 11 ms

In [10]:
alts['a_grp'] = np.random.randint(0, 4, len(alts))

alts.head(10)


Out[10]:
alt_col cap p a_grp
0 0.0 10 0.000005 1
1 0.1 10 0.000005 1
2 0.2 10 0.000006 2
3 0.3 10 0.000006 0
4 0.4 10 0.000007 0
5 0.5 10 0.000008 1
6 0.6 10 0.000009 0
7 0.7 10 0.000010 3
8 0.8 10 0.000011 3
9 0.9 10 0.000012 2

10 rows × 4 columns

time: 14 ms

In [11]:
# a probability function that tries to match agents and choosers in a social distance like way
def match_probs(interaction_data, num_choosers, sample_size):
    """
    Social-distance style probabilities: utility decays with the squared
    group gap between chooser ('c_grp') and alternative ('a_grp'), so
    same-group pairings get the highest weight. Returns a
    (num_choosers, sample_size) array whose rows each sum to 1.
    """
    grp_gap = (interaction_data['c_grp'] - interaction_data['a_grp']).abs()
    social_dist = grp_gap + 1.0
    utils = np.exp(1 / social_dist**2).values.reshape(num_choosers, sample_size)
    row_totals = utils.sum(axis=1, keepdims=True)
    return utils / row_totals


time: 4 ms

In [12]:
# test without capacity
# each chooser samples `sample_size` alternatives and picks one according
# to match_probs; the result has an 'alternative_id' and 'prob' per chooser
choices = choice_with_sampling(
    choosers, 
    alts,
    match_probs,
    sample_size=sample_size
)


time: 77 ms

In [13]:
# attach the chooser's group and (broadcast through the chosen ids)
# the chosen alternative's group, so the pairings can be compared
choices['c_grp'] = choosers['c_grp']
choices['a_grp'] = broadcast(alts['a_grp'], choices['alternative_id'])

choices.head(10)


Out[13]:
alternative_id prob c_grp a_grp
0 72 0.036804 0 0
1 8 0.013499 1 3
2 12 0.016737 3 2
3 84 0.035745 2 2
4 90 0.015277 1 2
5 29 0.016045 3 2
6 83 0.016088 3 2
7 9 0.016395 3 2
8 41 0.033484 2 2
9 92 0.015849 1 2

10 rows × 4 columns

time: 18 ms

In [14]:
choices.groupby(['c_grp', 'a_grp']).size().to_frame()


Out[14]:
0
c_grp a_grp
0 0 46
1 38
2 30
3 18
1 0 20
1 61
2 26
3 29
2 0 17
1 27
2 40
3 24
3 0 17
1 22
2 34
3 51

16 rows × 1 columns

time: 13 ms

In [15]:
# now choose with per-alternative capacities ('cap' column) enforced;
# returns the chosen alternative per chooser and, presumably, the
# remaining capacities -- confirm against smartpy_core.choice
choices, capacities = capacity_choice_with_sampling(
    choosers,
    alts, 
    'cap',
    match_probs,
    sample_size=sample_size
)


time: 60 ms

In [16]:
# align the capacity-constrained choices back onto the choosers and pull
# in each chosen alternative's group
c = choosers.copy()
c['alternative_id'] = choices
c['a_grp'] = broadcast(alts['a_grp'], c['alternative_id'])


time: 4 ms

In [17]:
c.groupby(['c_grp', 'a_grp']).size().to_frame()


Out[17]:
0
c_grp a_grp
0 0 49
1 36
2 28
3 19
1 0 22
1 52
2 31
3 31
2 0 20
1 18
2 46
3 24
3 0 20
1 30
2 25
3 49

16 rows × 1 columns

time: 14 ms

test 3 - mnl style

This is the typical way we would apply these, via coefficients from an estimation; however, here I'm just making them up.


In [22]:
# made-up MNL coefficients, keyed by the column names they apply to:
# 'col' on the choosers, 'alt_col' on the alternatives
coeff = pd.Series([0.000001, .5], index=['col', 'alt_col'])
coeff


Out[22]:
col        0.000001
alt_col    0.500000
dtype: float64
time: 5 ms

In [23]:
# run some tests: repeat the MNL-style choice with and without capacities
# and accumulate per-alternative choice counts
res = alts.copy()
res['choices'] = 0
res['choices_w_cap'] = 0

for i in range(num_tests):

    if i % 100 == 0:
        # parenthesized single-argument print works in both Python 2 and 3
        print('on test: {}'.format(i))

    # 1st run w/out capacities
    choices = mnl_choice_with_sampling(
        choosers,
        alts,
        coeff,
        sample_size=sample_size
    )
    choice_sums = choices.value_counts().reindex(res.index).fillna(0)
    res['choices'] += choice_sums

    # next run with capacities
    choices = mnl_choice_with_sampling(
        choosers,
        alts,
        coeff,
        sample_size=sample_size,
        cap_col='cap'
    )

    choice_sums = choices.value_counts().reindex(res.index).fillna(0)
    res['choices_w_cap'] += choice_sums
    if (choice_sums > cap_per_alt).any():
        # sanity check -- no alternative should ever exceed its capacity.
        # NOTE: the original also printed `capacities.min()` here, but
        # `capacities` is never assigned in this cell -- it only worked via a
        # stale variable leaked from an earlier cell (hidden kernel state),
        # so that print has been removed.
        print('we blew capacity!')
        print(choice_sums.max())

# summarize: average choices per test and the implied probabilities
res['avg_choices'] = res['choices'] / num_tests
res['avg_choices_w_cap'] = res['choices_w_cap'] / num_tests
res['choices_p'] = get_probs(res['choices'])
res['choices_w_cap_p'] = get_probs(res['choices_w_cap'])


on test: 0
on test: 100
on test: 200
on test: 300
on test: 400
on test: 500
on test: 600
on test: 700
on test: 800
on test: 900
time: 1min 35s

In [24]:
res.head(10)


Out[24]:
alt_col cap p a_grp choices choices_w_cap avg_choices avg_choices_w_cap choices_p choices_w_cap_p
0 0.0 10 0.000005 1 169 338 0.169 0.338 0.000338 0.000676
1 0.1 10 0.000005 1 211 348 0.211 0.348 0.000422 0.000696
2 0.2 10 0.000006 2 193 371 0.193 0.371 0.000386 0.000742
3 0.3 10 0.000006 0 217 404 0.217 0.404 0.000434 0.000808
4 0.4 10 0.000007 0 241 395 0.241 0.395 0.000482 0.000790
5 0.5 10 0.000008 1 242 431 0.242 0.431 0.000484 0.000862
6 0.6 10 0.000009 0 202 430 0.202 0.430 0.000404 0.000860
7 0.7 10 0.000010 3 278 503 0.278 0.503 0.000556 0.001006
8 0.8 10 0.000011 3 286 488 0.286 0.488 0.000572 0.000976
9 0.9 10 0.000012 2 270 524 0.270 0.524 0.000540 0.001048

10 rows × 10 columns

time: 31 ms

In [25]:
res.tail(10)


Out[25]:
alt_col cap p a_grp choices choices_w_cap avg_choices avg_choices_w_cap choices_p choices_w_cap_p
90 9.0 10 0.038692 2 15579 10000 15.579 10 0.031158 0.02
91 9.1 10 0.042761 3 16204 10000 16.204 10 0.032408 0.02
92 9.2 10 0.047258 2 17117 10000 17.117 10 0.034234 0.02
93 9.3 10 0.052229 3 17796 10000 17.796 10 0.035592 0.02
94 9.4 10 0.057722 1 18912 10000 18.912 10 0.037824 0.02
95 9.5 10 0.063792 0 19690 10000 19.690 10 0.039380 0.02
96 9.6 10 0.070501 3 20550 10000 20.550 10 0.041100 0.02
97 9.7 10 0.077916 1 21828 10000 21.828 10 0.043656 0.02
98 9.8 10 0.086111 1 23193 10000 23.193 10 0.046386 0.02
99 9.9 10 0.095167 0 23962 10000 23.962 10 0.047924 0.02

10 rows × 10 columns

time: 76 ms

In [21]: