choice_with_sampling_testing


The objective of this notebook is to test the ability of the sampling weighted choice to reproduce probabilities under different conditions.

Vary the inputs for the sample size, the number of agents, the number of alternatives, or the probability function itself to see how this performs.


In [1]:
import numpy as np
import pandas as pd

from smartpy_core.sampling import *
from smartpy_core.choice import *


time: 445 ms

In [2]:
# simulation settings -- tweak these to stress the sampling choice code
num_tests = 1000      # number of Monte Carlo repetitions
sample_size = 50      # alternatives sampled per chooser
num_choosers = 500    # number of choosing agents
num_alts = 100        # size of the alternative set
cap_per_alt = 10      # capacity available at each alternative


time: 3 ms

In [3]:
# create our choosers: one attribute running 0.0, 0.1, 0.2, ...
chooser_vals = np.arange(num_choosers) / 10.0
choosers = pd.DataFrame({'col': chooser_vals})
choosers.head(10)


Out[3]:
col
0 0.0
1 0.1
2 0.2
3 0.3
4 0.4
5 0.5
6 0.6
7 0.7
8 0.8
9 0.9

10 rows × 1 columns

time: 11 ms

In [4]:
# create alternatives: an attribute column, a fixed capacity, and the
# "true" probability implied by an exponentiated-utility weighting
alt_vals = np.arange(num_alts) / 10.0
alts = pd.DataFrame({'alt_col': alt_vals})
alts['cap'] = cap_per_alt
alts['p'] = get_probs(np.exp(alts['alt_col']))

alts.head(10)


Out[4]:
alt_col cap p
0 0.0 10 0.000005
1 0.1 10 0.000005
2 0.2 10 0.000006
3 0.3 10 0.000006
4 0.4 10 0.000007
5 0.5 10 0.000008
6 0.6 10 0.000009
7 0.7 10 0.000010
8 0.8 10 0.000011
9 0.9 10 0.000012

10 rows × 3 columns

time: 16 ms

test 1 - just alternatives

define a simple probability function that just uses alternative values, and see how well the results match the alternatives' probabilities


In [5]:
# create a simple probability function that just uses the alternative values as utils
def simple_probs(interaction_data, num_choosers, sample_size):
    """
    Row-normalized exponentiated utilities from the sampled alternatives'
    'alt_col' values. Returns a (num_choosers, sample_size) array whose
    rows each sum to 1.
    """
    utils = np.exp(interaction_data['alt_col'].values)
    utils = utils.reshape(num_choosers, sample_size)
    row_totals = utils.sum(axis=1, keepdims=True)
    return utils / row_totals


time: 2 ms

In [6]:
# run the tests: repeatedly make choices with and without capacities and
# accumulate how often each alternative is picked, so the empirical choice
# shares can be compared against the alternatives' probabilities
res = alts.copy()
res['choices'] = 0
res['choices_w_cap'] = 0

for i in range(num_tests):
    # 1st run w/out capacities
    choices = choice_with_sampling(
        choosers,
        alts,
        simple_probs,
        sample_size=sample_size
    )

    if i % 100 == 0:
        # parenthesized single-argument print works in both Python 2 and 3
        print('on test: {}'.format(i))

    # tally choices per alternative, filling alternatives never chosen with 0
    choice_sums = choices.groupby('alternative_id').size().reindex(res.index).fillna(0)
    res['choices'] += choice_sums

    # next run with capacities
    choices, capacities = capacity_choice_with_sampling(
        choosers,
        alts,
        'cap',
        simple_probs,
        sample_size=sample_size
    )
    choice_sums = choices.value_counts().reindex(res.index).fillna(0)
    res['choices_w_cap'] += choice_sums
    if (choice_sums > cap_per_alt).any():
        # sanity check -- no alternative should ever exceed its capacity
        print('we blew capacity!')
        print(choice_sums.max())
        print(capacities.min())

# summarize: average choices per test and the implied probabilities
res['avg_choices'] = res['choices'] / num_tests
res['avg_choices_w_cap'] = res['choices_w_cap'] / num_tests
res['choices_p'] = get_probs(res['choices'])
res['choices_w_cap_p'] = get_probs(res['choices_w_cap'])


on test: 0
on test: 100
on test: 200
on test: 300
on test: 400
on test: 500
on test: 600
on test: 700
on test: 800
on test: 900
time: 1min 42s

In [7]:
res.head(10)


Out[7]:
alt_col cap p choices choices_w_cap avg_choices avg_choices_w_cap choices_p choices_w_cap_p
0 0.0 10 0.000005 3 33 0.003 0.033 0.000006 0.000066
1 0.1 10 0.000005 8 38 0.008 0.038 0.000016 0.000076
2 0.2 10 0.000006 3 35 0.003 0.035 0.000006 0.000070
3 0.3 10 0.000006 1 40 0.001 0.040 0.000002 0.000080
4 0.4 10 0.000007 4 50 0.004 0.050 0.000008 0.000100
5 0.5 10 0.000008 2 39 0.002 0.039 0.000004 0.000078
6 0.6 10 0.000009 6 47 0.006 0.047 0.000012 0.000094
7 0.7 10 0.000010 7 51 0.007 0.051 0.000014 0.000102
8 0.8 10 0.000011 3 75 0.003 0.075 0.000006 0.000150
9 0.9 10 0.000012 7 79 0.007 0.079 0.000014 0.000158

10 rows × 9 columns

time: 27 ms

In [8]:
res.tail(10)


Out[8]:
alt_col cap p choices choices_w_cap avg_choices avg_choices_w_cap choices_p choices_w_cap_p
90 9.0 10 0.038692 19672 10000 19.672 10 0.039344 0.02
91 9.1 10 0.042761 21856 10000 21.856 10 0.043712 0.02
92 9.2 10 0.047258 23411 10000 23.411 10 0.046822 0.02
93 9.3 10 0.052229 25844 10000 25.844 10 0.051688 0.02
94 9.4 10 0.057722 28797 10000 28.797 10 0.057594 0.02
95 9.5 10 0.063792 31434 10000 31.434 10 0.062868 0.02
96 9.6 10 0.070501 34604 10000 34.604 10 0.069208 0.02
97 9.7 10 0.077916 37975 10000 37.975 10 0.075950 0.02
98 9.8 10 0.086111 41382 10000 41.382 10 0.082764 0.02
99 9.9 10 0.095167 44929 10000 44.929 10 0.089858 0.02

10 rows × 9 columns

time: 28 ms

test 2

use a social distance-esque approach to test matching agents with alternatives based on the characteristics of both


In [9]:
choosers['c_grp'] = np.random.randint(0, 4, len(choosers))

choosers.head(10)


Out[9]:
col c_grp
0 0.0 0
1 0.1 1
2 0.2 3
3 0.3 2
4 0.4 1
5 0.5 3
6 0.6 3
7 0.7 3
8 0.8 2
9 0.9 1

10 rows × 2 columns

time: 11 ms

In [10]:
alts['a_grp'] = np.random.randint(0, 4, len(alts))

alts.head(10)


Out[10]:
alt_col cap p a_grp
0 0.0 10 0.000005 1
1 0.1 10 0.000005 1
2 0.2 10 0.000006 2
3 0.3 10 0.000006 0
4 0.4 10 0.000007 0
5 0.5 10 0.000008 1
6 0.6 10 0.000009 0
7 0.7 10 0.000010 3
8 0.8 10 0.000011 3
9 0.9 10 0.000012 2

10 rows × 4 columns

time: 14 ms

In [11]:
# a probability function that tries to match agents and choosers in a social distance like way
def match_probs(interaction_data, num_choosers, sample_size):
    """
    Social-distance style probabilities: utility decays with the squared
    group gap between chooser ('c_grp') and alternative ('a_grp'), so
    same-group pairings get the highest weight. Returns a
    (num_choosers, sample_size) array whose rows each sum to 1.
    """
    grp_gap = (interaction_data['c_grp'] - interaction_data['a_grp']).abs()
    social_dist = grp_gap + 1.0
    utils = np.exp(1 / social_dist**2).values.reshape(num_choosers, sample_size)
    row_totals = utils.sum(axis=1, keepdims=True)
    return utils / row_totals


time: 4 ms

In [12]:
# test without capacity
# each chooser samples `sample_size` alternatives and picks one according
# to match_probs; the result has an 'alternative_id' and 'prob' per chooser
choices = choice_with_sampling(
    choosers, 
    alts,
    match_probs,
    sample_size=sample_size
)


time: 77 ms

In [13]:
# attach the chooser's group and (broadcast through the chosen ids)
# the chosen alternative's group, so the pairings can be compared
choices['c_grp'] = choosers['c_grp']
choices['a_grp'] = broadcast(alts['a_grp'], choices['alternative_id'])

choices.head(10)


Out[13]:
alternative_id prob c_grp a_grp
0 72 0.036804 0 0
1 8 0.013499 1 3
2 12 0.016737 3 2
3 84 0.035745 2 2
4 90 0.015277 1 2
5 29 0.016045 3 2
6 83 0.016088 3 2
7 9 0.016395 3 2
8 41 0.033484 2 2
9 92 0.015849 1 2

10 rows × 4 columns

time: 18 ms

In [14]:
choices.groupby(['c_grp', 'a_grp']).size().to_frame()


Out[14]:
0
c_grp a_grp
0 0 46
1 38
2 30
3 18
1 0 20
1 61
2 26
3 29
2 0 17
1 27
2 40
3 24
3 0 17
1 22
2 34
3 51

16 rows × 1 columns

time: 13 ms

In [15]:
# now choose with per-alternative capacities ('cap' column) enforced;
# returns the chosen alternative per chooser and, presumably, the
# remaining capacities -- confirm against smartpy_core.choice
choices, capacities = capacity_choice_with_sampling(
    choosers,
    alts, 
    'cap',
    match_probs,
    sample_size=sample_size
)


time: 60 ms

In [16]:
# align the capacity-constrained choices back onto the choosers and pull
# in each chosen alternative's group
c = choosers.copy()
c['alternative_id'] = choices
c['a_grp'] = broadcast(alts['a_grp'], c['alternative_id'])


time: 4 ms

In [17]:
c.groupby(['c_grp', 'a_grp']).size().to_frame()


Out[17]:
0
c_grp a_grp
0 0 49
1 36
2 28
3 19
1 0 22
1 52
2 31
3 31
2 0 20
1 18
2 46
3 24
3 0 20
1 30
2 25
3 49

16 rows × 1 columns

time: 14 ms

test 3 - mnl style

This is the typical way we would apply these, via coefficients from an estimation; however, here I'm just making them up.


In [22]:
# made-up MNL coefficients, keyed by the column names they apply to:
# 'col' on the choosers, 'alt_col' on the alternatives
coeff = pd.Series([0.000001, .5], index=['col', 'alt_col'])
coeff


Out[22]:
col        0.000001
alt_col    0.500000
dtype: float64
time: 5 ms

In [23]:
# run some tests: repeat the MNL-style choice with and without capacities
# and accumulate per-alternative choice counts
res = alts.copy()
res['choices'] = 0
res['choices_w_cap'] = 0

for i in range(num_tests):

    if i % 100 == 0:
        # parenthesized single-argument print works in both Python 2 and 3
        print('on test: {}'.format(i))

    # 1st run w/out capacities
    choices = mnl_choice_with_sampling(
        choosers,
        alts,
        coeff,
        sample_size=sample_size
    )
    choice_sums = choices.value_counts().reindex(res.index).fillna(0)
    res['choices'] += choice_sums

    # next run with capacities
    choices = mnl_choice_with_sampling(
        choosers,
        alts,
        coeff,
        sample_size=sample_size,
        cap_col='cap'
    )

    choice_sums = choices.value_counts().reindex(res.index).fillna(0)
    res['choices_w_cap'] += choice_sums
    if (choice_sums > cap_per_alt).any():
        # sanity check -- no alternative should ever exceed its capacity.
        # NOTE: the original also printed `capacities.min()` here, but
        # `capacities` is never assigned in this cell -- it only worked via a
        # stale variable leaked from an earlier cell (hidden kernel state),
        # so that print has been removed.
        print('we blew capacity!')
        print(choice_sums.max())

# summarize: average choices per test and the implied probabilities
res['avg_choices'] = res['choices'] / num_tests
res['avg_choices_w_cap'] = res['choices_w_cap'] / num_tests
res['choices_p'] = get_probs(res['choices'])
res['choices_w_cap_p'] = get_probs(res['choices_w_cap'])


on test: 0
on test: 100
on test: 200
on test: 300
on test: 400
on test: 500
on test: 600
on test: 700
on test: 800
on test: 900
time: 1min 35s

In [24]:
res.head(10)


Out[24]:
alt_col cap p a_grp choices choices_w_cap avg_choices avg_choices_w_cap choices_p choices_w_cap_p
0 0.0 10 0.000005 1 169 338 0.169 0.338 0.000338 0.000676
1 0.1 10 0.000005 1 211 348 0.211 0.348 0.000422 0.000696
2 0.2 10 0.000006 2 193 371 0.193 0.371 0.000386 0.000742
3 0.3 10 0.000006 0 217 404 0.217 0.404 0.000434 0.000808
4 0.4 10 0.000007 0 241 395 0.241 0.395 0.000482 0.000790
5 0.5 10 0.000008 1 242 431 0.242 0.431 0.000484 0.000862
6 0.6 10 0.000009 0 202 430 0.202 0.430 0.000404 0.000860
7 0.7 10 0.000010 3 278 503 0.278 0.503 0.000556 0.001006
8 0.8 10 0.000011 3 286 488 0.286 0.488 0.000572 0.000976
9 0.9 10 0.000012 2 270 524 0.270 0.524 0.000540 0.001048

10 rows × 10 columns

time: 31 ms

In [25]:
res.tail(10)


Out[25]:
alt_col cap p a_grp choices choices_w_cap avg_choices avg_choices_w_cap choices_p choices_w_cap_p
90 9.0 10 0.038692 2 15579 10000 15.579 10 0.031158 0.02
91 9.1 10 0.042761 3 16204 10000 16.204 10 0.032408 0.02
92 9.2 10 0.047258 2 17117 10000 17.117 10 0.034234 0.02
93 9.3 10 0.052229 3 17796 10000 17.796 10 0.035592 0.02
94 9.4 10 0.057722 1 18912 10000 18.912 10 0.037824 0.02
95 9.5 10 0.063792 0 19690 10000 19.690 10 0.039380 0.02
96 9.6 10 0.070501 3 20550 10000 20.550 10 0.041100 0.02
97 9.7 10 0.077916 1 21828 10000 21.828 10 0.043656 0.02
98 9.8 10 0.086111 1 23193 10000 23.193 10 0.046386 0.02
99 9.9 10 0.095167 0 23962 10000 23.962 10 0.047924 0.02

10 rows × 10 columns

time: 76 ms

In [21]: