# compare_weighted_sampling_approaches

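Use this notebook to compare the performance of standard `numpy.random.choice` calls with the sort-based weighted-shuffle approach defined in `randomize_weights` below (the original reference link is missing from this copy).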

import numpy as np
import pandas as pd

from smartpy_core.wrangling import *

def randomize_weights(w):
    """
    Generate random sort keys that can be used to do a weighted shuffle.

    Each item gets the key ``u ** (1 / p)``, where ``u`` is a uniform draw
    from [0, 1) and ``p`` is the item's share of the total weight. Larger
    weights tend to produce larger keys, so sorting the keys in descending
    order gives a weighted random ordering of the items.

    Parameters
    ----------
    w: pandas.Series
        Weights. Missing values are treated as 0.

    Returns
    -------
    pandas.Series
        Random keys, indexed like ``w``. Items with zero weight get a key
        of 0. If every weight is zero (or ``w`` is empty), the keys are
        plain uniform random numbers.

    Raises
    ------
    ValueError
        If any weight is negative. A negative weight would give a negative
        exponent and therefore a key above 1, ranking the item ahead of
        every valid one.

    """
    w = w.fillna(0)

    if (w < 0).any():
        raise ValueError('weights must be non-negative')

    # a single uniform draw per item is needed in either branch
    u = np.random.rand(len(w))
    w_sum = w.sum()

    if w_sum == 0:
        # no valid weights, just return random
        return pd.Series(u, index=w.index)

    p = w / w_sum

    # pandas division turns zero probabilities into an infinite exponent,
    # which maps the key to 0.
    # NOTE: when p is very small the exponent is very large and keys may
    # underflow to 0.0, producing ties among low-weight items.
    exponent = (1.0 / p).values
    return pd.Series(np.power(u, exponent), index=w.index)

# toy table: eleven rows whose weights count down from 10 to 0
df = pd.DataFrame({'weight': np.arange(11)[::-1]})

df

weight

0
10

1
9

2
8

3
7

4
6

5
5

6
4

7
3

8
2

9
1

10
0

11 rows × 1 columns

# express each weight as its share of the total weight
probs = df['weight'].div(df['weight'].sum())
df['p'] = probs
probs

0     0.181818
1     0.163636
2     0.145455
3     0.127273
4     0.109091
5     0.090909
6     0.072727
7     0.054545
8     0.036364
9     0.018182
10    0.000000
Name: weight, dtype: float64

# test and make sure we are reproducing the distributions
num_tests = 10000
num_samples = 5


def run_test(df):
    """
    Count how often each row is drawn by numpy and by the sort-based approach.

    Runs ``num_tests`` trials. In each trial ``num_samples`` rows are drawn
    four ways: ``numpy.random.choice`` with and without replacement, and
    ``randomize_weights`` with replacement (one fresh set of keys per draw,
    taking the largest key) and without replacement (one set of keys,
    taking the ``num_samples`` largest).

    Parameters
    ----------
    df: pandas.DataFrame
        Must contain a 'weight' column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the count columns 'np_replace', 'sort_replace',
        'np_noReplace' and 'sort_noReplace' added.

    """
    df = df.copy()

    # derive the probabilities from the frame we were given rather than
    # relying on a notebook-level variable that may be out of sync with it
    p = (df['weight'] / df['weight'].sum()).values
    labels = df.index.values

    for col in ['np_replace', 'sort_replace', 'np_noReplace', 'sort_noReplace']:
        df[col] = 0

    for _trial in range(num_tests):

        # numpy sample w replacement
        np_rep_idx = np.random.choice(labels, size=num_samples, p=p)
        cnts = pd.Series(np_rep_idx).value_counts()
        df.loc[cnts.index, 'np_replace'] += cnts

        # numpy sample no replacement
        np_noRep_idx = np.random.choice(labels, size=num_samples, p=p, replace=False)
        df.loc[np_noRep_idx, 'np_noReplace'] += 1

        # my approach w replacement
        for _draw in range(num_samples):
            rw = randomize_weights(df['weight'])
            # idxmax returns the index *label*, which is what .loc expects;
            # argmax returns a position in current pandas versions
            choice_idx = rw.idxmax()
            df.loc[choice_idx, 'sort_replace'] += 1

        # my approach w/out replacement
        rw = randomize_weights(df['weight'])
        rank = rw.rank(ascending=False, method='first')
        choice_idx = rank <= num_samples
        df.loc[choice_idx, 'sort_noReplace'] += 1

    return df

# run the comparison on the weight table; the result is a copy of df
# with one selection-count column per sampling approach
results = run_test(df)

results

weight
p
np_replace
sort_replace
np_noReplace
sort_noReplace

0
10
0.181818
8938
9108
7639
7519

1
9
0.163636
8168
8093
7203
7192

2
8
0.145455
7127
7367
6800
6794

3
7
0.127273
6386
6385
6323
6389

4
6
0.109091
5513
5373
5677
5755

5
5
0.090909
4601
4578
5046
5076

6
4
0.072727
3694
3593
4266
4314

7
3
0.054545
2763
2764
3367
3409

8
2
0.036364
1885
1833
2380
2294

9
1
0.018182
925
906
1299
1258

10
0
0.000000
0
0
0
0

11 rows × 6 columns

# turn the raw counts into shares of all draws made
# (trials x samples per trial) so they can be read next to `p`
results_p = pd.concat(
    [
        df[['weight', 'p']],
        results[
            ['np_replace', 'sort_replace', 'np_noReplace', 'sort_noReplace']
        ].div(float(num_tests * num_samples)),
    ],
    axis=1,
)
results_p

weight
p
np_replace
sort_replace
np_noReplace
sort_noReplace

0
10
0.181818
0.17876
0.18216
0.15278
0.15038

1
9
0.163636
0.16336
0.16186
0.14406
0.14384

2
8
0.145455
0.14254
0.14734
0.13600
0.13588

3
7
0.127273
0.12772
0.12770
0.12646
0.12778

4
6
0.109091
0.11026
0.10746
0.11354
0.11510

5
5
0.090909
0.09202
0.09156
0.10092
0.10152

6
4
0.072727
0.07388
0.07186
0.08532
0.08628

7
3
0.054545
0.05526
0.05528
0.06734
0.06818

8
2
0.036364
0.03770
0.03666
0.04760
0.04588

9
1
0.018182
0.01850
0.01812
0.02598
0.02516

10
0
0.000000
0.00000
0.00000
0.00000
0.00000

11 rows × 6 columns

