compare_weighted_sampling_approaches


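Use this notebook to compare the sampling distributions produced by standard `numpy.random.choice` calls with those of the sort-based weighted shuffle implemented in `randomize_weights` below (the reference link for that approach is missing from this export).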


In [1]:
import numpy as np
import pandas as pd

from smartpy_core.wrangling import *


time: 404 ms

In [2]:
def randomize_weights(w):
    """
    Turn weights into random sort keys that can drive a weighted shuffle.

    Missing weights are treated as 0. Each key is ``u ** (1 / p)``, where ``u``
    is a uniform draw from ``np.random.rand`` and ``p`` is the item's share of
    the total weight; ordering the keys from largest to smallest yields the
    shuffled order. If the weights sum to 0 the keys are plain uniform draws.

    ``w`` is expected to be a pandas Series; the result is a Series of keys
    carrying the same index.

    """
    filled = w.fillna(0)
    total = filled.sum()
    draws = np.random.rand(len(filled))

    # no valid weights: nothing to weight by, so hand back the raw draws
    if total == 0:
        return pd.Series(draws, index=filled.index)

    # a zero share produces an infinite exponent, which drives that key to 0
    exponents = 1.0 / (filled / total)
    return pd.Series(np.power(draws, exponents), index=filled.index)


time: 6 ms

In [3]:
# toy data: integer weights counting down from 10 to 0 (the last row has zero weight)
df = pd.DataFrame({'weight': np.arange(10, -1, -1)})

df


Out[3]:
weight
0 10
1 9
2 8
3 7
4 6
5 5
6 4
7 3
8 2
9 1
10 0

11 rows × 1 columns

time: 12 ms

In [4]:
# convert the weights into selection probabilities (each weight's share of the total)
probs = df['weight'].div(df['weight'].sum())
df['p'] = probs
probs


Out[4]:
0     0.181818
1     0.163636
2     0.145455
3     0.127273
4     0.109091
5     0.090909
6     0.072727
7     0.054545
8     0.036364
9     0.018182
10    0.000000
Name: weight, dtype: float64
time: 4 ms

In [5]:
# test and make sure we are reproducing the distributions
# num_tests: how many times run_test repeats the full set of draws
# num_samples: how many items each approach draws per repetition
num_tests = 10000
num_samples = 5

def run_test(df, n_tests=None, n_samples=None):
    """
    Tally how often each row is selected by four weighted sampling approaches.

    On every repetition, ``n_samples`` rows are drawn with:
      - ``np.random.choice`` with replacement      -> ``np_replace``
      - ``np.random.choice`` without replacement   -> ``np_noReplace``
      - ``randomize_weights`` + max key, repeated  -> ``sort_replace``
      - ``randomize_weights`` + top ranked keys    -> ``sort_noReplace``

    ``df`` must have a ``weight`` column; it is not modified. ``n_tests`` and
    ``n_samples`` default to the notebook-level ``num_tests`` and
    ``num_samples``. Returns a copy of ``df`` with the four count columns added.

    """
    if n_tests is None:
        n_tests = num_tests
    if n_samples is None:
        n_samples = num_samples

    df = df.copy()

    # derive the probabilities from the frame that was passed in instead of
    # relying on a notebook-level variable that may be stale or misaligned
    weights = df['weight'].fillna(0)
    p = (weights / weights.sum()).values
    labels = df.index.values

    for col in ['np_replace', 'sort_replace', 'np_noReplace', 'sort_noReplace']:
        df[col] = 0

    for _ in range(n_tests):

        # numpy sample w replacement
        np_rep_idx = np.random.choice(labels, size=n_samples, p=p)
        cnts = pd.Series(np_rep_idx).value_counts()
        df.loc[cnts.index, 'np_replace'] += cnts

        # numpy sample no replacement
        np_noRep_idx = np.random.choice(labels, size=n_samples, p=p, replace=False)
        df.loc[np_noRep_idx, 'np_noReplace'] += 1

        # sort approach w replacement: one fresh set of keys per draw
        for _draw in range(n_samples):
            rw = randomize_weights(weights)
            # idxmax gives the index label, which is what .loc expects
            df.loc[rw.idxmax(), 'sort_replace'] += 1

        # sort approach w/out replacement: keep the n_samples largest keys
        rw = randomize_weights(weights)
        rank = rw.rank(ascending=False, method='first')
        df.loc[rank <= n_samples, 'sort_noReplace'] += 1

    return df

# run the comparison on the toy frame and show the raw selection counts
# (slow: the recorded run of this cell took about 2 minutes)
results = run_test(df)

results


Out[5]:
weight p np_replace sort_replace np_noReplace sort_noReplace
0 10 0.181818 8938 9108 7639 7519
1 9 0.163636 8168 8093 7203 7192
2 8 0.145455 7127 7367 6800 6794
3 7 0.127273 6386 6385 6323 6389
4 6 0.109091 5513 5373 5677 5755
5 5 0.090909 4601 4578 5046 5076
6 4 0.072727 3694 3593 4266 4314
7 3 0.054545 2763 2764 3367 3409
8 2 0.036364 1885 1833 2380 2294
9 1 0.018182 925 906 1299 1258
10 0 0.000000 0 0 0 0

11 rows × 6 columns

time: 2min

In [6]:
# turn the raw counts into shares of all draws (num_tests * num_samples)
# so they can be read side by side with the target probabilities in p
results_p = pd.concat(
    [
        df[['weight', 'p']],
        results[['np_replace', 'sort_replace', 'np_noReplace', 'sort_noReplace']].div(
            float(num_tests * num_samples)
        ),
    ],
    axis=1
)
results_p


Out[6]:
weight p np_replace sort_replace np_noReplace sort_noReplace
0 10 0.181818 0.17876 0.18216 0.15278 0.15038
1 9 0.163636 0.16336 0.16186 0.14406 0.14384
2 8 0.145455 0.14254 0.14734 0.13600 0.13588
3 7 0.127273 0.12772 0.12770 0.12646 0.12778
4 6 0.109091 0.11026 0.10746 0.11354 0.11510
5 5 0.090909 0.09202 0.09156 0.10092 0.10152
6 4 0.072727 0.07388 0.07186 0.08532 0.08628
7 3 0.054545 0.05526 0.05528 0.06734 0.06818
8 2 0.036364 0.03770 0.03666 0.04760 0.04588
9 1 0.018182 0.01850 0.01812 0.02598 0.02516
10 0 0.000000 0.00000 0.00000 0.00000 0.00000

11 rows × 6 columns

time: 25 ms

In [6]: