Use this notebook to compare the results of standard numpy.random.choice calls with weighted random sampling done by sorting on randomized weights, the approach described in:

http://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf



In [1]:

    
import numpy as np
import pandas as pd

from smartpy_core.wrangling import *









    



time: 404 ms



In [2]:

    
def randomize_weights(w):
    """
    Generate random sort keys from weights, for use in a weighted shuffle.

    Each item gets the key ``u ** (1 / p)``, where ``u`` is a uniform draw
    from [0, 1) and ``p`` is the item's share of the total weight. Sorting
    the keys in descending order gives a weighted random ordering, so the
    largest ``k`` keys form a weighted sample of ``k`` items without
    replacement.

    Parameters
    ----------
    w : pandas.Series
        Item weights. Missing and negative values are treated as a weight
        of zero.

    Returns
    -------
    pandas.Series
        Random keys indexed like ``w``. When at least one weight is positive,
        zero-weight items get a key of exactly 0. When no weight is positive,
        the keys are plain uniform draws.

    """
    # A negative weight would become a negative exponent below and yield a
    # key greater than 1, so that item would always sort first. Treat it the
    # same way as a missing weight instead.
    w = w.fillna(0).clip(lower=0)
    w_sum = w.sum()
    u = np.random.rand(len(w))

    if w_sum == 0:
        # no valid weights, just return random
        return pd.Series(u, index=w.index)

    p = w / w_sum
    # 1 / 0 evaluates to inf for zero-weight items, and u ** inf == 0.
    # NOTE(review): for very small p the key can underflow to 0 and tie with
    # other items; confirm this is acceptable for large populations.
    return pd.Series(np.power(u, (1.0 / p).values), index=w.index)









    



time: 6 ms



In [3]:

    
# weights count down from 10 to 0, so the last row carries a zero weight
df = pd.DataFrame({'weight': np.arange(11)[::-1]})

df









    Out[3]:






  
    
      
      weight
    
  
  
    
      0 
       10
    
    
      1 
        9
    
    
      2 
        8
    
    
      3 
        7
    
    
      4 
        6
    
    
      5 
        5
    
    
      6 
        4
    
    
      7 
        3
    
    
      8 
        2
    
    
      9 
        1
    
    
      10
        0
    
  

11 rows × 1 columns







    



time: 12 ms



In [4]:

    
# turn each weight into its share of the total weight and keep it on the frame
probs = df['weight'].div(df['weight'].sum())
df['p'] = probs
probs









    Out[4]:





0     0.181818
1     0.163636
2     0.145455
3     0.127273
4     0.109091
5     0.090909
6     0.072727
7     0.054545
8     0.036364
9     0.018182
10    0.000000
Name: weight, dtype: float64






    



time: 4 ms



In [5]:

    
# test and make sure we are reproducing the distributions
num_tests = 10000
num_samples = 5

def run_test(df, num_tests=num_tests, num_samples=num_samples, seed=None):
    """
    Count how often each row is drawn by numpy and by the randomized-weight
    approach, with and without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'weight' column. The selection probabilities passed to
        numpy are derived from this column.
    num_tests : int, optional
        Number of repetitions. Defaults to the notebook-level ``num_tests``.
    num_samples : int, optional
        Number of rows drawn per repetition and per method. Defaults to the
        notebook-level ``num_samples``.
    seed : int, optional
        If given, seeds numpy's global random state before sampling so the
        counts can be reproduced.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with four count columns added:
        'np_replace' and 'np_noReplace' (``numpy.random.choice`` with and
        without replacement), 'sort_replace' and 'sort_noReplace' (the
        ``randomize_weights`` approach with and without replacement).

    """
    if seed is not None:
        np.random.seed(seed)

    df = df.copy()
    for col in ('np_replace', 'sort_replace', 'np_noReplace', 'sort_noReplace'):
        df[col] = 0

    # Derive the probabilities from the frame that was passed in, rather than
    # from a variable that happens to exist in the notebook namespace.
    labels = df.index.values
    weights = df['weight']
    p = (weights / weights.sum()).values

    for _ in range(num_tests):

        # numpy sample w replacement
        np_rep_idx = np.random.choice(labels, size=num_samples, p=p)
        cnts = pd.Series(np_rep_idx).value_counts()
        df.loc[cnts.index, 'np_replace'] += cnts

        # numpy sample no replacement
        np_noRep_idx = np.random.choice(labels, size=num_samples, p=p, replace=False)
        df.loc[np_noRep_idx, 'np_noReplace'] += 1

        # my approach w replacement: a fresh set of keys for every single draw
        for _ in range(num_samples):
            rw = randomize_weights(weights)
            # idxmax returns the index label, which is what .loc expects;
            # argmax returns a position in current pandas versions
            choice_idx = rw.idxmax()
            df.loc[choice_idx, 'sort_replace'] += 1

        # my approach w/out replacement: keep the num_samples largest keys
        rw = randomize_weights(weights)
        rank = rw.rank(ascending=False, method='first')
        chosen = rank <= num_samples
        df.loc[chosen, 'sort_noReplace'] += 1

    return df

results = run_test(df)

results









    Out[5]:






  
    
      
      weight
      p
      np_replace
      sort_replace
      np_noReplace
      sort_noReplace
    
  
  
    
      0 
       10
       0.181818
       8938
       9108
       7639
       7519
    
    
      1 
        9
       0.163636
       8168
       8093
       7203
       7192
    
    
      2 
        8
       0.145455
       7127
       7367
       6800
       6794
    
    
      3 
        7
       0.127273
       6386
       6385
       6323
       6389
    
    
      4 
        6
       0.109091
       5513
       5373
       5677
       5755
    
    
      5 
        5
       0.090909
       4601
       4578
       5046
       5076
    
    
      6 
        4
       0.072727
       3694
       3593
       4266
       4314
    
    
      7 
        3
       0.054545
       2763
       2764
       3367
       3409
    
    
      8 
        2
       0.036364
       1885
       1833
       2380
       2294
    
    
      9 
        1
       0.018182
        925
        906
       1299
       1258
    
    
      10
        0
       0.000000
          0
          0
          0
          0
    
  

11 rows × 6 columns







    



time: 2min



In [6]:

    
# express every count as a share of all draws made (num_tests * num_samples)
results_p = pd.concat(
    [
        df.loc[:, ['weight', 'p']],
        results.loc[
            :, ['np_replace', 'sort_replace', 'np_noReplace', 'sort_noReplace']
        ].div(float(num_tests * num_samples)),
    ],
    axis=1,
)
results_p









    Out[6]:






  
    
      
      weight
      p
      np_replace
      sort_replace
      np_noReplace
      sort_noReplace
    
  
  
    
      0 
       10
       0.181818
       0.17876
       0.18216
       0.15278
       0.15038
    
    
      1 
        9
       0.163636
       0.16336
       0.16186
       0.14406
       0.14384
    
    
      2 
        8
       0.145455
       0.14254
       0.14734
       0.13600
       0.13588
    
    
      3 
        7
       0.127273
       0.12772
       0.12770
       0.12646
       0.12778
    
    
      4 
        6
       0.109091
       0.11026
       0.10746
       0.11354
       0.11510
    
    
      5 
        5
       0.090909
       0.09202
       0.09156
       0.10092
       0.10152
    
    
      6 
        4
       0.072727
       0.07388
       0.07186
       0.08532
       0.08628
    
    
      7 
        3
       0.054545
       0.05526
       0.05528
       0.06734
       0.06818
    
    
      8 
        2
       0.036364
       0.03770
       0.03666
       0.04760
       0.04588
    
    
      9 
        1
       0.018182
       0.01850
       0.01812
       0.02598
       0.02516
    
    
      10
        0
       0.000000
       0.00000
       0.00000
       0.00000
       0.00000
    
  

11 rows × 6 columns







    



time: 25 ms



In [6]:

	weight	p	np_replace	sort_replace	np_noReplace	sort_noReplace
0	10	0.181818	8938	9108	7639	7519
1	9	0.163636	8168	8093	7203	7192
2	8	0.145455	7127	7367	6800	6794
3	7	0.127273	6386	6385	6323	6389
4	6	0.109091	5513	5373	5677	5755
5	5	0.090909	4601	4578	5046	5076
6	4	0.072727	3694	3593	4266	4314
7	3	0.054545	2763	2764	3367	3409
8	2	0.036364	1885	1833	2380	2294
9	1	0.018182	925	906	1299	1258
10	0	0.000000	0	0	0	0

	weight	p	np_replace	sort_replace	np_noReplace	sort_noReplace
0	10	0.181818	0.17876	0.18216	0.15278	0.15038
1	9	0.163636	0.16336	0.16186	0.14406	0.14384
2	8	0.145455	0.14254	0.14734	0.13600	0.13588
3	7	0.127273	0.12772	0.12770	0.12646	0.12778
4	6	0.109091	0.11026	0.10746	0.11354	0.11510
5	5	0.090909	0.09202	0.09156	0.10092	0.10152
6	4	0.072727	0.07388	0.07186	0.08532	0.08628
7	3	0.054545	0.05526	0.05528	0.06734	0.06818
8	2	0.036364	0.03770	0.03666	0.04760	0.04588
9	1	0.018182	0.01850	0.01812	0.02598	0.02516
10	0	0.000000	0.00000	0.00000	0.00000	0.00000