Use this notebook to compare the performance of standard numpy.random.choice calls with the approach defined in:
In [1]:
import numpy as np
import pandas as pd
from smartpy_core.wrangling import *
In [2]:
def randomize_weights(w):
"""
Generate probabilties, such that when sorted, they can be used to do weighted shuffle.
"""
w = w.fillna(0)
w_sum = w.sum()
if w_sum == 0:
# no valid weights, just return random
return pd.Series(np.random.rand(len(w)), index=w.index)
else:
p = w / w_sum
#print p
return pd.Series(np.power(np.random.rand(len(w)), 1.0 / p), index=w.index)
In [3]:
df = pd.DataFrame({'weight': np.arange(10, -1, step=-1)})
df
Out[3]:
In [4]:
# now look at the probabilities from the weights
probs = df['weight'] / df['weight'].sum()
df['p'] = probs
probs
Out[4]:
In [5]:
# test and make sure we are reproducing the distributions
num_tests = 10000
num_samples = 5
def run_test(df):
df = df.copy()
df['np_replace'] = 0
df['sort_replace'] = 0
df['np_noReplace'] = 0
df['sort_noReplace'] = 0
for i in range(num_tests):
# numpy sample w replacement
np_rep_idx = np.random.choice(df.index.values, size=num_samples, p=probs.values)
cnts = pd.Series(np_rep_idx).value_counts()
df.loc[cnts.index, 'np_replace'] += cnts
# numpy sample no replacement
np_noRep_idx = np.random.choice(df.index.values, size=num_samples, p=probs.values, replace=False)
df.loc[np_noRep_idx, 'np_noReplace'] += 1
# my approach w replacement
for i in range(num_samples):
rw = randomize_weights(df['weight'])
choice_idx = rw.argmax()
df.loc[choice_idx, 'sort_replace'] += 1
# my approach w/out replacment
rw = randomize_weights(df['weight'])
rank = rw.rank(ascending=False, method='first')
choice_idx = rank <= num_samples
df.loc[choice_idx, 'sort_noReplace'] += 1
return df
results = run_test(df)
results
Out[5]:
In [6]:
# compute probabilities
results_p = pd.concat([
df[['weight', 'p']],
results[['np_replace', 'sort_replace', 'np_noReplace', 'sort_noReplace']] / (1.0 * num_tests * num_samples)
], axis=1)
results_p
Out[6]:
In [6]: