notebook.community

Edit and run



In [52]:

    
# Number of hash buckets the counts are spread over
nbuckets = 256



In [53]:

    
# One integer counter per bucket, all starting at zero.
# np.zeros allocates the array directly (no temporary Python list) and the
# explicit dtype keeps the counters integral whatever nbuckets is.
counters = np.zeros(nbuckets, dtype=int)



In [54]:

    
def hash_index(s, nbuckets=nbuckets):
    """Return the bucket number for ``s``.

    The bucket is the built-in ``hash`` of ``s`` reduced modulo ``nbuckets``,
    i.e. an integer in ``0 .. nbuckets-1`` for a positive ``nbuckets``.

    The default for ``nbuckets`` is whatever the module-level ``nbuckets``
    held when this ``def`` was executed; re-run this cell after changing it.
    """
    bucket = hash(s) % nbuckets
    return bucket



In [55]:

    
# Populate the counts: one increment per line of users.txt, in the bucket
# that the whitespace-stripped line hashes to.
# Note: counts are added to whatever `counters` already holds, so running
# this cell again accumulates on top of the previous pass.
with open('users.txt') as users_file:
    for raw_line in users_file:
        bucket = hash_index(raw_line.strip())
        counters[bucket] += 1



In [56]:

    
# Plot the distribution over buckets: one marker per bucket count, plus a
# red horizontal line at the mean count for reference.
xs = np.arange(nbuckets)
mean_count = counters.mean()  # computed once, used for both ends of the line
plot(xs, counters, marker='o')
# Span the mean line over the same x-range as `xs` (0 .. nbuckets-1); the
# extent comes from `nbuckets`, the variable that sizes `xs` and `counters`.
plot([0, nbuckets - 1], [mean_count, mean_count], color='red', label='mean')
grid()
xlim(0, nbuckets)
legend(frameon=False)









    Out[56]:





<matplotlib.legend.Legend at 0x51746d0>



In [57]:

    
np.std(counters)  # Standard deviation of the per-bucket counts









    Out[57]:





51.444682958135587



In [58]:

    
# Plot density: kernel density estimate of the per-bucket counts, drawn as a
# filled curve.
try:
    # Name under which statsmodels exposes the univariate estimator today.
    from statsmodels.nonparametric.kde import KDEUnivariate as KDE
except ImportError:
    # Fall back to the import this notebook was originally written against.
    from statsmodels.nonparametric import KDE
kde = KDE(counters.astype(np.double))  # the estimator is fed float data
kde.fit()  # default fit settings; fills in kde.support and kde.density
fill(kde.support, kde.density)









    Out[58]:





[<matplotlib.patches.Polygon at 0x530af50>]



In [58]: