In [52]:
# How many buckets the hashed values are distributed over
nbuckets = 256

In [53]:
# One integer counter per bucket, all starting at zero
counters = np.zeros(nbuckets, dtype=int)

In [54]:
def hash_index(s, nbuckets=nbuckets):
    '''Map a value to a bucket number in the range 0..nbuckets-1.

    s        -- the value to hash; it is passed to the builtin hash()
    nbuckets -- number of buckets; the default is the module-level
                nbuckets as it was when this definition was executed

    NOTE(review): on Python 3 the builtin hash() of a str is salted per
    interpreter process unless PYTHONHASHSEED is fixed, so bucket
    assignments may change between kernel restarts -- confirm which
    Python this notebook targets.
    '''
    hashed = hash(s)
    return hashed % nbuckets

In [55]:
# Populate the counts: one increment per line of users.txt
counters.fill(0)  # reset first so re-running this cell does not double-count
with open('users.txt') as fo:
    for line in fo:
        user = line.strip()  # drop the newline and surrounding whitespace
        counters[hash_index(user)] += 1

In [56]:
# Plot the distribution over buckets, with the mean count as a reference line
xs = np.arange(nbuckets)
mean_count = counters.mean()  # computed once, used for both ends of the line
plot(xs, counters, marker='o')
# The horizontal extent comes from nbuckets (the same value xs is built from),
# so this cell does not depend on any separately defined size variable.
plot([0, nbuckets - 1], [mean_count, mean_count], color='red', label='mean')
grid()
xlim(0, nbuckets)
legend(frameon=False)


Out[56]:
<matplotlib.legend.Legend at 0x51746d0>

In [57]:
np.std(counters)  # Standard deviation of the per-bucket counts


Out[57]:
51.444682958135587

In [58]:
# Plot a kernel density estimate of the per-bucket counts
try:
    # Current statsmodels name for the univariate estimator
    from statsmodels.nonparametric.kde import KDEUnivariate as KDE
except ImportError:
    # Legacy name, kept as a fallback for old statsmodels installs
    from statsmodels.nonparametric import KDE
kde = KDE(counters.astype(np.double))  # counts are converted to floats first
kde.fit()  # called without arguments, so the library defaults apply
fill(kde.support, kde.density)


Out[58]:
[<matplotlib.patches.Polygon at 0x530af50>]

In [58]: