In [52]:
nbuckets = 256 # Number of hash buckets the user names are distributed over
In [53]:
# One integer counter per bucket, all starting at zero.
# np.zeros with an explicit int dtype avoids building a temporary Python list
# and keeps the array integer-typed even when nbuckets is 0.
counters = np.zeros(nbuckets, dtype=int)
In [54]:
def hash_index(s, nbuckets=nbuckets):
    '''Map `s` to a bucket number in the range 0..nbuckets-1.

    The bucket is the built-in `hash` of `s` reduced modulo `nbuckets`.
    The default for `nbuckets` is captured when this cell is executed.

    Note: on Python 3, str hashes are salted per process unless
    PYTHONHASHSEED is fixed, so bucket numbers can differ between kernels.
    '''
    digest = hash(s)
    return digest % nbuckets
In [55]:
# Populate the counts: one increment per user name found in users.txt.
counters[:] = 0  # Reset first so re-running this cell does not double count
with open('users.txt') as fo:
    for line in fo:
        user = line.strip()
        if not user:
            continue  # Blank line: not a user, do not count it
        counters[hash_index(user)] += 1
In [56]:
# Plot the distribution over buckets
# The x-range comes from `nbuckets`, the same value `xs` is built from
# (`size` was never assigned in this notebook section).
xs = np.arange(nbuckets)
mean_count = counters.mean()  # Computed once, used for both ends of the line
plot(xs, counters, marker='o')
plot([0, nbuckets - 1], [mean_count, mean_count], color='red', label='mean')
grid()
xlim(0, nbuckets)
legend(frameon=False)
Out[56]:
In [57]:
np.std(counters) # Standard deviation of the per-bucket counts
Out[57]:
In [58]:
# Plot density: kernel density estimate of the per-bucket counts
try:
    # Current statsmodels exposes the univariate estimator under this name.
    from statsmodels.nonparametric.kde import KDEUnivariate as KDE
except ImportError:
    # Fall back to the import path this notebook originally used.
    from statsmodels.nonparametric import KDE
kde = KDE(counters.astype(np.double))  # Estimator is fed floating-point data
kde.fit()  # Default fit settings
fill(kde.support, kde.density)
Out[58]:
In [58]: