Interesting and quite simple problem to cound 1 bits in buffer.

I think you might get different results if your data is not long enough. I got these from some links and then modified to use numpy and bytearray. These are generally untested.

http://www.valuedlessons.com/2009/01/popcount-in-python-with-benchmarks.html

http://stackoverflow.com/questions/9829578/fast-way-of-counting-bits-in-python

http://blog.philippklaus.de/2014/10/counting-bits-set-to-1-in-bytes-with-python-popcount-or-hamming-weight/

https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation

http://www.expobrain.net/2013/07/29/hamming-weights-python-implementation/

http://stackoverflow.com/questions/8220801/how-to-use-timeit-module

http://graphics.stanford.edu/~seander/bithacks.html

Clearly best algorithm depends on data and CPU you currently have. Is data 32-bit, 64-bit or big blob of bytearray? I think it clearly matters how data is arranged. Can you use big numpy darrays or only small arrays or no array at all?


In [1]:
import random, struct
import numpy as np
import gmpy2
from gmpy2 import mpz
# not meant to be random
random.seed(1)
d = bytearray([random.randint(0,255) for i in range(4096)])
print(len(d))
v = np.frombuffer(d, dtype=np.uint32)
print(v.shape, v[0])
v = np.frombuffer(d, dtype=np.uint64)*mpz(1)
print(v.shape, v[0], gmpy2.popcount(v[0]))

def count1s64(d):
    v = np.frombuffer(d, dtype=np.uint64)
    v = np.bitwise_and(v, 0x5555555555555555) + np.right_shift(np.bitwise_and(v, 0xAAAAAAAAAAAAAAAA), 1)
    v = np.bitwise_and(v, 0x3333333333333333) + np.right_shift(np.bitwise_and(v, 0xCCCCCCCCCCCCCCCC), 2)
    v = np.bitwise_and(v, 0x0F0F0F0F0F0F0F0F) + np.right_shift(np.bitwise_and(v, 0xF0F0F0F0F0F0F0F0), 4)
    v = np.bitwise_and(v, 0x00FF00FF00FF00FF) + np.right_shift(np.bitwise_and(v, 0xFF00FF00FF00FF00), 8)
    v = np.bitwise_and(v, 0x0000FFFF0000FFFF) + np.right_shift(np.bitwise_and(v, 0xFFFF0000FFFF0000), 16)
    v = np.bitwise_and(v, 0x00000000FFFFFFFF) + np.right_shift(v, 32)
    return v.sum()
v = np.frombuffer(d, dtype=np.uint64)
print(count1s64(d))


4096
(1024,) 1015160900
(512,) 14047262688061562948 29
16203

In [2]:
import timeit
from functools import partial
import random
import copy

In [3]:
class popcount:
    # Everty actual function has prefix: count1s_
    #
    # after that is function arg type:
    # s: takes a number
    # l: takes list etc for iterable
    # a: takes numpy.ndarray
    # d: takes bitarray
    #
    # If function is limited to 32/64 bits it has suffix _32/_64
    
    def count1s_s_naive(v):
        c = 0
        while(v):
            c += v & 1
            v >>= 1
        return c

    def count1s_l_naive(ve):
        c = 0
        for v in ve:
            while(v):
                c += v & 1
                v >>= 1
        return c

    def count1s_ls_naive(ve):
        return sum(popcount.count1s_s_naive(v) for v in ve)

    def count1s_al_naive(ve):
        c = 0
        for v in ve.tolist():
            while(v):
                c += v & 1
                v >>= 1
        return c

    def count1s_als_naive(ve):
        return sum(popcount.count1s_s_naive(v) for v in ve.tolist())

    def count1s_ai_naive(ve):
        c = 0
        for v in np.nditer(ve, flags=['buffered']):
            x = int(v)
            while(x):
                c += x & 1
                x >>= 1
        return c

    def count1s_ais_naive(ve):
        return sum(popcount.count1s_s_naive(int(v)) for v in np.nditer(ve))

    def count1s_d_naive(d):
        c = 0
        dtype = np.uint64
        if len(d) <= 4:
            dtype = np.uint32        
        for v in np.frombuffer(d, dtype=dtype).tolist():
            while(v):
                c += v & 1
                v >>= 1
        return c
    
    ########################################################################
    def count1s_s_pythonic(v):
        return bin(v).count("1")

    def count1s_l_pythonic(ve):
        c = 0
        for v in ve:
            c += bin(v).count("1")
        return c

    def count1s_ls_pythonic(ve):
        return sum(popcount.count1s_s_pythonic(v) for v in ve)

    def count1s_al_pythonic(ve):
        c = 0
        for v in ve.tolist():
            c += bin(v).count("1")
        return c

    def count1s_als_pythonic(ve):
        return sum(popcount.count1s_s_pythonic(v) for v in ve.tolist())

    def count1s_ai_pythonic(ve):
        c = 0
        for v in np.nditer(ve):
            c += bin(v).count("1")
        return c

    def count1s_ais_pythonic(ve):
        return sum(popcount.count1s_s_pythonic(v) for v in np.nditer(ve))

    def count1s_dl_pythonic(d):
        c = 0
        dtype = np.uint64
        if len(d) <= 4:
            dtype = np.uint32        
        for v in np.frombuffer(d, dtype=dtype).tolist():
            c += bin(v).count("1")
        return c

    def count1s_di_pythonic(d):
        c = 0
        dtype = np.uint64
        if len(d) <= 4:
            dtype = np.uint32        
        for v in np.nditer(np.frombuffer(d, dtype=dtype)):
            c += bin(v).count("1")
        return c

    ########################################################################
    # Brian Kernighan way
    def count1s_s_zero(x):
        c = 0
        while x:
            x &= x - 1
            c += 1
        return c

    def count1s_l_zero(ve):
        c = 0
        for x in ve:
            while x:
                x &= x - 1
                c += 1
        return c

    def count1s_al_zero(ve):
        c = 0
        for x in ve.tolist():
            while x:
                x &= x - 1
                c += 1
        return c

    def count1s_ai_zero(ve):
        c = 0
        for v in np.nditer(ve):
            x = int(v)
            while x:
                x &= x - 1
                c += 1
        return c

    def count1s_du_zero(d):
        c = 0
        (div, mod) = divmod(len(d), 8)
        fmts = '{:d}Q'.format(div)
        if mod: fmts += 'I'            
        for x in struct.unpack(fmts, d):
            while x:
                x &= x - 1
                c += 1
        return c

    # Seems I can't get this to work, result of minus is l but x can be I
    def broken_count1s_di_zero(d):
        c = 0
        dtype = np.uint64
        if len(d) <= 4:
            dtype = np.uint32        
        for x in np.nditer(np.frombuffer(d, dtype=dtype), op_flags=['readwrite']):
            while x:
                x &= x - 1
                c += 1
        return c

    ########################################################################
    TABLE8 = [0] * 2**8
    for index in range(len(TABLE8)):
        TABLE8[index] = (index & 1) + TABLE8[index >> 1]
    def popcount32_table8(v):
        return (popcount.TABLE8[ v        & 0xff ] +
                popcount.TABLE8[(v >>  8) & 0xff ] +
                popcount.TABLE8[(v >> 16) & 0xff ] +
                popcount.TABLE8[(v >> 24)        ])
    def popcount64_table8(v):
        return (popcount.TABLE8[ v        & 0xff ] +
                popcount.TABLE8[(v >>  8) & 0xff ] +
                popcount.TABLE8[(v >> 16) & 0xff ] +
                popcount.TABLE8[(v >> 24) & 0xff ] +
                popcount.TABLE8[(v >> 32) & 0xff ] +
                popcount.TABLE8[(v >> 40) & 0xff ] +
                popcount.TABLE8[(v >> 48) & 0xff ] +
                popcount.TABLE8[(v >> 56)        ])

    def count1s_s_lut8_32(v):
        return popcount.popcount32_table8(v)

    def count1s_l1_lut8_32(ve):
        return sum(popcount.popcount32_table8(v) for v in ve)

    def count1s_l2_lut8_32(ve):
        c = 0
        for v in ve:
            c += popcount.popcount32_table8(v)
        return c

    def count1s_dl_lut8_32(d):
        c = 0
        for v in np.frombuffer(d, dtype=np.uint32).tolist():
            c += popcount.popcount32_table8(v)
        return c

    def count1s_di_lut8_32(d):
        return sum(popcount.popcount32_table8(v)
                    for v in np.nditer(np.frombuffer(d, dtype=np.uint32)))

    def count1s_l_lut8_32(ve):
        c = 0
        for v in ve:
            c += popcount.popcount32_table8(v)
        return c

    def count1s_s_lut8_64(v):
        return popcount.popcount64_table8(v)

    def count1s_l_lut8_64(ve):
        c = 0
        for v in ve:
            c += popcount.popcount64_table8(v)
        return c

    def count1s_l_lut8kc_64(ve):
        # Variant from: https://gist.github.com/KrzysztofCiba/5714765
        c = 0
        e = lambda x, y: x + int( bool( y ) ) << 3
        for x in ve:
            arg = int(x)
            c += sum([
                        popcount.TABLE8[ ( arg >> i ) & 255 ] 
                          for i in range(0,
                                         e( *divmod( arg.bit_length(), 8)),
                                         min(8,divmod( arg.bit_length(), 4)[0]))
                    ])
        return c
    
    TABLE16 = [0] * 2**16
    for index in range(len(TABLE16)):
        TABLE16[index] = (index & 1) + TABLE16[index >> 1]
    def popcount32_table16(v):
        return (popcount.TABLE16[ v & 0xffff ] +
                popcount.TABLE16[ v >> 16    ])
    def popcount64_table16(v):
        return (popcount.TABLE16[ v        & 0xffff] +
                popcount.TABLE16[(v >> 16) & 0xffff] +
                popcount.TABLE16[(v >> 32) & 0xffff] +
                popcount.TABLE16[(v >> 48)         ])
    def count1s_l_lut16_32(ve):
        c = 0
        for v in ve:
            c += popcount.popcount32_table16(v)
        return c
    def count1s_l_lut16_64(ve):
        c = 0
        for v in ve:
            c += popcount.popcount64_table16(v)
        return c

    def count1s_l_lut8kc_64(ve):
        # Adapted from: https://gist.github.com/KrzysztofCiba/5714765
        e = lambda x, y: x + int( bool( y ) ) << 7
        c = 0
        for x in ve:
            arg = int(x)
            c += sum([
                        popcount.TABLE16[ ( arg >> i ) & 0xffff ] 
                            for i in range(0,
                                           e(*divmod( arg.bit_length(), 16)),
                                           16)
                    ])
        return c

    TABLE16_npi = np.zeros(2**16, dtype=int) #has to be an array
    for index in range(len(TABLE16_npi)):
        TABLE16_npi[index] = (index & 1) + TABLE16_npi[index >> 1]
    def popcount32_table16_npi(v):
        return (popcount.TABLE16_npi[ v & 0xffff ] +
                popcount.TABLE16_npi[ v >> 16    ])
    def popcount64_table16_npi(v):
        return (popcount.TABLE16_npi[ v        & 0xffff ] +
                popcount.TABLE16_npi[(v >> 16) & 0xffff ] +
                popcount.TABLE16_npi[(v >> 32) & 0xffff ] +
                popcount.TABLE16_npi[(v >> 48)          ])
    def count1s_s_nplut16i_32(v):
        return popcount.popcount32_table16_npi(np.array(v)).sum()
    def count1s_l_nplut16i_32(v):
        return popcount.popcount32_table16_npi(np.array(v)).sum()
    def count1s_a_nplut16i_32(v):
        return popcount.popcount32_table16_npi(v).sum()
    def count1s_d_nplut16i_32(d):
        v = np.frombuffer(d, dtype=np.uint32)
        return popcount.popcount32_table16_npi(v).sum()
    def count1s_a_nplut16i_64(v):
        return popcount.popcount64_table16_npi(v).sum()
    def count1s_l_lut16i_32(ve):
        c = 0
        for v in ve:
            c += popcount.popcount32_table16_npi(v)
        return c
    def count1s_l_lut16i_64(ve):
        c = 0
        for v in ve:
            c += popcount.popcount64_table16_npi(v)
        return c

    TABLE16_npb = np.zeros(2**16, dtype=np.byte) #has to be an array
    for index in range(len(TABLE16_npb)):
        TABLE16_npb[index] = (index & 1) + TABLE16_npb[index >> 1]
    def popcount32_table16_npb(v):
        return (popcount.TABLE16_npb[ v & 0xffff ] +
                popcount.TABLE16_npb[ v >> 16    ])
    def popcount64_table16_npb(v):
        return (popcount.TABLE16_npb[ v        & 0xffff ] +
                popcount.TABLE16_npb[(v >> 16) & 0xffff ] +
                popcount.TABLE16_npb[(v >> 32) & 0xffff ] +
                popcount.TABLE16_npb[(v >> 48)          ])
    def count1s_a_nplut16b_32(v):
        return popcount.popcount32_table16_npb(v).sum()
    def count1s_a_nplut16b_64(v):
        return popcount.popcount64_table16_npb(v).sum()
    def count1s_l_lut16b_32(ve):
        c = 0
        for v in ve:
            c += popcount.popcount32_table16_npb(v)
        return c
    def count1s_l_lut16b_64(ve):
        c = 0
        for v in ve:
            c += popcount.popcount64_table16_npb(v)
        return c

    ########################################################################
    m1   = 0x5555555555555555
    m1b  = 0xAAAAAAAAAAAAAAAA
    m2   = 0x3333333333333333
    m2b  = 0xCCCCCCCCCCCCCCCC
    m4   = 0x0f0f0f0f0f0f0f0f
    m4b  = 0xf0f0f0f0f0f0f0f0
    m8   = 0x00ff00ff00ff00ff
    m8b  = 0xff00ff00ff00ff00
    m16  = 0x0000ffff0000ffff
    m16b = 0xffff0000ffff0000
    m32  = 0x00000000ffffffff
    h01  = 0x0101010101010101

    def count1s_s_bw1a_64(v):
        v = (v & popcount.m1 ) + ((v & popcount.m1b ) >> 1 )
        v = (v & popcount.m2 ) + ((v & popcount.m2b ) >> 2 )
        v = (v & popcount.m4 ) + ((v & popcount.m4b ) >> 4 )
        v = (v & popcount.m8 ) + ((v & popcount.m8b ) >> 8 )
        v = (v & popcount.m16) + ((v & popcount.m16b) >> 16)
        return (v & popcount.m32) + (v >> 32)

    def count1s_l_bw1a_64(ve):
        c = 0
        for v in ve:
            v = (v & popcount.m1 ) + ((v & popcount.m1b ) >> 1 )
            v = (v & popcount.m2 ) + ((v & popcount.m2b ) >> 2 )
            v = (v & popcount.m4 ) + ((v & popcount.m4b ) >> 4 )
            v = (v & popcount.m8 ) + ((v & popcount.m8b ) >> 8 )
            v = (v & popcount.m16) + ((v & popcount.m16b) >> 16)
            v = (v & popcount.m32) + (v >> 32)
            c += v
        return c

    def count1s_a_npbw1a_64(v):
        v = np.bitwise_and(v, popcount.m1 ) + np.right_shift(np.bitwise_and(v, popcount.m1b ),  1)
        v = np.bitwise_and(v, popcount.m2 ) + np.right_shift(np.bitwise_and(v, popcount.m2b ),  2)
        v = np.bitwise_and(v, popcount.m4 ) + np.right_shift(np.bitwise_and(v, popcount.m4b ),  4)
        v = np.bitwise_and(v, popcount.m8 ) + np.right_shift(np.bitwise_and(v, popcount.m8b ),  8)
        v = np.bitwise_and(v, popcount.m16) + np.right_shift(np.bitwise_and(v, popcount.m16b), 16)
        v = np.bitwise_and(v, popcount.m32) + np.right_shift(v, 32)
        return v.sum()

    def count1s_d_npbw1a_64(d):
        dtype = np.uint64
        if len(d) <= 4:
            dtype = np.uint32        
        return popcount.count1s_a_npbw1a_64(np.frombuffer(d, dtype=dtype))

    def count1s_s_bw1b_64(v):
        v = (v & popcount.m1 ) + ((v >>  1) & popcount.m1 )
        v = (v & popcount.m2 ) + ((v >>  2) & popcount.m2 )
        v = (v & popcount.m4 ) + ((v >>  4) & popcount.m4 )
        v = (v & popcount.m8 ) + ((v >>  8) & popcount.m8 )
        v = (v & popcount.m16) + ((v >> 16) & popcount.m16)
        return (v & popcount.m32) + ((v >> 32) & popcount.m32)
        
    def count1s_l_bw1b_64(ve):
        c = 0
        for v in ve:
            v = (v & popcount.m1 ) + ((v >>  1) & popcount.m1 )
            v = (v & popcount.m2 ) + ((v >>  2) & popcount.m2 )
            v = (v & popcount.m4 ) + ((v >>  4) & popcount.m4 )
            v = (v & popcount.m8 ) + ((v >>  8) & popcount.m8 )
            v = (v & popcount.m16) + ((v >> 16) & popcount.m16)
            v = (v & popcount.m32) + ((v >> 32) & popcount.m32)
            c += v
        return c

    def count1s_a_npbw1b_64(v):
        v = np.bitwise_and(v, popcount.m1 ) + np.bitwise_and(np.right_shift(v,  1), popcount.m1 )
        v = np.bitwise_and(v, popcount.m2 ) + np.bitwise_and(np.right_shift(v,  2), popcount.m2 )
        v = np.bitwise_and(v, popcount.m4 ) + np.bitwise_and(np.right_shift(v,  4), popcount.m4 )
        v = np.bitwise_and(v, popcount.m8 ) + np.bitwise_and(np.right_shift(v,  8), popcount.m8 )
        v = np.bitwise_and(v, popcount.m16) + np.bitwise_and(np.right_shift(v, 16), popcount.m16)
        v = np.bitwise_and(v, popcount.m32) + np.bitwise_and(np.right_shift(v, 32), popcount.m32)
        return v.sum()
    
    def count1s_d_npbw1b_64(d):
        dtype = np.uint64
        if len(d) <= 4:
            dtype = np.uint32        
        return popcount.count1s_a_npbw1b_64(np.frombuffer(d, dtype=dtype))

    def count1s_s_bw2_64(v):
        v -= (v >> 1) & popcount.m1
        v = (v & popcount.m2) + ((v >> 2) & popcount.m2)
        v = (v + (v >> 4)) & popcount.m4
        v += v >> 8
        v += v >> 16
        v += v >> 32
        return (v & 0x7f)

    def count1s_l_bw2_64(ve):
        c = 0
        for v in ve:
            v -= (v >> 1) & popcount.m1
            v = (v & popcount.m2) + ((v >> 2) & popcount.m2)
            v = (v + (v >> 4)) & popcount.m4
            v += v >> 8
            v += v >> 16
            v += v >> 32
            c += (v & 0x7f)
        return c

    def count1s_a_npbw2_64(v):
        v = v - np.bitwise_and(np.right_shift(v, 1), popcount.m1)
        v = np.bitwise_and(v, popcount.m2) + np.bitwise_and(np.right_shift(v, 2), popcount.m2)
        v = np.bitwise_and(v + np.right_shift(v, 4), popcount.m4)
        v += np.right_shift(v, 8)
        v += np.right_shift(v, 16)
        v = np.bitwise_and(v + np.right_shift(v, 32), 0x7f)
        return v.sum()

    def count1s_d_npbw2_64(d):
        dtype = np.uint64
        if len(d) <= 4:
            dtype = np.uint32        
        return popcount.count1s_a_npbw2_64(np.frombuffer(d, dtype=dtype))

    def count1s_s_bw3_64(v):
        v -= (v >> 1) & popcount.m1
        v = (v & popcount.m2) + ((v >> 2) & popcount.m2)
        v = (v + (v >> 4)) & popcount.m4
        return (v * popcount.h01 & 0xffffffffffffffff) >> 56

    def count1s_l_bw3_64(ve):
        c = 0
        for v in ve:
            v -= (v >> 1) & popcount.m1
            v = (v & popcount.m2) + ((v >> 2) & popcount.m2)
            v = (v + (v >> 4)) & popcount.m4
            v = (v * popcount.h01 & 0xffffffffffffffff) >> 56
            c += v
        return c

    def count1s_a_npbw3_64(v):
        v = v - np.bitwise_and(np.right_shift(v, 1), popcount.m1)
        v = np.bitwise_and(v, popcount.m2) + np.bitwise_and(np.right_shift(v, 2), popcount.m2)
        v = np.bitwise_and(v + np.right_shift(v, 4), popcount.m4)
        v = np.right_shift(v * popcount.h01, 56)
        return v.sum()

    def count1s_d_npbw3_64(d):
        dtype = np.uint64
        if len(d) <= 4:
            dtype = np.uint32        
        return popcount.count1s_a_npbw3_64(np.frombuffer(d, dtype=dtype))

    ma = 0x01001001001001
    mb = 0x84210842108421

    def count1s_s_bw64_32(v):
        return ((((v & 0xfff) * popcount.ma & popcount.mb) % 0x1f)
              + ((((v & 0xfff000) >> 12) * popcount.ma & popcount.mb) % 0x1f)
              + (((v >> 24) * popcount.ma & popcount.mb) % 0x1f))

    def count1s_l_bw64_32(ve):
        c = 0
        for v in ve:
            c += ((((v & 0xfff) * popcount.ma & popcount.mb) % 0x1f)
                  + ((((v & 0xfff000) >> 12) * popcount.ma & popcount.mb) % 0x1f)
                  + (((v >> 24) * popcount.ma & popcount.mb) % 0x1f))
        return c

    def count1s_a_npbw64_32(v):
        c = (  np.mod(np.bitwise_and(np.bitwise_and(v, 0xfff) * popcount.ma, popcount.mb), 0x1f)
             + np.mod(np.bitwise_and(np.right_shift(np.bitwise_and(v, 0xfff000), 12) * popcount.ma, popcount.mb), 0x1f)
             + np.mod(np.bitwise_and(np.right_shift(v, 24) * popcount.ma, popcount.mb), 0x1f)
             )
        return c.sum()

    ########################################################################
    def count1s_s_gmpy2(v):
        return gmpy2.popcount(v*mpz(1))

    def count1s_a_gmpy2(v):
        v2 = v*mpz(1)
        return sum(gmpy2.popcount(a) for a in v2)

    ########################################################################
    def test_helper_start(a, d, dadi):
        fmt = '' # formatstring for unpack
        dlen = 0 # real bytelen for calcs
        v = None # value to test
        orig_v = None # original value before testing
        typ = None # type of func to test

        # Can't test too short array
        if a.endswith('_64') and dadi < 8: return None
        if a.endswith('_32') or dadi < 8:
            fmt = 'I'
            dlen = 4
        else:
            fmt = 'Q'
            dlen = 8
        if a.startswith('count1s_s'): # scalar
            if dadi >= 16: return None
            v = struct.unpack(fmt, d[:dlen])[0]
            orig_v = v
            typ = 's'
        elif a.startswith('count1s_l'): # list
            fmts = '{:d}{:s}'.format(divmod(len(d), dlen)[0], fmt)
            v = list(struct.unpack(fmts, d))
            orig_v = copy.copy(v)
            dlen = dadi
            typ = 'l'
        elif a.startswith('count1s_a'): # np.array
            if dlen == 4:
                v = np.frombuffer(d, dtype=np.uint32)
            elif dlen == 8:
                v = np.frombuffer(d, dtype=np.uint64)
            else: raise
            orig_v = np.copy(v)
            dlen = dadi
            typ = 'a'
        elif a.startswith('count1s_d'): # bytearray
            v = d
            orig_v = d
            dlen = dadi
            typ = 'd'
        else: raise
        return (dlen, v, orig_v, typ)

    def test_helper_check(a, dlen, v, orig_v, typ):
        # Check if data has survived
        ok = True
        if typ == 'd':
            if len(v) != len(orig_v):
                print('d: len')
                ok = False
            elif not v.startswith(orig_v):
                print("d: startswith")
                ok = False
        elif typ == 'a':
            if len(v.tolist()) != len(orig_v.tolist()):
                print("a: len")
                ok = False
            else:
                for x in range(len(v.tolist())):
                    if v[x] != orig_v[x]:
                        print("a: item", v[x], orig_v[x])
                        ok = False
                        break                
        elif typ == 'l':
            if len(v) != len(orig_v):
                print('l: len')
                ok = False
            else:
                for x in range(len(v)):
                    if v[x] != orig_v[x]:
                        print("l: item", v[x], orig_v[x])
                        ok = False
                        break
        elif typ == 's':
            if v != orig_v:
                print('s: !=')
                ok = False
        else:
            print("Could not check:", a, dlen)

        if not ok:
            print("ERROR no data ok:", typ, a, dlen, v, orig_v)

popcount_methods = [ a for a in dir(popcount) if a.startswith('count1s') ]
popcount_methods.sort()
popcount_methods


Out[3]:
['count1s_a_gmpy2',
 'count1s_a_npbw1a_64',
 'count1s_a_npbw1b_64',
 'count1s_a_npbw2_64',
 'count1s_a_npbw3_64',
 'count1s_a_npbw64_32',
 'count1s_a_nplut16b_32',
 'count1s_a_nplut16b_64',
 'count1s_a_nplut16i_32',
 'count1s_a_nplut16i_64',
 'count1s_ai_naive',
 'count1s_ai_pythonic',
 'count1s_ai_zero',
 'count1s_ais_naive',
 'count1s_ais_pythonic',
 'count1s_al_naive',
 'count1s_al_pythonic',
 'count1s_al_zero',
 'count1s_als_naive',
 'count1s_als_pythonic',
 'count1s_d_naive',
 'count1s_d_npbw1a_64',
 'count1s_d_npbw1b_64',
 'count1s_d_npbw2_64',
 'count1s_d_npbw3_64',
 'count1s_d_nplut16i_32',
 'count1s_di_lut8_32',
 'count1s_di_pythonic',
 'count1s_dl_lut8_32',
 'count1s_dl_pythonic',
 'count1s_du_zero',
 'count1s_l1_lut8_32',
 'count1s_l2_lut8_32',
 'count1s_l_bw1a_64',
 'count1s_l_bw1b_64',
 'count1s_l_bw2_64',
 'count1s_l_bw3_64',
 'count1s_l_bw64_32',
 'count1s_l_lut16_32',
 'count1s_l_lut16_64',
 'count1s_l_lut16b_32',
 'count1s_l_lut16b_64',
 'count1s_l_lut16i_32',
 'count1s_l_lut16i_64',
 'count1s_l_lut8_32',
 'count1s_l_lut8_64',
 'count1s_l_lut8kc_64',
 'count1s_l_naive',
 'count1s_l_nplut16i_32',
 'count1s_l_pythonic',
 'count1s_l_zero',
 'count1s_ls_naive',
 'count1s_ls_pythonic',
 'count1s_s_bw1a_64',
 'count1s_s_bw1b_64',
 'count1s_s_bw2_64',
 'count1s_s_bw3_64',
 'count1s_s_bw64_32',
 'count1s_s_gmpy2',
 'count1s_s_lut8_32',
 'count1s_s_lut8_64',
 'count1s_s_naive',
 'count1s_s_nplut16i_32',
 'count1s_s_pythonic',
 'count1s_s_zero']

In [4]:
da = (4, 8, 16)
numa = (0, 1, 255)
ra = (0, 1, 8) # result len per item
for di in range(len(da)):
    for i in range(len(numa)):
        num = numa[i]
        d = bytearray([num for i in range(da[di])])
        l = []
        for a in popcount_methods:
            start = popcount.test_helper_start(a, d, da[di])
            if start is None: continue
            (dlen, v, orig_v, typ) = start
            r = getattr(popcount, a)(v)
            popcount.test_helper_check(a, dlen, v, orig_v, typ)
            l.append({a: r})
            if r != ra[i]*dlen:
                print("ERROR:", a, dlen, '*', numa[i], ',', r, "!=", ra[i]*dlen)
        #print(num, l)

Run through different vector sizes.


In [5]:
da = (4,      8,      16,     1024,  4096,  409600)
for di in range(len(da)):
    random.seed(1)
    d = bytearray([random.randint(0,255) for i in range(da[di])])
    repeat = 3
    l = []
    res = {4: 0, 8: 0}
    for a in popcount_methods:
        start = popcount.test_helper_start(a, d, da[di])
        if start is None: continue
        (dlen, v, orig_v, typ) = start

        r = getattr(popcount, a)(v)
        # Need to take into account scalars
        if dlen == 4: ri = 4
        else: ri = 8
        if res[ri] == 0: res[ri] = r
        if res[ri] != r:
            print("ERROR:", a, di, dlen, r, "!=", res[ri])
        l.append({a: r})
    #print(da[di], l)

In [ ]:
unit = [['Mi', 1024*1024], ['ki', 1024], ['', 1]]
ro15 = [['.0f', 150], ['.1f', 15], ['.2f',1.5]]
test_target = 2.0 # seconds, real test can be about twice as much
repeat = 3
max_repeats = 30
for di in range(len(da)):
    random.seed(1)
    d = bytearray([random.randint(0,255) for i in range(da[di])])
    for a in popcount_methods:
        if a.endswith('_64') and da[di] < 8: continue
        r = [0.0001]
        number = 1
        total_number = 0
        last_run_repeat = 0
        while min(r) < (test_target / 2.0):
            if min(r) < (test_target / 2.0 / 10.0):
                number *= 10
            else:
                number_new = int(number * test_target / min(r))
                if number_new <= (number*1.25):
                    number *= 1.25
                else:
                    number = number_new

            start = popcount.test_helper_start(a, d, da[di])
            if start is None: continue
            (dlen, v, orig_v, typ) = start

            # Try to get minimum by running tests again if there is too big values
            # But only run tests limited times
            # And only if this is last run, aka min(r) is big enough
            r = []
            last_run_repeats = 0
            max_r = None
            while len(r) < repeat and max_repeats > last_run_repeats:
                r += timeit.repeat(partial(getattr(popcount, a), v), number=number, repeat=repeat)
                total_number += number * repeat
                last_run_repeats += repeat
                if min(r) < (test_target / 2.0):
                    break
                while max(r) > (min(r) * 1.025):
                    if max_r is None or max(r) < max_r:
                        max_r = max(r)
                    r.remove(max(r))

            popcount.test_helper_check(a, dlen, v, orig_v, typ)
            
        ra = dlen * number / min(r)
        fmt = ''
        for u in range(len(unit)):
            for ro in range(len(ro15)):
                if fmt == '' and ra > unit[u][1] * ro15[ro][1]:
                    fmt = '{0:' + ro15[ro][0] + '}' + unit[u][0] + 'B/s'
                    ra = fmt.format(ra/unit[u][1])
        if fmt == '':
            ra = '{0:.2f}B/s'.format(ra)
        out = '{:24s} {:6d} {:>12s} {:2d} {:9d} {:10d}'.format(a, da[di], ra, last_run_repeats, number, total_number)
        r.sort()
        for re in r:
            out += ' {0:.03f}'.format(re)
        if max_r is not None:
            out += ' max:{0:.03f}'.format(max_r)
        print(out)


count1s_a_gmpy2               4     175kiB/s  9    100004     933366 2.234 2.237 2.250 max:2.303
count1s_a_npbw64_32           4    66.2kiB/s  3     35384     139482 2.089 2.117 2.140
count1s_a_nplut16b_32         4   139.1kiB/s  3     75201     258933 2.112 2.124 2.142
count1s_a_nplut16i_32         4     186kiB/s 30     77427    2356140 1.622 1.635 max:1.688
count1s_ai_naive              4     388kiB/s 15    100000    1533330 1.008 1.017 1.026 max:1.035
count1s_ai_pythonic           4    1266kiB/s 18    591825   10986180 1.827 1.828 1.829 1.851 max:1.880
count1s_ai_zero               4     951kiB/s 30    475255   14590980 1.952 max:2.012
count1s_ais_naive             4     371kiB/s  6    196449    1212024 2.069 2.081 2.087 max:2.142
count1s_ais_pythonic          4     863kiB/s 30    427552   13159890 1.935 1.973 max:1.993
count1s_al_naive              4     512kiB/s  3    273058    1152504 2.082 2.125 2.133
count1s_al_pythonic           4    2.68MiB/s 30   1260198   38139270 1.794 max:1.867
count1s_al_zero               4    1.75MiB/s 30    876036   26614410 1.909 1.936 max:1.969
count1s_als_naive             4     518kiB/s 21    238345    5338575 1.799 1.824 1.830 max:1.866
count1s_als_pythonic          4    1.51MiB/s 30    747316   22752810 1.893 1.920 max:1.949
count1s_d_naive               4     367kiB/s  9    194687    1785513 2.071 2.082 2.086 2.116 max:2.176
count1s_d_nplut16i_32         4   123.0kiB/s  9     67107     637293 2.132 2.141 2.156 max:2.186
count1s_di_lut8_32            4     152kiB/s  6     75528     486498 1.940 1.951 1.971 max:2.040
count1s_di_pythonic           4     584kiB/s 30    288050    8974830 1.927 max:1.984
count1s_dl_lut8_32            4     715kiB/s 12    366752    4734354 2.004 2.020 2.049 max:2.060
count1s_dl_pythonic           4     775kiB/s 21    360423    7902213 1.815 1.831 1.832 max:1.869
count1s_du_zero               4     956kiB/s 30    397729   12265200 1.626 1.659 max:1.702
count1s_l1_lut8_32            4    1411kiB/s  9    678906    6443484 1.880 1.881 1.882 1.892 max:1.978
count1s_l2_lut8_32            4    1.96MiB/s  6   1118830    7046310 2.178 2.230 2.231 max:2.245
count1s_l_bw64_32             4    1.83MiB/s 30    951220   28869930 1.980 max:2.035
count1s_l_lut16_32            4    3.15MiB/s 30   1981588   59780970 2.401 2.402 2.407 max:2.462
count1s_l_lut16b_32           4     585kiB/s  6    305596    2166906 2.041 2.051 2.080 max:2.105
count1s_l_lut16i_32           4    1.75MiB/s  9   1029052    9594798 2.242 2.272 2.276 2.281 max:2.356
count1s_l_lut8_32             4    2.00MiB/s  9   1084166   10090824 2.069 2.093 2.095 2.101 max:2.124
count1s_l_naive               4     565kiB/s  6    300255    2134860 2.075 2.076 2.095 2.097 max:2.131
count1s_l_nplut16i_32         4     193kiB/s  6    101855     644460 2.061 2.100 2.105 2.112 max:2.198
count1s_l_pythonic            4    4.02MiB/s 30   1727298   52152270 1.641 max:1.689
count1s_l_zero                4    1.67MiB/s  3    891572    3008046 2.034 2.037 2.054
count1s_ls_naive              4     466kiB/s  3    248620    1079190 2.084 2.086 2.125
count1s_ls_pythonic           4    1.63MiB/s 12    885982   10965114 2.072 2.106 2.113 max:2.124
count1s_s_bw64_32             4    2.06MiB/s 15   1181080   18049530 2.189 2.210 2.219 max:2.249
count1s_s_gmpy2               4    5.60MiB/s  6   2827248   20296818 1.926 1.953 1.961 max:2.035
count1s_s_lut8_32             4    2.48MiB/s 12   1168263   14352486 1.799 1.807 1.829 max:1.875
count1s_s_naive               4     589kiB/s  9    292403    2964957 1.941 1.954 1.967 max:1.999
count1s_s_nplut16i_32         4     237kiB/s 24    121000    2937330 1.998 2.015 2.024 max:2.051
count1s_s_pythonic            4    4.23MiB/s  6   2303295   17153100 2.076 2.080 2.086 max:2.162
count1s_s_zero                4    1.92MiB/s 12   1019382   12565914 2.028 2.038 2.053 max:2.108
count1s_a_gmpy2               8     351kiB/s  9     82063     771897 1.825 1.834 1.838 max:1.909
count1s_a_npbw1a_64           8   108.0kiB/s  9     28176     286914 2.038 2.058 2.083 max:2.118
count1s_a_npbw1b_64           8   107.9kiB/s  9     26972     276078 1.954 1.971 1.992 max:2.008
count1s_a_npbw2_64            8     153kiB/s  3     41204     156942 2.106 2.133 2.137
count1s_a_npbw3_64            8     196kiB/s 30     47961    1472160 1.911 1.938 max:1.959
count1s_a_npbw64_32           8   142.0kiB/s  3     35955     141195 1.979 2.021 2.022
count1s_a_nplut16b_32         8     298kiB/s  6     74239     478764 1.944 1.986 1.989 max:2.106
count1s_a_nplut16b_64         8     181kiB/s 12     38796     498882 1.675 1.703 1.708 max:1.732
count1s_a_nplut16i_32         8     340kiB/s 21     89544    1913754 2.055 2.079 2.088 max:2.127
count1s_a_nplut16i_64         8     174kiB/s  3     50806     185748 2.283 2.290 2.326
count1s_ai_naive              8     474kiB/s 18    112295    2054640 1.850 1.893 1.894 max:1.897
count1s_ai_pythonic           8    1.92MiB/s  6    546097    3609912 2.172 2.187 2.223 max:2.235
count1s_ai_zero               8     872kiB/s  9    209541    2219199 1.878 1.913 1.918 max:1.925
count1s_ais_naive             8     413kiB/s 27    111238    3036756 2.102 2.122 2.130 max:2.157
count1s_ais_pythonic          8    1327kiB/s  3    344252    1366086 2.027 2.057 2.069
count1s_al_naive              8     532kiB/s  3    136350     442380 2.004 2.027 2.039
count1s_al_pythonic           8    3.99MiB/s 30   1012348   30703770 1.935 1.983 max:2.012
count1s_al_zero               8    1369kiB/s 12    305333    3997326 1.743 1.750 1.759 max:1.808
count1s_als_naive             8     480kiB/s  9    133241    1232499 2.168 2.186 2.210 max:2.255
count1s_als_pythonic          8    2.33MiB/s 24    599898   14730882 1.963 1.964 1.964 1.967 max:2.036
count1s_d_naive               8     432kiB/s  9    116191    1079049 2.101 2.107 2.146 max:2.156
count1s_d_npbw1a_64           8   103.2kiB/s  9     27163     277797 2.056 2.069 2.107 max:2.109
count1s_d_npbw1b_64           8   100.7kiB/s 12     26695     353670 2.071 2.096 2.111 max:2.125
count1s_d_npbw2_64            8     171kiB/s 30     35270    1091430 1.607 max:1.660
count1s_d_npbw3_64            8     159kiB/s  3     42255     160095 2.077 2.102 2.122
count1s_d_nplut16i_32         8     245kiB/s  3     66324     232302 2.114 2.124 2.157
count1s_di_lut8_32            8     195kiB/s 27     50022    1383924 1.999 2.026 2.039 max:2.058
count1s_di_pythonic           8    1150kiB/s 12    292202    3839754 1.986 2.013 2.019 max:2.072
count1s_dl_lut8_32            8    1056kiB/s  3    268522    1138896 1.987 2.014 2.028
count1s_dl_pythonic           8    1.51MiB/s  9    378385    3738795 1.914 1.916 1.930 max:2.034
count1s_du_zero               8     891kiB/s 12    223400    3014130 1.959 1.969 1.984 2.007 max:2.055
count1s_l1_lut8_32            8    1.74MiB/s  3    534644    1937262 2.348 2.358 2.393
count1s_l2_lut8_32            8    2.34MiB/s 18    618265   11462100 2.015 2.038 2.051 max:2.072
count1s_l_bw1a_64             8    2.51MiB/s  9    646418    6151092 1.966 1.972 1.994 2.011 max:2.019
count1s_l_bw1b_64             8    2.45MiB/s  6    664660    4321290 2.067 2.099 2.103 2.107 max:2.131
count1s_l_bw2_64              8    3.65MiB/s 18    896479   16469952 1.875 1.875 1.918 max:1.938
count1s_l_bw3_64              8    3.77MiB/s 12   1031111   12706662 2.085 2.115 2.117 2.124 max:2.149
count1s_l_bw64_32             8    1.99MiB/s  6    560332    3695322 2.144 2.145 2.161 max:2.228
count1s_l_lut16_32            8    3.80MiB/s  6   1033861    6536496 2.074 2.084 2.113 max:2.141
count1s_l_lut16_64            8    4.19MiB/s  9   1071882    9980268 1.953 1.958 1.998 max:2.034
count1s_l_lut16b_32           8     916kiB/s  6    241551    1782636 2.059 2.065 2.094 max:2.139
count1s_l_lut16b_64           8    1171kiB/s  9    276743    2824017 1.847 1.861 1.881 max:1.916
count1s_l_lut16i_32           8    2.61MiB/s 30    672908   20520570 1.966 max:2.279
count1s_l_lut16i_64           8    2.21MiB/s  3    610531    2164923 2.108 2.112 2.140
count1s_l_lut8_32             8    2.36MiB/s  9    629931    6002709 2.037 2.039 2.062 max:2.091
count1s_l_lut8_64             8    2.25MiB/s  3    634812    2237766 2.152 2.160 2.170
count1s_l_lut8kc_64           8     499kiB/s 12    126494    1551258 1.980 2.021 2.022 max:2.035
count1s_l_naive               8     568kiB/s 21    143960    3056490 1.982 1.989 2.005 2.029 max:2.035

In [ ]: