Count all k-mers (contiguous or gapped) occurring in a string


In [1]:
from collections import defaultdict
%load_ext cython

In [2]:
def count_kmers(string, k, counter=None, gap=0):
    """
    Tally the k-mers found in ``string``.

    With ``gap`` equal to zero every key is the contiguous slice
    ``string[i:i+k]``. With a non-zero ``gap`` every key is the character
    ``string[i]`` followed by the ``k-1`` characters that begin ``gap+1``
    positions after ``i`` (so ``gap`` characters right after the first one
    are skipped).

    Parameters
    ----------
    string : sequence to scan.
    k : number of characters in each key.
    counter : optional mapping to accumulate into; it is updated in place
        and returned. A new ``defaultdict(int)`` is created when omitted.
    gap : number of characters skipped after the first character.

    Returns
    -------
    The mapping from k-mer to number of occurrences.
    """
    counts = defaultdict(int) if counter is None else counter
    # Each window covers k + gap characters, hence this many start positions.
    n_windows = len(string) - k - gap + 1
    if gap:
        for start in range(n_windows):
            tail_begin = start + gap + 1
            tail_end = start + k + gap
            counts[string[start] + string[tail_begin:tail_end]] += 1
    else:
        for start in range(n_windows):
            counts[string[start:start + k]] += 1
    return counts

In [3]:
%%timeit
# Benchmark the pure-Python count_kmers on a sample sequence with k=2 and gap=1.
count_kmers('ABCDAABDPGHKLRJKJIHJIJIJIOJIJIOJIOJIOJIOJIOIOPLNVRRRRPPYUIHJSBNDJKHASFDUIHDJIOSAJKDFHSUAFYDIUASD', 2, gap=1)


27 µs ± 80.5 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [4]:
%%cython
from collections import defaultdict
def count_kmers_cython(str string, int k, counter=None, int gap=0):
    """
    Count occurrence of kmers in a given string (Cython-compiled version).

    With ``gap`` equal to zero every key is the contiguous slice
    ``string[i:i+k]``. With a non-zero ``gap`` every key is the character
    ``string[i]`` followed by the ``k-1`` characters that begin ``gap+1``
    positions after ``i`` (so ``gap`` characters right after the first one
    are skipped).

    Parameters
    ----------
    string : str to scan.
    k : number of characters in each key.
    counter : optional mapping to accumulate into; it is updated in place
        and returned. A new ``defaultdict(int)`` is created when omitted.
    gap : number of characters skipped after the first character.

    Returns
    -------
    The mapping from k-mer to number of occurrences.
    """
    # Use Py_ssize_t (the C type CPython uses for lengths and indices) rather
    # than int, so the length and loop index are not truncated for strings
    # longer than a C int can represent.
    cdef Py_ssize_t i
    cdef Py_ssize_t N
    if counter is None:
        counter = defaultdict(int)
    N = len(string)
    # Each window covers k + gap characters, hence N-k-gap+1 start positions.
    for i in range(N-k-gap+1):
        if gap:
            counter[string[i]+string[i+gap+1:i+k+gap]] += 1
        else:
            counter[string[i:i+k]] += 1
    return counter

In [5]:
%%timeit
# Benchmark the Cython-compiled count_kmers_cython on a sample sequence with k=2 and gap=1.
count_kmers_cython('ABCDAABDPGHKLRJKJIHJIJIJIOJIJIOJIOJIOJIOJIOIOPLNVRRRRPPYUIHJSBNDJKHASFDUIHDJIOSAJKDFHSUAFYDIUASD', 2, gap=1)


15.1 µs ± 219 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

In [ ]: