In [1]:
from collections import defaultdict
from multiprocessing import Pool, TimeoutError, Manager
from itertools import repeat
from datetime import datetime

In [2]:
def gatherStats(file):    
    infoMap = defaultdict()
    with open(file, 'r') as f:
        while (f.readline()[0] == '@'):
            continue
        for l in f:
            cols = l.split('\t')
            readParts = cols[0].split(';')
            rid = readParts[0].split('/')[0].split('read')[1]
            tid = readParts[0].split('/')[1]
            mate1pos = readParts[1].split(':')[1].split('-')[0]
            mate2pos = readParts[2].split(':')[1].split('-')[0]

            aligned_tid = cols[2]
            isPrimary = tid == aligned_tid
            aligned_rpos = cols[3]
            isExact = (mate1pos == aligned_rpos or mate2pos == str(int(aligned_rpos)+1))
            if rid not in infoMap:
                infoMap[rid] = [0,0,0,0]
            infoMap[rid][0]+=1
            if (isPrimary):
                infoMap[rid][1]+=1
            if (isExact):
                infoMap[rid][2]+=1            
    return infoMap

In [5]:
def calcLineStat(args):
    infoMap = args[0]
    lines = args[1]
    """Make a dict out of the parsed, supplied lines"""
    for l in lines.split('\n'):
        cols = l.split('\t')
        readParts = cols[0].split(';')
        rid = readParts[0].split('/')[0].split('read')[1]
        tid = readParts[0].split('/')[1]
        mate1pos = readParts[1].split(':')[1].split('-')[0]
        mate2pos = readParts[2].split(':')[1].split('-')[0]

        aligned_tid = cols[3]
        isPrimary1 = (cols[1] == '99')
        isPrimary2 = (cols[1] == '147')
        aligned_rpos = cols[3]
        isExact = (mate1pos == aligned_rpos or mate2pos == str(int(aligned_rpos)+1))
        if rid not in infoMap:
            infoMap[rid] = [0,0,0,0] # total count, mate1 is primary, mate2 is primary, correct position
        infoMap[rid][0]+=1
        if (isPrimary1):
            infoMap[rid][1]+=1
        elif (isPrimary2):
            infoMap[rid][2]+=1
        if isExact:
            infoMap[rid][3]+=1
    return infoMap

def para_gatherStats(file, numthreads=2):
    numlines = 100

    lines = open(file).readlines()

    # create the process pool
    pool = Pool(processes=numthreads)

    resMap = Manager().dict()
    # map the list of lines into a list of result dicts
    result_list = pool.map(calcLineStat, \
        (zip(repeat(resMap), lines[line:line+numlines for line in range(0,len(lines),numlines)] )) )

    # reduce the result dicts into a single dict
    result = {}
    map(result.update, result_list)
    return result


  File "<ipython-input-5-30e40f5d95ea>", line 39
    result_list = pool.map(calcLineStat,         (zip(repeat(resMap), lines[line:line+numlines for line in range(0,len(lines),numlines)] )) )
                                                                                                 ^
SyntaxError: invalid syntax

In [11]:
puffMapPara=para_gatherStats('/mnt/scratch6/pufferfish_experiment/small_test/samtest.sam', 2)


---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/fatemeh/miniconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/fatemeh/miniconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
TypeError: calcLineStat() missing 1 required positional argument: 'lines'
"""

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
<ipython-input-11-59deeedf15bb> in <module>()
----> 1 puffMapPara=para_gatherStats('/mnt/scratch6/pufferfish_experiment/small_test/samtest.sam', 2)

<ipython-input-10-035199b52981> in para_gatherStats(file, numthreads)
     36     # map the list of lines into a list of result dicts
     37     result_list = pool.map(calcLineStat, 
---> 38         (resMap, [lines[line:line+numlines] for line in range(0,len(lines),numlines)] ) )
     39 
     40     # reduce the result dicts into a single dict

~/miniconda3/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
    258         in a list that is returned.
    259         '''
--> 260         return self._map_async(func, iterable, mapstar, chunksize).get()
    261 
    262     def starmap(self, func, iterable, chunksize=None):

~/miniconda3/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    606             return self._value
    607         else:
--> 608             raise self._value
    609 
    610     def _set(self, i, obj):

TypeError: calcLineStat() missing 1 required positional argument: 'lines'

In [9]:
st = datetime.now()
#kalMap = gatherStats('/mnt/scratch2/kal_out.sam')
#rapMap = gatherStats('/mnt/scratch2/rapmap_out.sam')
puffMap = gatherStats('/mnt/scratch6/pufferfish_experiment/small_test/puff_out.sam')#/mnt/scratch2/puff_out.sam')
print('took {}'.format(datetime.now()-st))


took 0:06:41.348663

In [10]:
print('actual # of reads: 33,464,798')
print(len(puffMap))
#print(len(rapMap))
#print(len(kalMap))


actual # of reads: 33,464,798
33428384

In [11]:
maxv = 0
maxk = ''
correctlyMapped = 0
correctlyMappedCnt = 0
isExact = 0
isExactCnt = 0
cntDist = []
for k, v in puffMap.items():
    cntDist += [v[0]]
    if (v[0] > maxv):
        maxv = v[0]
        maxk = k
    if (v[1] > 0):
        correctlyMapped += 1
        correctlyMappedCnt += v[1]
    if (v[2] > 0):
        isExact += 1
        isExactCnt += v[2]
print('maximum multimapping ==> key:{}, val:{}'.format(maxk,maxv))
print('correctlyMapped:{}, cnt:{}'.format(correctlyMapped, correctlyMappedCnt))
print('isExact:{}, cnt:{}'.format(isExact, isExactCnt))


maximum multimapping ==> key:32279386, val:456
correctlyMapped:33202776, cnt:66455019
isExact:32507683, cnt:57694355

In [12]:
from numpy import histogram
a = histogram(cntDist)

In [14]:
print(a)
for j in range(len(a[0])):
    print('{}\t{}'.format(a[1][j], a[0][j]))


(array([33427983,       96,       37,       50,       88,       70,
             37,       14,        5,        4]), array([   1. ,   46.5,   92. ,  137.5,  183. ,  228.5,  274. ,  319.5,
        365. ,  410.5,  456. ]))
1.0	33427983
46.5	96
92.0	37
137.5	50
183.0	88
228.5	70
274.0	37
319.5	14
365.0	5
410.5	4

In [15]:
good_cnt = 0
bad_cnt = 0
for k,v in readMap.items():
    if (v[0] > 10):
        bad_cnt+=1
        #print('{}:  {},{},{},{}'.format(k, v[0], v[1], v[2], v[3]))
    else:
        good_cnt+=1
print(good_cnt)
print(bad_cnt)


33213471
218234

In [ ]:
def f(x):
    return x*x

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(f, [1, 2, 3]))