In [1]:
from collections import defaultdict
from multiprocessing import Pool, TimeoutError, Manager
from itertools import repeat
from datetime import datetime
In [2]:
def gatherStats(file):
infoMap = defaultdict()
with open(file, 'r') as f:
while (f.readline()[0] == '@'):
continue
for l in f:
cols = l.split('\t')
readParts = cols[0].split(';')
rid = readParts[0].split('/')[0].split('read')[1]
tid = readParts[0].split('/')[1]
mate1pos = readParts[1].split(':')[1].split('-')[0]
mate2pos = readParts[2].split(':')[1].split('-')[0]
aligned_tid = cols[2]
isPrimary = tid == aligned_tid
aligned_rpos = cols[3]
isExact = (mate1pos == aligned_rpos or mate2pos == str(int(aligned_rpos)+1))
if rid not in infoMap:
infoMap[rid] = [0,0,0,0]
infoMap[rid][0]+=1
if (isPrimary):
infoMap[rid][1]+=1
if (isExact):
infoMap[rid][2]+=1
return infoMap
In [5]:
def calcLineStat(args):
infoMap = args[0]
lines = args[1]
"""Make a dict out of the parsed, supplied lines"""
for l in lines.split('\n'):
cols = l.split('\t')
readParts = cols[0].split(';')
rid = readParts[0].split('/')[0].split('read')[1]
tid = readParts[0].split('/')[1]
mate1pos = readParts[1].split(':')[1].split('-')[0]
mate2pos = readParts[2].split(':')[1].split('-')[0]
aligned_tid = cols[3]
isPrimary1 = (cols[1] == '99')
isPrimary2 = (cols[1] == '147')
aligned_rpos = cols[3]
isExact = (mate1pos == aligned_rpos or mate2pos == str(int(aligned_rpos)+1))
if rid not in infoMap:
infoMap[rid] = [0,0,0,0] # total count, mate1 is primary, mate2 is primary, correct position
infoMap[rid][0]+=1
if (isPrimary1):
infoMap[rid][1]+=1
elif (isPrimary2):
infoMap[rid][2]+=1
if isExact:
infoMap[rid][3]+=1
return infoMap
def para_gatherStats(file, numthreads=2):
numlines = 100
lines = open(file).readlines()
# create the process pool
pool = Pool(processes=numthreads)
resMap = Manager().dict()
# map the list of lines into a list of result dicts
result_list = pool.map(calcLineStat, \
(zip(repeat(resMap), lines[line:line+numlines for line in range(0,len(lines),numlines)] )) )
# reduce the result dicts into a single dict
result = {}
map(result.update, result_list)
return result
In [11]:
puffMapPara=para_gatherStats('/mnt/scratch6/pufferfish_experiment/small_test/samtest.sam', 2)
In [9]:
st = datetime.now()
#kalMap = gatherStats('/mnt/scratch2/kal_out.sam')
#rapMap = gatherStats('/mnt/scratch2/rapmap_out.sam')
puffMap = gatherStats('/mnt/scratch6/pufferfish_experiment/small_test/puff_out.sam')#/mnt/scratch2/puff_out.sam')
print('took {}'.format(datetime.now()-st))
In [10]:
print('actual # of reads: 33,464,798')
print(len(puffMap))
#print(len(rapMap))
#print(len(kalMap))
In [11]:
maxv = 0
maxk = ''
correctlyMapped = 0
correctlyMappedCnt = 0
isExact = 0
isExactCnt = 0
cntDist = []
for k, v in puffMap.items():
cntDist += [v[0]]
if (v[0] > maxv):
maxv = v[0]
maxk = k
if (v[1] > 0):
correctlyMapped += 1
correctlyMappedCnt += v[1]
if (v[2] > 0):
isExact += 1
isExactCnt += v[2]
print('maximum multimapping ==> key:{}, val:{}'.format(maxk,maxv))
print('correctlyMapped:{}, cnt:{}'.format(correctlyMapped, correctlyMappedCnt))
print('isExact:{}, cnt:{}'.format(isExact, isExactCnt))
In [12]:
from numpy import histogram
a = histogram(cntDist)
In [14]:
print(a)
for j in range(len(a[0])):
print('{}\t{}'.format(a[1][j], a[0][j]))
In [15]:
good_cnt = 0
bad_cnt = 0
for k,v in readMap.items():
if (v[0] > 10):
bad_cnt+=1
#print('{}: {},{},{},{}'.format(k, v[0], v[1], v[2], v[3]))
else:
good_cnt+=1
print(good_cnt)
print(bad_cnt)
In [ ]:
def f(x):
return x*x
if __name__ == '__main__':
with Pool(5) as p:
print(p.map(f, [1, 2, 3]))