In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import time
import matplotlib.pyplot as plt
import cytoolz.curried as cyt
from bokeh.plotting import figure, show, output_file
In [5]:
import mitty.analysis.bamtoolz as bamtoolz
import mitty.analysis.bamfilters as mab
import mitty.analysis.plots as mapl
In [4]:
# import logging
# FORMAT = "[%(filename)s:%(lineno)s - %(funcName)20s() ] %(message)s"
# logging.basicConfig(format=FORMAT, level=logging.DEBUG)
In [6]:
# fname = '../../../mitty-demo-data/filter-demo-data/HG00119-bwa.bam'
# scar_fname = '../../../mitty-demo-data/generating-reads/HG00119-truncated-reads-corrupt-lq.txt'
# 180255 read pairs
fname = '../../../mitty-demo-data/alignment-accuracy/HG00119-bwa.bam'
scar_fname = '../../../mitty-demo-data/generating-reads/HG00119-reads-corrupt-lq.txt'
In [7]:
# Build the analysis pipeline: qname parsing -> d_err computation ->
# categorization -> counting.
max_d = 200

# Curried stage functions; each consumes and re-yields the read stream
n1 = mab.parse_read_qnames(scar_fname)  # attaches truth fields parsed from the qname sidecar file
n2 = mab.compute_derr(max_d=max_d)      # annotates each mate with 'd_err'

# d_err category filters; max_d + 1 / max_d + 2 are the 'WC' / 'UM' sentinel values
f_dict = {
  'd = 0':       lambda m: m['d_err'] == 0,
  '0 < d <= 50': lambda m: 0 < m['d_err'] <= 50,
  '50 < d':      lambda m: 50 < m['d_err'] <= max_d,
  'WC':          lambda m: m['d_err'] == max_d + 1,
  'UM':          lambda m: m['d_err'] == max_d + 2,
}
n3 = mab.categorize_reads(f_dict)
n4 = mab.count_reads

pipeline = [n1, n2, n3, n4]
In [8]:
%%time
# Single-threaded run: stream the whole BAM through the pipeline and tally
# reads per category; the sum() cross-checks the total number of reads counted.
counts = cyt.pipe(
bamtoolz.read_bam_st(bam_fname=fname), *pipeline)
print(counts, sum(counts.values(), 0))
In [10]:
%%time
counts = sum(
(c for c in bamtoolz.scatter(
pipeline, bam_fname=fname, ncpus=4)
), mab.Counter())
print(counts, sum(counts.values(), 0))
In [121]:
# Re-declaration of the same pipeline as the earlier cell (identical stages and
# category scheme) so this section can be run on its own.
max_d = 200
n1 = mab.parse_read_qnames(scar_fname)  # curried stage: parse truth info out of read qnames
n2 = mab.compute_derr(max_d=max_d)      # curried stage: annotate mates with d_err
f_dict = {
  'd = 0': lambda r: r['d_err'] == 0,
  '0 < d <= 50': lambda r: 0 < r['d_err'] <= 50,
  '50 < d': lambda r: 50 < r['d_err'] <= max_d,
  'WC': lambda r: r['d_err'] == max_d + 1,    # wrong-chromosome sentinel
  'UM': lambda r: r['d_err'] == max_d + 2,    # unmapped sentinel
}
n3, n4 = mab.categorize_reads(f_dict), mab.count_reads
pipeline = [n1, n2, n3, n4]
In [124]:
%%time
counts = sum((c for c in bamtoolz.scatter(pipeline, bam_fname=fname, paired=True, ncpus=2)), mab.Counter())
print(counts, sum(counts.values(), 0))
In [25]:
max_d = 200
n1 = mab.parse_read_qnames(scar_fname) # This gives us a curried function
n2 = mab.compute_derr(max_d=max_d)
# Default histogram definition — the full 8D histogram; the bare expression
# below displays it for inspection
p1 = mab.default_histogram_parameters() # The full 8D histogram
p1
Out[25]:
In [26]:
p2 = mab.default_histogram_parameters() # We'll do a detailed dive into MQ and d_err here
# Keep the MQ axes at full resolution (one bin per MQ value, 0..61) and
# disable the other axes.
# Fixed: the original assigned 'v_bin_edges' twice (before and after
# 'xt_bin_edges') — the duplicate is removed here.
p2['mq_bin_edges'] = list(range(62))
p2['v_bin_edges'] = None
p2['t_bin_edges'] = None
p2['xt_bin_edges'] = None
In [27]:
# Single histogram node that fills both definitions (p1 and p2) in one pass
n3 = mab.histograminator(histogram_def=[p1, p2])
In [28]:
pipeline = [n1, n2, n3]
In [34]:
%%time
# Paired single-threaded run; h8 holds the filled histograms, one per definition
h8 = cyt.pipe(
bamtoolz.read_bam_paired_st(bam_fname=fname),
*pipeline)
In [46]:
# Metadata attached to the full 8D histogram
h8[0].attrs
Out[46]:
In [36]:
# Total count in the MQ/d_err histogram — sanity check against reads processed
h8[1].sum()
Out[36]:
In [37]:
# NOTE(review): the keyword=None arguments to collapse() appear to name the
# axes to keep — confirm against mab.collapse. Here: d_err(mate1) vs d_err(mate2).
p2 = mab.collapse(h8[0], xd1=None, xd2=None)
mapl.plot_hist2D(p2)
plt.show()
In [41]:
# Marginal over the mate1 MQ axis
p2 = mab.collapse(h8[1], mq1=None)
mapl.plot_hist1D(p2)
plt.show()
In [42]:
# Marginal over the mate2 MQ axis
p2 = mab.collapse(h8[1], mq2=None)
mapl.plot_hist1D(p2)
plt.show()
In [44]:
# Joint MQ(mate1) vs MQ(mate2) view
p2 = mab.collapse(h8[1], mq1=None, mq2=None)
mapl.plot_hist2D(p2)
plt.show()
In [151]:
max_d = 200
n1 = mab.parse_read_qnames(scar_fname) # This gives us a curried function
n2 = mab.compute_derr(max_d=max_d)
# Alternative histogram API: a pre-allocated histogram (pah) that the
# histogramize stage fills in place
p1 = mab.initialize_pah(name='FullHist')
n3 = mab.histogramize(pah=p1)
pipeline = [n1, n2, n3]
In [152]:
%%time
# Parallel (2-worker) paired run: each worker yields its own partial histogram.
# Accumulate them with += and print the running per-worker and final totals as
# a sanity check.
h8 = None
for h in bamtoolz.scatter(
pipeline,
bam_fname=fname, paired=True, ncpus=2):
print(h.sum())
if h8 is None:
h8 = h
else:
h8 += h
print(h8.sum())
In [153]:
# d_err(mate1) vs d_err(mate2) view of the scatter-accumulated histogram
p2 = mab.collapse(h8, xd1=None, xd2=None)
mapl.plot_hist(p2)
plt.show()
In [75]:
h8.sum()
Out[75]:
In [86]:
# NOTE(review): h10 is not defined anywhere in this notebook — this cell only
# ran against stale kernel state and will NameError on Restart & Run All.
h10.sum()
Out[86]:
More involved example showing, in sequence: reading the BAM, pairing mates, parsing truth qnames, computing d_err, categorizing and counting all reads, then filtering down to non-reference reads and categorizing/counting those.
At the end the category counts are comparatively plotted.
In [5]:
r1 = mab.read_bam(bam_fname=fname)
r2 = mab.make_pairs
r3 = mab.parse_read_qnames(scar_fname)
r4 = mab.compute_derr(max_d=200)
# Category filters over d_err.
# Fixed: the original '50 < d' bin used a strict `< 200`, so a read with
# d_err exactly 200 fell into no distance category (the earlier pipeline cell
# uses `<= max_d`). NOTE(review): the WC and UM filters can both match an
# unmapped read (compute_derr's max_d + 2 sentinel also satisfies 200 < d_err)
# — confirm how categorize_reads resolves multiple matches.
f_dict = {
'd = 0': lambda mate: mate['d_err'] == 0,
'0 < d <= 50': lambda mate: 0 < mate['d_err'] <= 50,
'50 < d': lambda mate: 50 < mate['d_err'] <= 200,
'WC': lambda mate: 200 < mate['d_err'],
'UM': lambda mate: mate['read'].is_unmapped
}
r5 = mab.categorize_reads(f_dict)
all_counts = {}
r6 = mab.count_reads(all_counts)
r7 = mab.filter_reads(mab.non_ref(), all)
r8 = mab.categorize_reads(f_dict)
nr_counts = {}
r9 = mab.count_reads(nr_counts)
# Drain the pipeline purely for its side effects (filling all_counts / nr_counts)
for r in cyt.pipe(r1, r2, r3, r4, r5, r6, r7, r8, r9):
    pass
The new concept here is supplying a dictionary of filters to the categorization function. The results are stored in the all_counts
and nr_counts
dictionaries, which need to be preallocated and passed to the counting function, which modifies them in place.
In [6]:
# Comparative plot of the per-category counts for all reads vs non-ref reads
mapl.plot_read_counts(ax=plt.subplot(1, 1, 1),
counts_l=[all_counts, nr_counts],
labels=['All', 'non-ref'],
keys=['d = 0', '0 < d <= 50', '50 < d', 'WC', 'UM'],
colors=None)
plt.show()
In [7]:
max_d = 200
r1 = mab.read_bam(bam_fname=fname)
r2 = mab.make_pairs
r3 = mab.parse_read_qnames(scar_fname)
r4 = mab.compute_derr(max_d=200)
p1 = mab.initialize_pah(name='FullHist')
# Fill the histogram from the first 100,000 pairs only (cyt.take) for a quick look
mab.histogramize(pah=p1)(cyt.pipe(r1, r2, cyt.take(100000), r3, r4))
#mab.save_pah(p1, 'test_hist.pklz')
In [29]:
# d_err(mate1) vs d_err(mate2) view of the truncated-run histogram
p2 = mab.collapse(p1, xd1=None, xd2=None)
mapl.plot_hist(p2)
plt.show()
The plot suggests that the error in aligning mate1 is largely unrelated to the error in aligning mate2, except for some cases where both mates are off by about the same amount in the same direction (top-right and bottom-left corners) or in opposite directions.
In [28]:
# Same d_err1 vs d_err2 view as the previous cell (duplicate cell left in place)
p2 = mab.collapse(p1, xd1=None, xd2=None)
mapl.plot_hist(p2)
plt.show()
Out[28]:
In [68]:
# Joint MQ(mate1) vs MQ(mate2) view
p2 = mab.collapse(p1, mq1=None, mq2=None)
mapl.plot_hist(p2)
plt.show()
In [47]:
p2.attrs
Out[47]:
In [66]:
# d_err(mate1) vs MQ(mate1)
p2 = mab.collapse(p1, xd1=None, mq1=None)
mapl.plot_hist(p2)
plt.show()
In [100]:
# Inspect the bin labels on the mate1 variant axis (used to pick slices below)
p1.coords['v1']
Out[100]:
In [103]:
# Distribution over the xt axis for two v-bin slices, each normalized to its peak.
# NOTE(review): the labels suggest v1=(5, 6) is the 'Ref' bin and v1=(2, 3) the
# SNP bin — confirm against p1.coords['v1'] displayed above.
p2_r = mab.collapse(p1, xt=None, v1=(5, 6), v2=(5, 6))
p2_snp = mab.collapse(p1, xt=None, v1=(2, 3), v2=(5, 6))
plt.semilogy(p2_r/p2_r.max(), label='Ref')
plt.semilogy(p2_snp/p2_snp.max(), label='SNP(1)')
plt.legend(loc='best')
plt.xticks(range(p2_r.shape[0]), p2_r.coords['xt'].data, rotation='vertical')
plt.show()
In [137]:
# Side-by-side panels: xt vs MQ for mate1 and for mate2 (transposed for layout)
p2_1 = mab.collapse(p1, xt=None, mq1=None)
p2_2 = mab.collapse(p1, xt=None, mq2=None)
fig, ax = plt.subplots(1, 2, figsize=(13, 5))
plt.subplots_adjust(wspace=0.5)
mapl.plot_hist(p2_1.T, ax[0])
mapl.plot_hist(p2_2.T, ax[1])
plt.show()
In [72]:
# t axis vs MQ(mate1)
p2 = mab.collapse(p1, t=None, mq1=None)
mapl.plot_hist(p2)
plt.show()
In [27]:
# MQ(mate1) marginal on a log scale, one tick per MQ value
p2 = mab.collapse(p1, mq1=None)
plt.semilogy(p2)
plt.xticks(range(p2.coords['mq1'].values.size), p2.coords['mq1'].values, rotation='vertical')
plt.show()
In [58]:
p2 = mab.collapse(p1, xd1=None, xd2=None, v1=(1, 2), v2=(5, 6))
# Fixed: plot_hist lives in the plots module (mapl), not in bamfilters (mab) —
# every other plotting cell in this notebook calls mapl.plot_hist.
mapl.plot_hist(p2)
plt.show()
In [34]:
r1 = mab.read_bam(bam_fname=fname)
# Fixed: the original called a bare `tostring(...)`, which is not defined
# anywhere in this notebook (tostring is an AlignedSegment *method*, see the
# next cells), and discarded the result. Open the header file once — tostring()
# needs it for reference-name lookup — and print each read's SAM line.
_hdr_fp = mab.pysam.AlignmentFile(fname)
for r in cyt.pipe(r1, cyt.take(20)):
    print(r[0]['read'].tostring(_hdr_fp))
In [38]:
# Grab the last pair seen by the loop above (relies on the loop variable leaking)
read = r[0]['read']
In [41]:
# tostring() needs an AlignmentFile so it can resolve reference names from the header
read.tostring(mab.pysam.AlignmentFile(fname))
Out[41]:
In [47]:
mab.pysam.AlignmentFile?
In [131]:
# Fixed: plot_hist lives in the plots module (mapl), not in bamfilters (mab) —
# consistent with the rest of the notebook.
mapl.plot_hist(p2)
plt.show()
In [122]:
# Bin labels along the first remaining axis of the collapsed histogram
p2.coords[p2.dims[0]].values
Out[122]:
In [105]:
# NOTE(review): dmv_mat is not defined anywhere in this notebook — this cell
# ran against stale kernel state and will NameError on a fresh run.
ax = plt.subplot(1,1,1)
mapl.plot_mean_MQ_vs_derr(ax=ax, dmv_mat=dmv_mat, fmt='yo', ms=5)
plt.show()
In [72]:
p1 = mab.initialize_pah()
In [74]:
# Number of coordinate axes in the default pre-allocated histogram
len(p1.coords)
Out[74]:
In [149]:
# MQ calibration plot: P(err) vs MQ (left axis) overlaid with read counts
# (right axis, in red).
# NOTE(review): dmv_mat is never defined in this notebook — this cell only
# works against stale kernel state.
ax1 = plt.subplot(1,1,1)
mapl.plot_perr_vs_MQ(ax=ax1, dmv_mat=dmv_mat, yscale='log')
ax2 = ax1.twinx()
mapl.plot_read_count_vs_MQ(ax=ax2, dmv_mat=dmv_mat)
ax2.set_ylabel('Read count', color='r')
ax2.tick_params('y', colors='r')
ax1.legend(loc='lower right', fontsize=9)
plt.show()
In [117]:
# One panel per variant-bin category (same dmv_mat caveat as above applies here)
for n, v_bin_label in enumerate(
['Ref', 'SNP', 'DEL <= 10']):
ax = plt.subplot(1, 3, n + 1)
mapl.hengli_plot(ax=ax, dmv_mat=dmv_mat, v_bin_label=v_bin_label)
plt.show()
In [150]:
# DataFrame export: the first 20 mates, with their NM tag as a column
r1 = mab.read_bam(bam_fname=fname, sidecar_fname=scar_fname)
r2 = mab.compute_derr(max_d=200)
r3 = mab.to_df(tags=['NM'])
df = cyt.pipe(r1, r2, cyt.take(20), r3)
df
Out[150]:
In [151]:
# Same, but paired first — take(20) now presumably yields 20 pairs; confirm
# whether to_df emits one row per pair or per mate
r1 = mab.read_bam(bam_fname=fname, sidecar_fname=scar_fname)
r2 = mab.compute_derr(max_d=200)
r3 = mab.make_pairs
r4 = mab.to_df(tags=['NM'])
df = cyt.pipe(r1, r2, r3, cyt.take(20), r4)
df
Out[151]:
In [51]:
import io
import pysam
In [52]:
fp = pysam.AlignmentFile(fname)
In [54]:
fout_str = io.StringIO()
In [56]:
# NOTE(review): scratch experiment — opening a StringIO with pysam.AlignmentFile
# is unlikely to work (pysam expects a path or binary stream); left as-is.
pysam.AlignmentFile(fout_str, 'r')
In [57]:
# Split one read's SAM representation into its tab-separated fields
read.tostring(mab.pysam.AlignmentFile(fname)).split('\t')
In [86]:
def fromstring(s, ref_dict):
  """Inverse of pysam.AlignedSegment.tostring(): given a SAM-format line, create an aligned segment

  :param s: one SAM text line, as produced by AlignedSegment.tostring()
  :param ref_dict: ref_dict = {r: n for n, r in enumerate(fp.references)}
  :return: a pysam.AlignedSegment equivalent to the read tostring() was called on
  """
  def _ref_id(_r, _same):
    # Per the SAM spec: '*' means unset (pysam tid -1) and '=' (RNEXT only)
    # means "same as RNAME". The original did a bare ref_dict lookup, which
    # raised KeyError for both of these.
    if _r == '*':
      return -1
    if _r == '=':
      return _same
    return ref_dict[_r]

  def _split(_s):
    qname, flag, rname, pos, \
    mapping_quality, cigarstring, \
    rnext, pnext, template_length, seq, qual, *_tg = _s.split('\t')
    flag = int(flag)
    rname = _ref_id(rname, -1)
    # SAM POS/PNEXT are 1-based while pysam's pos/pnext are 0-based — the
    # original stored the 1-based value, shifting the read by +1 on a
    # tostring -> fromstring -> tostring round trip. A SAM '0' (unavailable)
    # becomes -1, which is pysam's unset value.
    pos = int(pos) - 1
    mapping_quality = int(mapping_quality)
    rnext = _ref_id(rnext, rname)
    pnext = int(pnext) - 1
    template_length = int(template_length)
    return qname, flag, rname, pos, \
           mapping_quality, cigarstring, \
           rnext, pnext, template_length, seq, qual, _tg

  # So close, pysam.tostring, so close
  def _tags(_t):
    # Tags arrive as 'TAG:TYPE:VALUE'; set_tags() wants (tag, value, type).
    # Only 'i' and 'f' need conversion; other types stay as strings.
    _tl = _t.split(':')
    if _tl[1] == 'i':
      _tl[2] = int(_tl[2])
    elif _tl[1] == 'f':
      _tl[2] = float(_tl[2])
    return _tl[0], _tl[2], _tl[1]

  r = pysam.AlignedSegment()
  # Tuple assignment runs left to right, so seq is set before qual (pysam
  # requires the sequence to exist before qualities can be assigned).
  r.qname, r.flag, r.rname, r.pos, \
  r.mapping_quality, r.cigarstring, \
  r.rnext, r.pnext, r.template_length, r.seq, r.qual, tags = _split(s)
  r.set_tags([_tags(t) for t in tags])
  return r
In [87]:
# Round-trip check: serialize a read and parse it back with fromstring.
# NOTE(review): ref_dict is not defined in this notebook (the fromstring
# docstring shows how to build it from fp.references) — this cell relied on
# kernel state and will NameError on a fresh run.
read2 = fromstring(read.tostring(mab.pysam.AlignmentFile(fname)), ref_dict)
In [90]:
read2.tostring(mab.pysam.AlignmentFile(fname)).split()
Out[90]:
In [92]:
# Compare with the original read's fields to verify the round trip by eye
read.tostring(mab.pysam.AlignmentFile(fname)).split()
Out[92]:
In [11]:
# pysam AlignmentFile.unmapped: number of unmapped reads (from the BAM index)
fp = bamtoolz.pysam.AlignmentFile(fname)
fp.unmapped
Out[11]:
In [9]:
import logging
# NOTE(review): no basicConfig is active here (the one near the top of the
# notebook is commented out), so the root logger is at WARNING and this debug
# message is silently dropped.
logging.debug("hello world")
In [48]:
# Persist a collapsed histogram and reload it to check the round trip
mab.save_histogram(p2, 'test_me.pklz')
In [49]:
p3 = mab.load_histogram('test_me.pklz')
In [52]:
p3.dims
Out[52]:
In [53]:
p3.dims
Out[53]:
In [54]:
p3
Out[54]:
Out[54]:
In [ ]: