List files in current working directory:
In [1]:
import os
In [3]:
!ls
If a ery_30-15.sorted.cov
does not exist yet, create one using bamtools coverage
:
In [11]:
if not os.path.exists("ery_30-15.sorted.cov.gz"):
try:
# 'call' is from the subprocess module
retcode = os.system("bamtools coverage -in ery_30-15.sorted.bam | gzip > ery_30-15.sorted.cov.gz")
if retcode < 0:
print "Child was terminated by signal", -retcode
else:
print "Child returned", retcode
except OSError as e:
print "Execution failed", e
else:
print "file already exists"
Let's have a quick look at the new coverage file:
In [12]:
!gzip -dc ery_30-15.sorted.cov.gz | head
The file has three columns: contig ID TAB position TAB coverage. The position should be 0-based, since I have padded the contigs with an "N" at beginning and end, so that the lowest mapping position should be 2 with 1-based coordinates.
Let's tally the coverage:
In [16]:
with os.popen("gzip -dc ery_30-15.sorted.cov.gz | cut -f 3", "r") as cov:
for _ in xrange(10):
print cov.readline().rstrip()
In [65]:
CovCountDict = {}
with os.popen("gzip -dc ery_30-15.sorted.cov.gz | cut -f 3", "r") as cov:
for c in cov:
c = int(c.rstrip())
try:
CovCountDict[c] += 1
except KeyError:
CovCountDict[c] = 1
In [71]:
# sort key-value pairs by key (coverage) and show the first 100:
cov_sorted = sorted(CovCountDict.items(), key=lambda e: e[0])
cov_sorted[:100]
Out[71]:
In [48]:
import pylab
%matplotlib inline
In [72]:
pylab.plot([i[0] for i in cov_sorted][:50], [i[1] for i in cov_sorted][:50])
pylab.title("Coverage distribution")
pylab.xlabel("coverage")
pylab.ylabel("count")
Out[72]:
In [78]:
coverage = [i[0] for i in cov_sorted]
counts = [i[1] for i in cov_sorted]
pylab.hist(coverage, weights=counts, bins=50, range=(0,50), normed=True, cumulative=True, histtype="step")
pylab.title("Coverage distribution")
pylab.xlabel("coverage")
pylab.ylabel("count")
pylab.grid()
In [61]:
len(count.keys())
Out[61]:
In [ ]: