In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
def calc_pearson(a, b):
    return numpy.corrcoef(a[:,2], b[:,2])[0,1]

Spectral error detection/correction


In [3]:
kmers_raw = numpy.loadtxt('simple-genome-kmers.hist')
kmers_dn = numpy.loadtxt('simple-genome-dn-kmers.hist')

In [4]:
plot(kmers_raw[:,0], kmers_raw[:,1], 'b-', label='raw')
plot(kmers_dn[:,0], kmers_dn[:,1], 'r--', label='diginorm')

axis(ymax=500)
xlabel('k-mer abundance')
ylabel('N k-mers at that abundance')
legend(loc='upper right')
savefig('../paper/figures/kmer-spectrum.pdf')


Saturation curve


In [5]:
import numpy
ecoli_dn = numpy.loadtxt('ecoli-dn-report.txt')
ecoli_errfree = numpy.loadtxt('ecoli-errfree-report.txt')

In [6]:
plot(ecoli_dn[:,0] / 1e6, ecoli_dn[:,1] / 1e6, 'b-', label='E. coli real data')
plot(ecoli_errfree[:,0] / 1e6, ecoli_errfree[:,1] / 1e6, 'g--', label='E. coli synthetic/no errors')
legend(loc='upper left')
axis(ymax=2.5)
xlabel('N of reads examined (m)')
ylabel('N of reads collected < C (m)')
savefig('../paper/figures/saturation.pdf')



In [7]:
ls *.errhist


ecoli-mapped.fq.gz.errhist            rseq-mapped.fq.gz.subset.0.errhist
ecoli-mapped.fq.gz.subset.0.errhist   rseq-mapped.fq.gz.subset.1.errhist
ecoli-mapped.fq.gz.subset.1.errhist   rseq-mapped.fq.gz.subset.2.errhist
ecoli-mapped.fq.gz.subset.2.errhist   rseq-mapped.fq.gz.subset.3.errhist
ecoli-mapped.fq.gz.subset.3.errhist   rseq-mapped.fq.gz.subset.4.errhist
ecoli-mapped.fq.gz.subset.4.errhist   rseq-reads.sam.errhist
ecoli-reads.sam.errhist               simple-genome-reads-even.fa.errhist
rseq-mapped.fq.gz.errhist             simple-genome-reads-even.sam.errhist

In [8]:
ecoli_sam_err = numpy.loadtxt('ecoli-reads.sam.errhist', skiprows=1)
ecoli_spec_err = numpy.loadtxt('ecoli-mapped.fq.gz.errhist', skiprows=1)
ecoli_spec_err2 = numpy.loadtxt('ecoli-mapped.fq.gz.errhist2', skiprows=1)

plot(ecoli_sam_err[:,0], ecoli_sam_err[:,2], 'b--', label='bowtie2 mismatches')
plot(ecoli_spec_err[:,0], ecoli_spec_err[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
legend(loc='upper left')
xlabel('position in read')
ylabel('frequency of errors at that position')
savefig('../paper/figures/ecoli-errhist.pdf')



In [14]:
ecoli_spec_sub0 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.0.errhist', skiprows=1)
ecoli_spec_sub1 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.1.errhist', skiprows=1)
ecoli_spec_sub2 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.2.errhist', skiprows=1)
ecoli_spec_sub3 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.3.errhist', skiprows=1)
ecoli_spec_sub4 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.4.errhist', skiprows=1)


plot(ecoli_sam_err[:,0], ecoli_sam_err[:,2], 'b--', label='bowtie2 mismatches')
plot(ecoli_spec_err[:,0], ecoli_spec_err[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
plot(ecoli_spec_err2[:,0], ecoli_spec_err2[:,2] * 2, 'r-', label='k-mer spectra (scaled)')

#plot(ecoli_spec_sub0[:,0], ecoli_spec_sub0[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub1[:,0], ecoli_spec_sub1[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub2[:,0], ecoli_spec_sub2[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub3[:,0], ecoli_spec_sub3[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub4[:,0], ecoli_spec_sub4[:,2] * 4, 'g-', label='k-mer spectra (scaled)')

legend(loc='upper left')
xlabel('position in read')
ylabel('frequency of errors at that position')
savefig('../paper/figures/ecoli-errhist.pdf')



In [21]:
print calc_pearson(ecoli_sam_err, ecoli_spec_err)
print calc_pearson(ecoli_sam_err, ecoli_spec_err2)

print calc_pearson(ecoli_sam_err, ecoli_spec_sub0)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub1)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub2)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub3)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub4)

print calc_pearson(ecoli_spec_err, ecoli_spec_sub0)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub1)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub2)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub3)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub4)

print calc_pearson(ecoli_spec_err2, ecoli_spec_sub0)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub1)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub2)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub3)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub4)


0.993759765853
0.980782034146
0.994525704126
0.995122314692
0.994697946266
0.994959256989
0.994433095393
0.998885543013
0.999253682253
0.99876336827
0.999085864946
0.999169493905
0.960352946971
0.96264688776
0.961760039051
0.962613344174
0.960641749875

In [25]:
rseq_sam_err = numpy.loadtxt('rseq-reads.sam.errhist', skiprows=1)
rseq_spec_err = numpy.loadtxt('rseq-mapped.fq.gz.errhist', skiprows=1)
rseq_spec_err2 = numpy.loadtxt('rseq-mapped.fq.gz.errhist2', skiprows=1)
rseq_sub0 = numpy.loadtxt('rseq-mapped.fq.gz.subset.0.errhist', skiprows=1)
rseq_sub1 = numpy.loadtxt('rseq-mapped.fq.gz.subset.1.errhist', skiprows=1)
rseq_sub2 = numpy.loadtxt('rseq-mapped.fq.gz.subset.2.errhist', skiprows=1)
rseq_sub3 = numpy.loadtxt('rseq-mapped.fq.gz.subset.3.errhist', skiprows=1)
rseq_sub4 = numpy.loadtxt('rseq-mapped.fq.gz.subset.4.errhist', skiprows=1)

plot(rseq_sam_err[:,0], rseq_sam_err[:,2], 'b--', label='bowtie2 mismatches')
plot(rseq_spec_err[:,0], 4* rseq_spec_err[:,2], 'g-', label='k-mer spectral (scaled)')
#plot(rseq_sub2[:,0], 4* rseq_sub2[:,2], 'g-.', label='k-mer spectral (scaled) sub2')
plot(rseq_spec_err2[:,0], 4* rseq_spec_err2[:,2], 'r-', label='k-mer spectral (2pass)')

legend(loc='upper left')
axis(ymax=0.05)
xlabel('position in read')
ylabel('frequency of errors at that position')
savefig('../paper/figures/rseq-errhist.pdf')



In [27]:
print calc_pearson(rseq_spec_err, rseq_sam_err)
print calc_pearson(rseq_spec_err, rseq_spec_err2)

print calc_pearson(rseq_spec_err, rseq_sub0)
print calc_pearson(rseq_spec_err, rseq_sub1)
print calc_pearson(rseq_spec_err, rseq_sub2)
print calc_pearson(rseq_spec_err, rseq_sub3)
print calc_pearson(rseq_spec_err, rseq_sub4)

print calc_pearson(rseq_sam_err, rseq_sub0)
print calc_pearson(rseq_sam_err, rseq_sub1)
print calc_pearson(rseq_sam_err, rseq_sub2)
print calc_pearson(rseq_sam_err, rseq_sub3)
print calc_pearson(rseq_sam_err, rseq_sub4)


0.973509862487
0.982721466824
0.981546980221
0.979367867833
0.981570040141
0.980559503024
0.982046059673
0.983128084292
0.981893922576
0.984133742128
0.981897296317
0.984814082732

Appendix - Error histograms


In [105]:
simple_errhist_pred = numpy.loadtxt('simple-genome-reads-even.fa.errhist', skiprows=1)
simple_errhist_map = numpy.loadtxt('simple-genome-reads-even.sam.errhist', skiprows=1)

In [106]:
plot(simple_errhist_pred[:,0], simple_errhist_pred[:,2])
plot(simple_errhist_map[:,0], simple_errhist_map[:,2])
#reshape(simple_errhist_map, 
axis(xmin=-5, xmax=30)


Out[106]:
(-5, 30, 0.0, 0.035000000000000003)

In [107]:
print 'sumposdiff', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2]) > 0)
print 'sumnegdiff', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2]) < 0)
print 'avg', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2])) / float(len(simple_errhist_pred))
print 'var', numpy.var(simple_errhist_pred[:,2]- simple_errhist_map[:,2])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-107-1e266bf3ed29> in <module>()
----> 1 print 'sumposdiff', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2]) > 0)
      2 print 'sumnegdiff', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2]) < 0)
      3 print 'avg', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2])) / float(len(simple_errhist_pred))
      4 print 'var', numpy.var(simple_errhist_pred[:,2]- simple_errhist_map[:,2])

ValueError: operands could not be broadcast together with shapes (100) (99) 
sumposdiff

In [ ]:
diff = simple_errhist_map[:,2] - simple_errhist_pred[:,2]
plot(simple_errhist_pred[:,0], diff)
#axis(xmin=50, xmax=100)
axis(xmin=-5)

In [ ]:


In [ ]: