In [1]:
%pylab inline
In [2]:
def calc_pearson(a, b):
return numpy.corrcoef(a[:,2], b[:,2])[0,1]
In [3]:
kmers_raw = numpy.loadtxt('simple-genome-kmers.hist')
kmers_dn = numpy.loadtxt('simple-genome-dn-kmers.hist')
In [4]:
plot(kmers_raw[:,0], kmers_raw[:,1], 'b-', label='raw')
plot(kmers_dn[:,0], kmers_dn[:,1], 'r--', label='diginorm')
axis(ymax=500)
xlabel('k-mer abundance')
ylabel('N k-mers at that abundance')
legend(loc='upper right')
savefig('../paper/figures/kmer-spectrum.pdf')
In [5]:
import numpy
ecoli_dn = numpy.loadtxt('ecoli-dn-report.txt')
ecoli_errfree = numpy.loadtxt('ecoli-errfree-report.txt')
In [6]:
plot(ecoli_dn[:,0] / 1e6, ecoli_dn[:,1] / 1e6, 'b-', label='E. coli real data')
plot(ecoli_errfree[:,0] / 1e6, ecoli_errfree[:,1] / 1e6, 'g--', label='E. coli synthetic/no errors')
legend(loc='upper left')
axis(ymax=2.5)
xlabel('N of reads examined (m)')
ylabel('N of reads collected < C (m)')
savefig('../paper/figures/saturation.pdf')
In [7]:
ls *.errhist
In [8]:
ecoli_sam_err = numpy.loadtxt('ecoli-reads.sam.errhist', skiprows=1)
ecoli_spec_err = numpy.loadtxt('ecoli-mapped.fq.gz.errhist', skiprows=1)
ecoli_spec_err2 = numpy.loadtxt('ecoli-mapped.fq.gz.errhist2', skiprows=1)
plot(ecoli_sam_err[:,0], ecoli_sam_err[:,2], 'b--', label='bowtie2 mismatches')
plot(ecoli_spec_err[:,0], ecoli_spec_err[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
legend(loc='upper left')
xlabel('position in read')
ylabel('frequency of errors at that position')
savefig('../paper/figures/ecoli-errhist.pdf')
In [14]:
ecoli_spec_sub0 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.0.errhist', skiprows=1)
ecoli_spec_sub1 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.1.errhist', skiprows=1)
ecoli_spec_sub2 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.2.errhist', skiprows=1)
ecoli_spec_sub3 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.3.errhist', skiprows=1)
ecoli_spec_sub4 = numpy.loadtxt('ecoli-mapped.fq.gz.subset.4.errhist', skiprows=1)
plot(ecoli_sam_err[:,0], ecoli_sam_err[:,2], 'b--', label='bowtie2 mismatches')
plot(ecoli_spec_err[:,0], ecoli_spec_err[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
plot(ecoli_spec_err2[:,0], ecoli_spec_err2[:,2] * 2, 'r-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub0[:,0], ecoli_spec_sub0[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub1[:,0], ecoli_spec_sub1[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub2[:,0], ecoli_spec_sub2[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub3[:,0], ecoli_spec_sub3[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
#plot(ecoli_spec_sub4[:,0], ecoli_spec_sub4[:,2] * 4, 'g-', label='k-mer spectra (scaled)')
legend(loc='upper left')
xlabel('position in read')
ylabel('frequency of errors at that position')
savefig('../paper/figures/ecoli-errhist.pdf')
In [21]:
print calc_pearson(ecoli_sam_err, ecoli_spec_err)
print calc_pearson(ecoli_sam_err, ecoli_spec_err2)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub0)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub1)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub2)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub3)
print calc_pearson(ecoli_sam_err, ecoli_spec_sub4)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub0)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub1)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub2)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub3)
print calc_pearson(ecoli_spec_err, ecoli_spec_sub4)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub0)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub1)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub2)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub3)
print calc_pearson(ecoli_spec_err2, ecoli_spec_sub4)
In [25]:
rseq_sam_err = numpy.loadtxt('rseq-reads.sam.errhist', skiprows=1)
rseq_spec_err = numpy.loadtxt('rseq-mapped.fq.gz.errhist', skiprows=1)
rseq_spec_err2 = numpy.loadtxt('rseq-mapped.fq.gz.errhist2', skiprows=1)
rseq_sub0 = numpy.loadtxt('rseq-mapped.fq.gz.subset.0.errhist', skiprows=1)
rseq_sub1 = numpy.loadtxt('rseq-mapped.fq.gz.subset.1.errhist', skiprows=1)
rseq_sub2 = numpy.loadtxt('rseq-mapped.fq.gz.subset.2.errhist', skiprows=1)
rseq_sub3 = numpy.loadtxt('rseq-mapped.fq.gz.subset.3.errhist', skiprows=1)
rseq_sub4 = numpy.loadtxt('rseq-mapped.fq.gz.subset.4.errhist', skiprows=1)
plot(rseq_sam_err[:,0], rseq_sam_err[:,2], 'b--', label='bowtie2 mismatches')
plot(rseq_spec_err[:,0], 4* rseq_spec_err[:,2], 'g-', label='k-mer spectral (scaled)')
#plot(rseq_sub2[:,0], 4* rseq_sub2[:,2], 'g-.', label='k-mer spectral (scaled) sub2')
plot(rseq_spec_err2[:,0], 4* rseq_spec_err2[:,2], 'r-', label='k-mer spectral (2pass)')
legend(loc='upper left')
axis(ymax=0.05)
xlabel('position in read')
ylabel('frequency of errors at that position')
savefig('../paper/figures/rseq-errhist.pdf')
In [27]:
print calc_pearson(rseq_spec_err, rseq_sam_err)
print calc_pearson(rseq_spec_err, rseq_spec_err2)
print calc_pearson(rseq_spec_err, rseq_sub0)
print calc_pearson(rseq_spec_err, rseq_sub1)
print calc_pearson(rseq_spec_err, rseq_sub2)
print calc_pearson(rseq_spec_err, rseq_sub3)
print calc_pearson(rseq_spec_err, rseq_sub4)
print calc_pearson(rseq_sam_err, rseq_sub0)
print calc_pearson(rseq_sam_err, rseq_sub1)
print calc_pearson(rseq_sam_err, rseq_sub2)
print calc_pearson(rseq_sam_err, rseq_sub3)
print calc_pearson(rseq_sam_err, rseq_sub4)
In [105]:
simple_errhist_pred = numpy.loadtxt('simple-genome-reads-even.fa.errhist', skiprows=1)
simple_errhist_map = numpy.loadtxt('simple-genome-reads-even.sam.errhist', skiprows=1)
In [106]:
plot(simple_errhist_pred[:,0], simple_errhist_pred[:,2])
plot(simple_errhist_map[:,0], simple_errhist_map[:,2])
#reshape(simple_errhist_map,
axis(xmin=-5, xmax=30)
Out[106]:
In [107]:
print 'sumposdiff', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2]) > 0)
print 'sumnegdiff', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2]) < 0)
print 'avg', sum( (simple_errhist_pred[:,2] - simple_errhist_map[:,2])) / float(len(simple_errhist_pred))
print 'var', numpy.var(simple_errhist_pred[:,2]- simple_errhist_map[:,2])
In [ ]:
diff = simple_errhist_map[:,2] - simple_errhist_pred[:,2]
plot(simple_errhist_pred[:,0], diff)
#axis(xmin=50, xmax=100)
axis(xmin=-5)
In [ ]:
In [ ]: