In [130]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f']
`%pylab --no-import-all` prevents importing * from pylab and numpy

In [131]:
ls


Makefile                           podar.16s.x.wgs.k20.reads.out
Untitled0.ipynb                    podar.16s.x.wgs.k32.kmers.out
podar.16s.x.16s.k20.kmers.out      podar.16s.x.wgs.k32.reads.out
podar.16s.x.16s.k20.reads.out      podar.wgs.report
podar.16s.x.16s.k32.kmers.out      podar_16s.hist
podar.16s.x.16s.k32.reads.out      saturate-1.ipynb
podar.16s.x.ref.kmers.out          saturate-2.ipynb
podar.16s.x.sim.k20.kmers.m10.out  saturate-by-kmers.py*
podar.16s.x.sim.k20.kmers.m2.out   saturate-by-reads.py*
podar.16s.x.sim.k20.kmers.m20.out  soil.16s.x.16s.k20.kmers.out
podar.16s.x.sim.k20.kmers.m40.out  soil.16s.x.16s.k20.reads.out
podar.16s.x.sim.k20.kmers.m5.out   soil.16s.x.16s.k32.kmers.out
podar.16s.x.sim.kmers.out          soil.16s.x.16s.k32.reads.out
podar.16s.x.wgs.k20.kmers.m10.out  soil.16s.x.wgs.k20.kmers.m2.out
podar.16s.x.wgs.k20.kmers.m2.out   soil.16s.x.wgs.k20.kmers.m20.out
podar.16s.x.wgs.k20.kmers.m20.out  soil.16s.x.wgs.k20.kmers.out
podar.16s.x.wgs.k20.kmers.m40.out  soil.16s.x.wgs.k20.reads.out
podar.16s.x.wgs.k20.kmers.m5.out   soil.16s.x.wgs.k32.kmers.out
podar.16s.x.wgs.k20.kmers.out      soil.16s.x.wgs.k32.reads.out

In [132]:
p16sx16s_k20_reads = numpy.loadtxt('podar.16s.x.16s.k20.reads.out')
p16sx16s_k32_reads = numpy.loadtxt('podar.16s.x.16s.k32.reads.out')
p16sx16s_k20_kmers = numpy.loadtxt('podar.16s.x.16s.k20.kmers.out')
p16sx16s_k32_kmers = numpy.loadtxt('podar.16s.x.16s.k32.kmers.out')
p16sxwgs_k20_reads = numpy.loadtxt('podar.16s.x.wgs.k20.reads.out')
p16sxwgs_k32_reads = numpy.loadtxt('podar.16s.x.wgs.k32.reads.out')
p16sxwgs_k20_kmers = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.out')
p16sxwgs_k32_kmers = numpy.loadtxt('podar.16s.x.wgs.k32.kmers.out')
p16sxref_kmers = numpy.loadtxt('podar.16s.x.ref.kmers.out')
p16sxsim_kmers = numpy.loadtxt('podar.16s.x.sim.kmers.out')

In [133]:
#plot(p16sx16s_k20_reads[:,0], p16sx16s_k20_reads[:,3,], label='k20/reads')
#plot(p16sx16s_k32_reads[:,0], p16sx16s_k32_reads[:,3,], label='k32/reads')
plot(p16sx16s_k20_kmers[:,0], p16sx16s_k20_kmers[:,3,], label='k20/kmers')
plot(p16sx16s_k32_kmers[:,0], p16sx16s_k32_kmers[:,3,], label='k32/kmers')

title('podar 16s x 16s')
axis(ymin=0, xmin=0)
xlabel('reads')
ylabel('fraction covered')
legend(loc='upper left')


Out[133]:
<matplotlib.legend.Legend at 0x115423c50>

In [134]:
plot(p16sxref_kmers[:,0], p16sxref_kmers[:,3,], label='k20/kmers')

title('podar 16s x ref')
axis(ymin=0, xmin=0)
xlabel('reads')
ylabel('fraction covered')
legend(loc='upper left')


Out[134]:
<matplotlib.legend.Legend at 0x11543a310>

In [135]:
#plot(p16sxwgs_k20_reads[:,0], p16sxwgs_k20_reads[:,3,], label='k20/reads')
#plot(p16sxwgs_k32_reads[:,0], p16sxwgs_k32_reads[:,3,], label='k32/reads')
plot(p16sxwgs_k20_kmers[:,0], p16sxwgs_k20_kmers[:,3,], label='k20/kmers')
#plot(p16sxwgs_k32_kmers[:,0], p16sxwgs_k32_kmers[:,3,], label='k32/kmers')


title('podar 16s x wgs')
axis(ymin=0, xmin=0, ymax=12)
xlabel('reads')
ylabel('fraction covered (%)')
legend(loc='upper left')


Out[135]:
<matplotlib.legend.Legend at 0x1154ebbd0>

In [136]:
plot(p16sxsim_kmers[:,0], p16sxsim_kmers[:,3], label='sim reads')
title('podar 16s x wgs')
axis(ymin=0, xmin=0)
xlabel('reads')
ylabel('fraction covered (%)')
legend(loc='upper left')


Out[136]:
<matplotlib.legend.Legend at 0x1155c5890>

In [137]:
s16sx16s_k20_reads = numpy.loadtxt('soil.16s.x.16s.k20.reads.out')
s16sx16s_k32_reads = numpy.loadtxt('soil.16s.x.16s.k32.reads.out')
s16sx16s_k20_kmers = numpy.loadtxt('soil.16s.x.16s.k20.kmers.out')
s16sx16s_k32_kmers = numpy.loadtxt('soil.16s.x.16s.k32.kmers.out')
s16sxwgs_k20_reads = numpy.loadtxt('soil.16s.x.wgs.k20.reads.out')
s16sxwgs_k32_reads = numpy.loadtxt('soil.16s.x.wgs.k32.reads.out')
s16sxwgs_k20_kmers = numpy.loadtxt('soil.16s.x.wgs.k20.kmers.out')
s16sxwgs_k32_kmers = numpy.loadtxt('soil.16s.x.wgs.k32.kmers.out')

In [138]:
#plot(s16sx16s_k20_reads[:,0], s16sx16s_k20_reads[:,3,], label='k20/reads')
#plot(s16sx16s_k32_reads[:,0], s16sx16s_k32_reads[:,3,], label='k32/reads')
plot(s16sx16s_k20_kmers[:,0], s16sx16s_k20_kmers[:,3,], label='k20/kmers')
plot(s16sx16s_k32_kmers[:,0], s16sx16s_k32_kmers[:,3,], label='k32/kmers')

title('soil 16s x 16s, reads')
axis(ymin=0, ymax=100, xmin=0)
xlabel('reads')
ylabel('fraction covered')
legend(loc='upper left')


Out[138]:
<matplotlib.legend.Legend at 0x116492b10>

In [139]:
#plot(s16sxwgs_k20_reads[:,0], s16sxwgs_k20_reads[:,3,], label='k20/reads')
#plot(s16sxwgs_k32_reads[:,0], s16sxwgs_k32_reads[:,3,], label='k32/reads')
plot(s16sxwgs_k20_kmers[:,0], s16sxwgs_k20_kmers[:,3,], label='k20/kmers')
#plot(s16sxwgs_k32_kmers[:,0], s16sxwgs_k32_kmers[:,3,], label='k32/kmers')

title('soil 16s x wgs')
axis(ymin=0, xmin=0)
xlabel('reads')
ylabel('fraction covered')
legend(loc='upper left')


Out[139]:
<matplotlib.legend.Legend at 0x116b7c150>

Conclusions/questions --

  • kmer size is irrelevant
  • ignore reads, or develop better method; should we use kadian measure?
  • are kmers good/real?

Digital normalization saturation curve / podar data


In [171]:
lines = []
last_total = -1
base_total = 0
base_kept = 0
data = [] 
for line in open('podar.wgs.report'):
    total, kept, f = line.split()
    total, kept = int(total), int(kept)
    if total < last_total:
        base_total = last_total
        base_kept = last_kept
    
    data.append((total + base_total, kept + base_kept))
    last_kept = kept
    last_total = total
    
diginorm = numpy.array(data)

diginorm_C1 = numpy.loadtxt('podar.wgs.C1.report')

In [172]:
plot(diginorm[:,0], diginorm[:,1])


Out[172]:
[<matplotlib.lines.Line2D at 0x11910ead0>]

In [173]:
plot(diginorm_C1[:,0], diginorm_C1[:,1], label='C1')
plot(diginorm[:,0] / 10, diginorm[:,1] / 13., label='C10')
axis(xmax=1e7)
legend(loc='lower right')


Out[173]:
<matplotlib.legend.Legend at 0x1191e9410>

podar 16s kmer abundance


In [142]:
podar_16s_kmer = numpy.loadtxt('podar_16s.hist')

In [143]:
plot(podar_16s_kmer[:,0], podar_16s_kmer[:,1])
axis(xmax=20, ymax=1000)


Out[143]:
(0.0, 20, 0.0, 1000)

Effects of different minimum k-mer abundances


In [144]:
m10 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m10.out')
m2 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m2.out')
m20 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m20.out')
m40 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m40.out')
m5 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m5.out')
m2_k32 = numpy.loadtxt('podar.16s.x.wgs.k32.kmers.out')

In [145]:
plot(m40[:,0], m40[:,3], label='m40')
plot(m20[:,0], m20[:,3], label='m20')
plot(m2_k32[:,0], m2_k32[:,3], label='m20/k=32')
plot(m10[:,0], m10[:,3], label='m10')
plot(m5[:,0], m5[:,3], label='m5')
plot(m2[:,0], m2[:,3], label='m2')

legend()


Out[145]:
<matplotlib.legend.Legend at 0x11653ebd0>

In [273]:
plot(m20[:,0] / 1e6, m20[:,3], 'r-', label='16s (v1-3)', linewidth=2)
ylabel('% information in 16s')
xlabel('number of shotgun reads examined (m)')
axis(xmax=5)
savefig('/tmp/fig8.pdf')



In [146]:
plot(m20[:,0], m20[:,3], label='m20')
plot(m2_k32[:,0], m2_k32[:,3], label='m20/k=32')
legend(loc='lower right')
title('choice of k doesn\'t matter')


Out[146]:
<matplotlib.text.Text at 0x11651c090>

In [147]:
plot(diginorm[:,0], diginorm[:,1])
axis(xmax=2e7)


Out[147]:
(0.0, 20000000.0, 0.0, 30000000.0)

In [148]:
plot(m2[:,0], m2[:,3], label='m2')
plot(m5[:,0], m5[:,3], label='m5')
plot(m10[:,0], m10[:,3], label='m10')
plot(m20[:,0], m20[:,3], label='m20')
plot(m40[:,0], m40[:,3], label='m40')
axis(xmax=0.5e7)
legend()


Out[148]:
<matplotlib.legend.Legend at 0x11670cdd0>

In [149]:
plot(diginorm[:,0], diginorm[:,1])
axis(xmax=0.5e7)


Out[149]:
(0.0, 5000000.0, 0.0, 30000000.0)

In [150]:
sim_m10 = numpy.loadtxt('podar.16s.x.sim.k20.kmers.m10.out')
sim_m2 = numpy.loadtxt('podar.16s.x.sim.k20.kmers.m2.out')
sim_m20 = numpy.loadtxt('podar.16s.x.sim.k20.kmers.m20.out')
sim_m40 = numpy.loadtxt('podar.16s.x.sim.k20.kmers.m40.out')
sim_m5 = numpy.loadtxt('podar.16s.x.sim.k20.kmers.m5.out')

In [151]:
plot(sim_m2[:,0], sim_m2[:,3], label='m2')
plot(sim_m5[:,0], sim_m5[:,3], label='m5')
plot(sim_m10[:,0], sim_m10[:,3], label='m10')
plot(sim_m20[:,0], sim_m20[:,3], label='m20')
plot(sim_m40[:,0], sim_m40[:,3], label='m40')
legend()


Out[151]:
<matplotlib.legend.Legend at 0x11658e550>

In [152]:
#xref
#xsim
#on soil

In [174]:
soil_m2 = numpy.loadtxt('soil.16s.x.wgs.k20.kmers.m2.out')
soil_m20 = numpy.loadtxt('soil.16s.x.wgs.k20.kmers.m20.out')

plot(soil_m2[:,0], soil_m2[:,3])
plot(soil_m20[:,0], soil_m20[:,3])


Out[174]:
[<matplotlib.lines.Line2D at 0x11910ee10>]

In [175]:
podar_v13 = numpy.loadtxt('podar.v13.x.wgs.k20.kmers.m20.out')
podar_v4arch = numpy.loadtxt('podar.v4arch.x.wgs.k20.kmers.m20.out')
podar_v69 = numpy.loadtxt('podar.v69.x.wgs.k20.kmers.m20.out')

In [182]:
plot(podar_v13[:,0], podar_v13[:,3], label='v13')
plot(m20[:,0], m20[:,3], label='v35')
#plot(podar_v4arch[:,0], podar_v4arch[:,3], label='v4 (arch)')
plot(podar_v69[:,0], podar_v69[:,3], label='v69')
legend(loc='lower right')


Out[182]:
<matplotlib.legend.Legend at 0x119e38350>

# Beetle results


In [186]:
beetle_m20 = numpy.loadtxt('beetle.kmers.m20.out')
plot(beetle_m20[:,0], beetle_m20[:,3])

title('beetle gut: 16s saturation curve')
ylabel('fraction of informative 16s k-mers seen in shotgun data')
xlabel('number of shotgun reads examined')


Out[186]:
<matplotlib.text.Text at 0x11a9a6d10>

Lanier results


In [206]:
lanier_16s_x_wgs = numpy.loadtxt('lanier.16s1.x.wgs.kmers.out')
plot(lanier_16s_x_wgs[:,0], lanier_16s_x_wgs[:,3], label='Lake Lanier')
plot(beetle_m20[:,0] * 4, beetle_m20[:,3], label='Beetle')
plot(soil_m20[:,0], soil_m20[:,3], label='soil')
legend(loc='lower right')


Out[206]:
<matplotlib.legend.Legend at 0x11c2b49d0>

Lanier diginorm


In [192]:
lines = []
last_total = -1
base_total = 0
base_kept = 0
data = [] 
for line in open('lanier.wgs.C1.report'):
    total, kept, f = line.split()
    total, kept = int(total), int(kept)
    if total < last_total:
        print total
        base_total = last_total + base_total
        base_kept = last_kept + base_kept
    
    data.append((total + base_total, kept + base_kept))
    last_kept = kept
    last_total = total
    
lanier_C1 = numpy.array(data)


100000
100000
100000
100000
100000
100000
100000

In [194]:
plot(lanier_C1[:,0], lanier_C1[:,1])


Out[194]:
[<matplotlib.lines.Line2D at 0x11aeacf90>]

Breaking out podar by OTUs


In [244]:
podar_c0 = numpy.loadtxt('podar.c0.x.wgs.k20.kmers.m20.out')
podar_c01 = numpy.loadtxt('podar.c01.x.wgs.k20.kmers.m20.out')
podar_c02 = numpy.loadtxt('podar.c02.x.wgs.k20.kmers.m20.out')
podar_c03 = numpy.loadtxt('podar.c03.x.wgs.k20.kmers.m20.out')
podar_c04 = numpy.loadtxt('podar.c04.x.wgs.k20.kmers.m20.out')
podar_c05 = numpy.loadtxt('podar.c05.x.wgs.k20.kmers.m20.out')
podar_c20 = numpy.loadtxt('podar.c20.x.wgs.k20.kmers.m20.out')
podar_c40 = numpy.loadtxt('podar.c40.x.wgs.k20.kmers.m20.out')

#plot(podar_c0[:,0], podar_c0[:,3], label='otu 0')
#plot(podar_c01[:,0] / 1e6, podar_c01[:,3], 'g-', label='otu 1', linewidth=2)
plot(podar_c02[:,0] / 1e6, podar_c02[:,3], 'b-', label='otu 2', linewidth=2)
plot(podar_c03[:,0] / 1e6, podar_c03[:,3], 'c-', label='otu 3', linewidth=2)
plot(podar_c04[:,0] / 1e6, podar_c04[:,3], 'y-', label='otu 4', linewidth=2)
plot(podar_c05[:,0] / 1e6, podar_c05[:,3], 'r-', label='otu 5', linewidth=2)
#plot(podar_c20[:,0], podar_c20[:,3], label='otu 20')
#plot(podar_c40[:,0], podar_c40[:,3], label='otu 40')

legend(loc='lower right')
axis(xmax=2)
ylabel('% 16s information seen')
xlabel('number of shotgun reads examined (m)')
savefig('/tmp/fig6.pdf')



In [235]:
# otu5 is acidobacterium; one species, Acidobacterium capsulatum, with one rRNA; 4.6% of BA community, 4.7% of Illumina reads;
# otu2 is chlorobium; five species, total of 10 rRNA; 9.1% of Illumina. 

corr_factor = 5

plot(podar_c02[:,0] / 1e6, podar_c02[:,3], 'b-', label='Chlorobium (uncorrected)', linewidth=2)
plot(podar_c02[:,0] * 5 / 1e6, podar_c02[:,3], 'b--', label='Chlorobium (abund adjusted)', linewidth=2)

plot(podar_c05[:,0] / 1e6, podar_c05[:,3], 'r-', label='Acidobacterium', linewidth=2)

legend(loc='lower right')
axis(xmax=2)
ylabel('% of 16s information')
xlabel('number of shotgun reads examined (m)')
savefig('/tmp/fig5.pdf')



In [224]:
wa_16s = numpy.loadtxt('wrighton.16s.x.wgs-a.kmers.out')
wb_16s = numpy.loadtxt('wrighton.16s.x.wgs-b.kmers.out')
wc_16s = numpy.loadtxt('wrighton.16s.x.wgs-c.kmers.out')

In [229]:
plot(wa_16s[:,0], wa_16s[:,3], label='A')
plot(wb_16s[:,0], wa_16s[:,3], label='B')
plot(wc_16s[:,0], wa_16s[:,3], label='C')
axis(xmax=1200000)


Out[229]:
(0.0, 1200000, 0.0, 100.0)

In [267]:
plot(wa_16s[:,1] / 1e6, wa_16s[:,3], label='groundwater (Wrighton et al.)', linewidth=2)
plot(beetle_m20[:,1] / 1e6, beetle_m20[:,3], label='beetle gut (Scully et al.)', linewidth=2)
plot(lanier_16s_x_wgs[:,1] / 1e6, lanier_16s_x_wgs[:,3], label='Lake Lanier (Poretsky et al.)', linewidth=2)
plot(soil_m20[:,1] / 1e6, soil_m20[:,3], label='corn rhizosphere (unpub)', linewidth=2)
legend(loc='upper right')
xlabel('Total bp seen (Mbp)')
ylabel('% 16s information seen')
axis(xmax=2000)
savefig('/tmp/fig7.pdf')



In [ ]: