In [59]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f']
`%pylab --no-import-all` prevents importing * from pylab and numpy

In [60]:
lines = []
last_total = -1
base_total = 0
base_kept = 0
data = [] 
for line in open('podar.wgs.report'):
    total, kept, f = line.split()
    total, kept = int(total), int(kept)
    if total < last_total:
        base_total = last_total
        base_kept = last_kept
    
    data.append((total + base_total, kept + base_kept))
    last_kept = kept
    last_total = total
    
diginorm = numpy.array(data)
diginorm_C1 = numpy.loadtxt('podar.wgs.C1.report')

In [69]:
m10 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m10.out')
m2 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m2.out')
m20 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m20.out')
m40 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m40.out')
m5 = numpy.loadtxt('podar.16s.x.wgs.k20.kmers.m5.out')

podar_v13 = numpy.loadtxt('podar.v13.x.wgs.k20.kmers.m20.out')
podar_v4arch = numpy.loadtxt('podar.v4arch.x.wgs.k20.kmers.m20.out')
podar_v69 = numpy.loadtxt('podar.v69.x.wgs.k20.kmers.m20.out')

In [62]:
soil_m2 = numpy.loadtxt('soil.16s.x.wgs.k20.kmers.m2.out')
soil_m20 = numpy.loadtxt('soil.16s.x.wgs.k20.kmers.m20.out')

Digital normalization saturation curve / podar data


In [130]:
figsize(7, 4)
plot(diginorm[:,0] / 1e6, diginorm[:,1] / 1e6, 'b-', label='shotgun (cov=10)', linewidth=2)
plot(diginorm_C1[:,0] / 1e6, diginorm_C1[:,1]*13 / 1e6, 'r-', label='shotgun (cov=1)', linewidth=2)
plot(diginorm[:,0] / 10 / 1e6, diginorm[:,1] / 1e6, 'b--', label='shotgun (cov=10), adjusted', linewidth=3)

#title("shotgun sequencing collector's curve")
xlabel("total reads (m)")
ylabel("informative reads (m)")
legend(loc='lower right')
axis(xmax=100)
savefig('/tmp/fig1.pdf')



In [71]:
plot(m40[:,0], m40[:,3], label='min=40')
plot(m20[:,0], m20[:,3], label='min=20')
plot(m10[:,0], m10[:,3], label='min=10')
plot(m5[:,0], m5[:,3], label='min=5')
plot(m2[:,0], m2[:,3], label='min=2')
legend()
ylabel('fraction of informative 16s k-mers seen in shotgun data')
xlabel('number of shotgun reads examined')
title('podar: 16s x wgs, saturation curves')


Out[71]:
<matplotlib.text.Text at 0x10e6a9f90>

In [136]:
plot(diginorm_C1[:,0] / 1e6, diginorm_C1[:,1] / 3e5 * 11, 'r-', label='shotgun (cov=1)')
plot(diginorm[:,0] / 1e6, diginorm[:,1] / 3e5, 'b-', label='shotgun (cov=10)', linewidth=2)
#plot(m40[:,0], m40[:,3], label='16s/min=40')
plot(m20[:,0] / 1e6, m20[:,3], 'g-', label='16s (v1-3, 454)', linewidth=2)
#plot(m10[:,0], m10[:,3], label='16s/min=10')
#title('collectors\' curves: shotgun sequencing & 16s')
xlabel('number of reads (m)')
ylabel('% 16s information seen')
axis(xmax=20, ymax=100)
legend(loc='lower right')
xlabel('number of shotgun reads examined (m)')
savefig('/tmp/fig2.pdf')



In [77]:
plot(soil_m2[:,0], soil_m2[:,3], label='16s/min=2')
plot(soil_m20[:,0], soil_m20[:,3], label='16s/min=20')
title('soil: C2 data set (real community)')
ylabel('fraction of  16s k-mers seen in data / reads kept')
xlabel('number of shotgun reads examined')


Out[77]:
(0.0, 20000000.0, 0.0, 100)

Method is robust to different 16s regions


In [135]:
plot(podar_v13[:,0] / 1e6, podar_v13[:,3], label='16s (v1-3, 454)', linewidth=2)
plot(m20[:,0] / 1e6, m20[:,3], label='16s (v3-5, 454)', linewidth=2)
plot(podar_v69[:,0] / 1e6, podar_v69[:,3], label='16s (v6-9, 454)', linewidth=2)
legend(loc='lower right')
axis(xmax=8)
#title('16s saturation curves for different regions')
ylabel('% 16s information seen')
xlabel('number of shotgun reads examined (m)')
savefig('/tmp/fig3.pdf')


Side notes & thoughts

  • The 16s "information" calculation seems robust to different k-mer sizes and regions
  • Measuring "nt diversity" => includes phage, etc.
  • Can we cross-link the Y axis with specific OTUs? (Absolutely.)

Suggestions

  • subset data sets - does this make any sense?
  • non-16s amplicon/paired data - Jaron?
  • 16s clustering for Y axis (Jaron)
  • lake lanier data
  • Low-diversity data set (paired 16s 454/Illumina with Illumina) - AMD?
  • Beetle gut (Josh)

Points:

  • two very pragmatic: what is my sensitivity? how much more money do I need?
  • lets you examine public data sets in interesting ways

In [138]:
figsize(4, 4)

contam = numpy.loadtxt('podar.16s.x.contam.k20.kmers.m20.out')
plot(m20[:,0] / 1e6, m20[:,3], 'b-', label='before addition', linewidth=2)
plot(contam[:,0] / 1e6, contam[:,3], 'g-', label='after addition', linewidth=2)
legend(loc='lower right')
#title('adding simulated phage')
ylabel('% 16s information seen')
xlabel('number of shotgun reads examined (m)')
savefig('/tmp/fig4.pdf')



In [127]:
figsize(7, 4)

In [ ]: