Description:

  • For emperical data, most taxa (>0.1% abundance) are detected across the entire gradient.
  • Checking whether a similar pattern is seen with the simulated genome data

Setting variables


In [65]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/validation/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figDir = '/home/nick/notebook/SIPSim/figures/'

nprocs = 3

Init


In [66]:
import os
import numpy as np
import dill
import pandas as pd
%load_ext rpy2.ipython


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

In [67]:
%%R
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(gridExtra)

In [68]:
if not os.path.isdir(workDir):
    os.makedirs(workDir)

Determining the probability of detecting the taxa across the entire gradient


In [69]:
# max 13C shift
max_13C_shift_in_BD = 0.036
# min BD (that we care about)
min_GC = 13.5
min_BD = min_GC/100.0 * 0.098 + 1.66
# max BD (that we care about)
max_GC = 80
max_BD = max_GC / 100.0 * 0.098 + 1.66    # 80.0% G+C
max_BD = max_BD + max_13C_shift_in_BD
## BD range of values
BD_vals = np.arange(min_BD, max_BD, 0.001)

skewed normal distribution


In [70]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde


Out[70]:
{'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f9ac9893d90>,
 'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f9ac98530d0>,
 'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f9ac9853710>}

In [71]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    pdf[k] = v.evaluate(BD_vals)
pdf.keys()


Out[71]:
['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']

In [72]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)


Out[72]:
Clostridium_ljungdahlii_DSM_13528 Escherichia_coli_1303 Streptomyces_pratensis_ATCC_33331 BD
0 0.000053 1.141365e-93 1.218480e-43 1.67323
1 0.000376 1.954533e-78 1.137313e-36 1.67423
2 0.000838 1.302936e-64 2.278353e-30 1.67523

In [73]:
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-15)

x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))

p1.skn = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    labs(x=x.lab, y='Probability density') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2.skn = p1.skn + scale_y_log10()

grid.arrange(p1.skn, p2.skn, ncol=2)


small uniform distribution


In [74]:
F = os.path.join(workDir, 'ampFrags_sm_kde_dif.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde


Out[74]:
{'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f9ac9603510>,
 'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f9ac9603810>,
 'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f9ac96034d0>}

In [75]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    pdf[k] = v.evaluate(BD_vals)
pdf.keys()


Out[75]:
['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']

In [76]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)


Out[76]:
Clostridium_ljungdahlii_DSM_13528 Escherichia_coli_1303 Streptomyces_pratensis_ATCC_33331 BD
0 0.000171 5.983261e-22 4.797488e-57 1.67323
1 0.000478 1.155676e-18 3.764569e-51 1.67423
2 0.000883 1.045137e-15 1.337521e-45 1.67523

In [77]:
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-9)



p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2 = p1 + scale_y_log10()

grid.arrange(p1, p2, ncol=2)


Notes

  • Even with fragment sizes of 1-2 kb, the taxa would likely not be detected even if the gradient contained 1e9 16S copies of the taxon.
    • Does this make sense based on the theory of diffusion used?

with DBL 'smearing'

Determining the probability of detecting in all fragments

skewed normal distribution


In [78]:
BD_vals = np.arange(min_BD, max_BD, 0.001)

In [79]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde


Out[79]:
{'1': {'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f9ac9603710>,
  'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f9ac9853350>,
  'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f9ac9853990>}}

In [80]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    for kk,vv in v.items():
        pdf[kk] = vv.evaluate(BD_vals)
pdf.keys()


Out[80]:
['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']

In [81]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)


Out[81]:
Clostridium_ljungdahlii_DSM_13528 Escherichia_coli_1303 Streptomyces_pratensis_ATCC_33331 BD
0 0.008254 0.011652 0.005325 1.67323
1 0.009910 0.009692 0.006718 1.67423
2 0.008902 0.006997 0.008998 1.67523

In [82]:
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-15)

x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))

p1.skn.dbl = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    labs(x=x.lab, y='Probability density') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2.skn.dbl = p1.skn.dbl + scale_y_log10()

grid.arrange(p1.skn.dbl, p2.skn.dbl, ncol=2)


Notes

  • Even if 1% of DNA is in DBL (that then diffuses back into the gradient):
    • the probably of detecting a taxa in all the gradient positions is >= 1e-7
      • this is feasible for matching the emperical data!

Combined plot


In [84]:
%%R -w 800 -h 300

# plot formatting
title.size=16
p2.skn.f = p2.skn +
    ggtitle('Gaussian BD') +
    theme(
        plot.title = element_text(size=title.size)
    )
p2.skn.dbl.f = p2.skn.dbl +
    ggtitle('Gaussian BD + DBL') +
    theme(
        plot.title = element_text(size=title.size)
    )

# combined plot
#p.comb = cowplot::plot_grid(p2.skn.f, p2.skn.dbl.f, labels=c('A)', 'B)'), align='h')
p.comb = cowplot::ggdraw() +
    geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), fill='white') +
    cowplot::draw_plot(p2.skn.f, 0.01, 0.01, 0.49, 0.99) +
    cowplot::draw_plot(p2.skn.dbl.f, 0.5, 0.01, 0.49, 0.99) +
    cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0.5), c(0.99, 0.99))
p.comb



In [85]:
%%R -i workDir
# writting plot
outFile = file.path(workDir, 'DBL_example_log10.pdf') 
ggsave(outFile, p.comb, width=10, height=3.75)
cat('File written:', outFile, '\n')


File written: /home/nick/notebook/SIPSim/dev/bac_genome3/validation//DBL_example_log10.pdf 

Combined plot (v2)


In [102]:
%%R -w 800 -h 300

p1.skn.e = p1.skn +
    scale_x_continuous(limits=c(1.675, 1.775))
p2.skn.e = p2.skn +
    scale_x_continuous(limits=c(1.675, 1.775)) +
    scale_y_log10(limits=c(1e-12, 150))
p1.skn.dbl.e = p1.skn.dbl +
    scale_x_continuous(limits=c(1.675, 1.775))
p2.skn.dbl.e = p2.skn.dbl +
    scale_x_continuous(limits=c(1.675, 1.775)) +
    scale_y_log10(limits=c(1e-12, 150))


p.comb = cowplot::ggdraw() +
    geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), fill='white') +
    cowplot::draw_plot(p2.skn.e, 0.01, 0.01, 0.49, 0.99) +
    cowplot::draw_plot(p2.skn.dbl.e, 0.5, 0.01, 0.49, 0.99) +
    cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0.5), c(0.99, 0.99))
p.comb



In [103]:
%%R -i workDir
# writting plot
outFile = file.path(workDir, 'DBL_example_log10.pdf') 
ggsave(outFile, p.comb, width=10, height=3.75)
cat('File written:', outFile, '\n')


File written: /home/nick/notebook/SIPSim/dev/bac_genome3/validation//DBL_example_log10.pdf 

small fragment size distribution


In [54]:
BD_vals = np.arange(min_BD, max_BD, 0.001)

In [55]:
F = os.path.join(workDir, 'ampFrags_sm_kde_dif_DBL.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde


Out[55]:
{'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f5786775a90>,
 'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f5786a0e090>,
 'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f57867941d0>}

In [56]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    pdf[k] = v.evaluate(BD_vals)
pdf.keys()


Out[56]:
['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']

In [57]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)


Out[57]:
Clostridium_ljungdahlii_DSM_13528 Escherichia_coli_1303 Streptomyces_pratensis_ATCC_33331 BD
0 0.003643 0.008033 0.008235 1.67323
1 0.004997 0.008003 0.007361 1.67423
2 0.008100 0.008035 0.007202 1.67523

In [58]:
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-9)

p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2 = p1 + scale_y_log10()

grid.arrange(p1, p2, ncol=2)


with DBL 'smearing' (smaller DBL)

Determining the probability of detecting in all fragments

skewed normal distribution


In [59]:
BD_vals = np.arange(min_BD, max_BD, 0.001)

In [60]:
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL_fa1e-4.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde


Out[60]:
{'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f5786775690>,
 'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f5786775f90>,
 'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f5786775c90>}

In [61]:
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    pdf[k] = v.evaluate(BD_vals)
pdf.keys()


Out[61]:
['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']

In [62]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)


Out[62]:
Clostridium_ljungdahlii_DSM_13528 Escherichia_coli_1303 Streptomyces_pratensis_ATCC_33331 BD
0 1.631444e-06 0.000375 0.000216 1.67323
1 5.361316e-07 0.001505 0.001849 1.67423
2 3.456207e-04 0.002166 0.000028 1.67523

In [63]:
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-9)

p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2 = p1 + scale_y_log10()

grid.arrange(p1, p2, ncol=2)


DBL with abundance-weighted smearing


In [131]:
BD_vals = np.arange(min_BD, max_BD, 0.001)

F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL-comm.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde


Out[131]:
{'1': {'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f5786694390>,
  'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f5786a2a710>,
  'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f5786723490>}}

In [132]:
# probability at each location in gradient
pdf = {}
for libID,v in kde.items():
    for taxon,k in v.items():
        pdf[taxon] = k.evaluate(BD_vals)
pdf.keys()


Out[132]:
['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']

In [133]:
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)


Out[133]:
Clostridium_ljungdahlii_DSM_13528 Escherichia_coli_1303 Streptomyces_pratensis_ATCC_33331 BD
0 1.691746 2.760888 1.230768 1.67323
1 1.699645 2.763073 1.213971 1.67423
2 1.706747 2.762859 1.197765 1.67523

In [134]:
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-9)

p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2 = p1 + scale_y_log10()

grid.arrange(p1, p2, ncol=2)



In [135]:
%%R
df.g %>%
    group_by(taxon_name) %>%
    summarize(max_P = max(P),
              min_P = min(P)) %>% print


Source: local data frame [3 x 3]

                         taxon_name    max_P    min_P
                              (chr)    (dbl)    (dbl)
1 Clostridium_ljungdahlii_DSM_13528 60.98170 1.546809
2             Escherichia_coli_1303 68.20404 2.708163
3 Streptomyces_pratensis_ATCC_33331 76.69533 1.167650

Plotting pre-frac abundance vs heavy fraction P


In [136]:
%%R -i workDir

F = file.path(workDir, 'comm.txt')
df.comm = read.delim(F, sep='\t') %>%
    mutate(rel_abund = rel_abund_perc / 100)
df.comm %>% print

df.g.s = df.g %>%
    filter(BD > 1.75) %>%
    group_by(BD) %>%
    mutate(P_rel_abund = P / sum(P)) %>%
    group_by(taxon_name) %>%
    summarize(mean_P = mean(P))

df.g.s = inner_join(df.g.s, df.comm, c('taxon_name' = 'taxon_name')) 
df.g.s %>% print

ggplot(df.g.s, aes(rel_abund, mean_P)) +
    geom_point() +
    geom_line()


  library                        taxon_name rel_abund_perc rank rel_abund
1       1             Escherichia_coli_1303       48.87428    1 0.4887428
2       1 Clostridium_ljungdahlii_DSM_13528       29.55341    2 0.2955341
3       1 Streptomyces_pratensis_ATCC_33331       21.57231    3 0.2157231
Source: local data frame [3 x 6]

                         taxon_name   mean_P library rel_abund_perc  rank
                              (chr)    (dbl)   (int)          (dbl) (int)
1 Clostridium_ljungdahlii_DSM_13528 1.644610       1       29.55341     2
2             Escherichia_coli_1303 2.751454       1       48.87428     1
3 Streptomyces_pratensis_ATCC_33331 1.224158       1       21.57231     3
Variables not shown: rel_abund (dbl)

In [ ]:


In [ ]:


In [ ]: