Description:

For emperical data, most taxa (>0.1% abundance) are detected across the entire gradient.
Checking whether a similar pattern is seen with the simulated genome data

Setting variables



In [65]:

    
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/validation/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figDir = '/home/nick/notebook/SIPSim/figures/'

nprocs = 3

Init



In [66]:

    
import os
import numpy as np
import dill
import pandas as pd
%load_ext rpy2.ipython









    



The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython



In [67]:

    
%%R
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(gridExtra)



In [68]:

    
if not os.path.isdir(workDir):
    os.makedirs(workDir)

Determining the probability of detecting the taxa across the entire gradient



In [69]:

    
# max 13C shift
max_13C_shift_in_BD = 0.036
# min BD (that we care about)
min_GC = 13.5
min_BD = min_GC/100.0 * 0.098 + 1.66
# max BD (that we care about)
max_GC = 80
max_BD = max_GC / 100.0 * 0.098 + 1.66    # 80.0% G+C
max_BD = max_BD + max_13C_shift_in_BD
## BD range of values
BD_vals = np.arange(min_BD, max_BD, 0.001)

skewed normal distribution



In [70]:

    
F = os.path.join(workDir, 'ampFrags_real_kde_dif.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde









    Out[70]:





{'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f9ac9893d90>,
 'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f9ac98530d0>,
 'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f9ac9853710>}



In [71]:

    
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    pdf[k] = v.evaluate(BD_vals)
pdf.keys()









    Out[71]:





['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']



In [72]:

    
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)









    Out[72]:






  
    
      
      Clostridium_ljungdahlii_DSM_13528
      Escherichia_coli_1303
      Streptomyces_pratensis_ATCC_33331
      BD
    
  
  
    
      0
      0.000053
      1.141365e-93
      1.218480e-43
      1.67323
    
    
      1
      0.000376
      1.954533e-78
      1.137313e-36
      1.67423
    
    
      2
      0.000838
      1.302936e-64
      2.278353e-30
      1.67523



In [73]:

    
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-15)

x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))

p1.skn = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    labs(x=x.lab, y='Probability density') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2.skn = p1.skn + scale_y_log10()

grid.arrange(p1.skn, p2.skn, ncol=2)

small uniform distribution



In [74]:

    
F = os.path.join(workDir, 'ampFrags_sm_kde_dif.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde









    Out[74]:





{'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f9ac9603510>,
 'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f9ac9603810>,
 'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f9ac96034d0>}



In [75]:

    
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    pdf[k] = v.evaluate(BD_vals)
pdf.keys()









    Out[75]:





['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']



In [76]:

    
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)









    Out[76]:






  
    
      
      Clostridium_ljungdahlii_DSM_13528
      Escherichia_coli_1303
      Streptomyces_pratensis_ATCC_33331
      BD
    
  
  
    
      0
      0.000171
      5.983261e-22
      4.797488e-57
      1.67323
    
    
      1
      0.000478
      1.155676e-18
      3.764569e-51
      1.67423
    
    
      2
      0.000883
      1.045137e-15
      1.337521e-45
      1.67523



In [77]:

    
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-9)



p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2 = p1 + scale_y_log10()

grid.arrange(p1, p2, ncol=2)

Notes

Even with fragment sizes of 1-2 kb, the taxa would likely not be detected even if the gradient contained 1e9 16S copies of the taxon.
- Does this make sense based on the theory of diffusion used?

with DBL 'smearing'

Determining the probability of detecting in all fragments

skewed normal distribution



In [78]:

    
BD_vals = np.arange(min_BD, max_BD, 0.001)



In [79]:

    
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde









    Out[79]:





{'1': {'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f9ac9603710>,
  'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f9ac9853350>,
  'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f9ac9853990>}}



In [80]:

    
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    for kk,vv in v.items():
        pdf[kk] = vv.evaluate(BD_vals)
pdf.keys()









    Out[80]:





['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']



In [81]:

    
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)









    Out[81]:






  
    
      
      Clostridium_ljungdahlii_DSM_13528
      Escherichia_coli_1303
      Streptomyces_pratensis_ATCC_33331
      BD
    
  
  
    
      0
      0.008254
      0.011652
      0.005325
      1.67323
    
    
      1
      0.009910
      0.009692
      0.006718
      1.67423
    
    
      2
      0.008902
      0.006997
      0.008998
      1.67523



In [82]:

    
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-15)

x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))

p1.skn.dbl = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    labs(x=x.lab, y='Probability density') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2.skn.dbl = p1.skn.dbl + scale_y_log10()

grid.arrange(p1.skn.dbl, p2.skn.dbl, ncol=2)

Notes

Even if 1% of DNA is in DBL (that then diffuses back into the gradient):
- the probably of detecting a taxa in all the gradient positions is >= 1e-7
  - this is feasible for matching the emperical data!

Combined plot



In [84]:

    
%%R -w 800 -h 300

# plot formatting
title.size=16
p2.skn.f = p2.skn +
    ggtitle('Gaussian BD') +
    theme(
        plot.title = element_text(size=title.size)
    )
p2.skn.dbl.f = p2.skn.dbl +
    ggtitle('Gaussian BD + DBL') +
    theme(
        plot.title = element_text(size=title.size)
    )

# combined plot
#p.comb = cowplot::plot_grid(p2.skn.f, p2.skn.dbl.f, labels=c('A)', 'B)'), align='h')
p.comb = cowplot::ggdraw() +
    geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), fill='white') +
    cowplot::draw_plot(p2.skn.f, 0.01, 0.01, 0.49, 0.99) +
    cowplot::draw_plot(p2.skn.dbl.f, 0.5, 0.01, 0.49, 0.99) +
    cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0.5), c(0.99, 0.99))
p.comb



In [85]:

    
%%R -i workDir
# writting plot
outFile = file.path(workDir, 'DBL_example_log10.pdf') 
ggsave(outFile, p.comb, width=10, height=3.75)
cat('File written:', outFile, '\n')









    





File written: /home/nick/notebook/SIPSim/dev/bac_genome3/validation//DBL_example_log10.pdf

Combined plot (v2)



In [102]:

    
%%R -w 800 -h 300

p1.skn.e = p1.skn +
    scale_x_continuous(limits=c(1.675, 1.775))
p2.skn.e = p2.skn +
    scale_x_continuous(limits=c(1.675, 1.775)) +
    scale_y_log10(limits=c(1e-12, 150))
p1.skn.dbl.e = p1.skn.dbl +
    scale_x_continuous(limits=c(1.675, 1.775))
p2.skn.dbl.e = p2.skn.dbl +
    scale_x_continuous(limits=c(1.675, 1.775)) +
    scale_y_log10(limits=c(1e-12, 150))


p.comb = cowplot::ggdraw() +
    geom_rect(aes(xmin=0, ymin=0, xmax=1, ymax=1), fill='white') +
    cowplot::draw_plot(p2.skn.e, 0.01, 0.01, 0.49, 0.99) +
    cowplot::draw_plot(p2.skn.dbl.e, 0.5, 0.01, 0.49, 0.99) +
    cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0.5), c(0.99, 0.99))
p.comb



In [103]:

    
%%R -i workDir
# writting plot
outFile = file.path(workDir, 'DBL_example_log10.pdf') 
ggsave(outFile, p.comb, width=10, height=3.75)
cat('File written:', outFile, '\n')









    





File written: /home/nick/notebook/SIPSim/dev/bac_genome3/validation//DBL_example_log10.pdf

small fragment size distribution



In [54]:

    
BD_vals = np.arange(min_BD, max_BD, 0.001)



In [55]:

    
F = os.path.join(workDir, 'ampFrags_sm_kde_dif_DBL.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde









    Out[55]:





{'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f5786775a90>,
 'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f5786a0e090>,
 'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f57867941d0>}



In [56]:

    
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    pdf[k] = v.evaluate(BD_vals)
pdf.keys()









    Out[56]:





['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']



In [57]:

    
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)









    Out[57]:






  
    
      
      Clostridium_ljungdahlii_DSM_13528
      Escherichia_coli_1303
      Streptomyces_pratensis_ATCC_33331
      BD
    
  
  
    
      0
      0.003643
      0.008033
      0.008235
      1.67323
    
    
      1
      0.004997
      0.008003
      0.007361
      1.67423
    
    
      2
      0.008100
      0.008035
      0.007202
      1.67523



In [58]:

    
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-9)

p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2 = p1 + scale_y_log10()

grid.arrange(p1, p2, ncol=2)

with DBL 'smearing' (smaller DBL)

Determining the probability of detecting in all fragments

skewed normal distribution



In [59]:

    
BD_vals = np.arange(min_BD, max_BD, 0.001)



In [60]:

    
F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL_fa1e-4.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde









    Out[60]:





{'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f5786775690>,
 'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f5786775f90>,
 'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f5786775c90>}



In [61]:

    
# probability at each location in gradient
pdf = {}
for k,v in kde.items():
    pdf[k] = v.evaluate(BD_vals)
pdf.keys()









    Out[61]:





['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']



In [62]:

    
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)









    Out[62]:






  
    
      
      Clostridium_ljungdahlii_DSM_13528
      Escherichia_coli_1303
      Streptomyces_pratensis_ATCC_33331
      BD
    
  
  
    
      0
      1.631444e-06
      0.000375
      0.000216
      1.67323
    
    
      1
      5.361316e-07
      0.001505
      0.001849
      1.67423
    
    
      2
      3.456207e-04
      0.002166
      0.000028
      1.67523



In [63]:

    
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-9)

p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2 = p1 + scale_y_log10()

grid.arrange(p1, p2, ncol=2)

DBL with abundance-weighted smearing



In [131]:

    
BD_vals = np.arange(min_BD, max_BD, 0.001)

F = os.path.join(workDir, 'ampFrags_real_kde_dif_DBL-comm.pkl')
with open(F, 'rb') as inFH:
    kde = dill.load(inFH)
kde









    Out[131]:





{'1': {'Clostridium_ljungdahlii_DSM_13528': <scipy.stats.kde.gaussian_kde at 0x7f5786694390>,
  'Escherichia_coli_1303': <scipy.stats.kde.gaussian_kde at 0x7f5786a2a710>,
  'Streptomyces_pratensis_ATCC_33331': <scipy.stats.kde.gaussian_kde at 0x7f5786723490>}}



In [132]:

    
# probability at each location in gradient
pdf = {}
for libID,v in kde.items():
    for taxon,k in v.items():
        pdf[taxon] = k.evaluate(BD_vals)
pdf.keys()









    Out[132]:





['Clostridium_ljungdahlii_DSM_13528',
 'Escherichia_coli_1303',
 'Streptomyces_pratensis_ATCC_33331']



In [133]:

    
df = pd.DataFrame(pdf)
df['BD'] = BD_vals
df.head(n=3)









    Out[133]:






  
    
      
      Clostridium_ljungdahlii_DSM_13528
      Escherichia_coli_1303
      Streptomyces_pratensis_ATCC_33331
      BD
    
  
  
    
      0
      1.691746
      2.760888
      1.230768
      1.67323
    
    
      1
      1.699645
      2.763073
      1.213971
      1.67423
    
    
      2
      1.706747
      2.762859
      1.197765
      1.67523



In [134]:

    
%%R -i df -w 800 -h 350

df.g = apply(df, 2, as.numeric) %>% as.data.frame %>%
    gather(taxon_name, P, 1:3) %>%
    mutate(BD = as.numeric(BD),
           P = as.numeric(P),
           taxon_name = as.character(taxon_name)) %>%
    filter(P > 1e-9)

p1 = ggplot(df.g, aes(BD, P, color=taxon_name)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        legend.position = 'none'
    )

p2 = p1 + scale_y_log10()

grid.arrange(p1, p2, ncol=2)



In [135]:

    
%%R
df.g %>%
    group_by(taxon_name) %>%
    summarize(max_P = max(P),
              min_P = min(P)) %>% print









    





Source: local data frame [3 x 3]

                         taxon_name    max_P    min_P
                              (chr)    (dbl)    (dbl)
1 Clostridium_ljungdahlii_DSM_13528 60.98170 1.546809
2             Escherichia_coli_1303 68.20404 2.708163
3 Streptomyces_pratensis_ATCC_33331 76.69533 1.167650

Plotting pre-frac abundance vs heavy fraction P



In [136]:

    
%%R -i workDir

F = file.path(workDir, 'comm.txt')
df.comm = read.delim(F, sep='\t') %>%
    mutate(rel_abund = rel_abund_perc / 100)
df.comm %>% print

df.g.s = df.g %>%
    filter(BD > 1.75) %>%
    group_by(BD) %>%
    mutate(P_rel_abund = P / sum(P)) %>%
    group_by(taxon_name) %>%
    summarize(mean_P = mean(P))

df.g.s = inner_join(df.g.s, df.comm, c('taxon_name' = 'taxon_name')) 
df.g.s %>% print

ggplot(df.g.s, aes(rel_abund, mean_P)) +
    geom_point() +
    geom_line()









    





  library                        taxon_name rel_abund_perc rank rel_abund
1       1             Escherichia_coli_1303       48.87428    1 0.4887428
2       1 Clostridium_ljungdahlii_DSM_13528       29.55341    2 0.2955341
3       1 Streptomyces_pratensis_ATCC_33331       21.57231    3 0.2157231
Source: local data frame [3 x 6]

                         taxon_name   mean_P library rel_abund_perc  rank
                              (chr)    (dbl)   (int)          (dbl) (int)
1 Clostridium_ljungdahlii_DSM_13528 1.644610       1       29.55341     2
2             Escherichia_coli_1303 2.751454       1       48.87428     1
3 Streptomyces_pratensis_ATCC_33331 1.224158       1       21.57231     3
Variables not shown: rel_abund (dbl)



In [ ]:



In [ ]:



In [ ]:

	Clostridium_ljungdahlii_DSM_13528	Escherichia_coli_1303	Streptomyces_pratensis_ATCC_33331	BD
0	0.000053	1.141365e-93	1.218480e-43	1.67323
1	0.000376	1.954533e-78	1.137313e-36	1.67423
2	0.000838	1.302936e-64	2.278353e-30	1.67523

	Clostridium_ljungdahlii_DSM_13528	Escherichia_coli_1303	Streptomyces_pratensis_ATCC_33331	BD
0	0.000171	5.983261e-22	4.797488e-57	1.67323
1	0.000478	1.155676e-18	3.764569e-51	1.67423
2	0.000883	1.045137e-15	1.337521e-45	1.67523

	Clostridium_ljungdahlii_DSM_13528	Escherichia_coli_1303	Streptomyces_pratensis_ATCC_33331	BD
0	0.008254	0.011652	0.005325	1.67323
1	0.009910	0.009692	0.006718	1.67423
2	0.008902	0.006997	0.008998	1.67523

	Clostridium_ljungdahlii_DSM_13528	Escherichia_coli_1303	Streptomyces_pratensis_ATCC_33331	BD
0	0.003643	0.008033	0.008235	1.67323
1	0.004997	0.008003	0.007361	1.67423
2	0.008100	0.008035	0.007202	1.67523

	Clostridium_ljungdahlii_DSM_13528	Escherichia_coli_1303	Streptomyces_pratensis_ATCC_33331	BD
0	1.631444e-06	0.000375	0.000216	1.67323
1	5.361316e-07	0.001505	0.001849	1.67423
2	3.456207e-04	0.002166	0.000028	1.67523

	Clostridium_ljungdahlii_DSM_13528	Escherichia_coli_1303	Streptomyces_pratensis_ATCC_33331	BD
0	1.691746	2.760888	1.230768	1.67323
1	1.699645	2.763073	1.213971	1.67423
2	1.706747	2.762859	1.197765	1.67523