Description:

  • dev dataset
  • just using 10 random bacteria genomes
  • dataset copied from wPDF_py
  • re-coding how diffusion is modeled
    • modeling diffusion as a function of fragment length, but altering how it is implemented for speed

Var


In [1]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome10/HTSSIP/'

# params
nprocs = 10
bandwidth = 0.8
DBL_scaling = 0.5
subsample_dist = 'lognormal'
subsample_mean = 9.432
subsample_scale = 0.5
subsample_min = 10000
subsample_max = 30000

Init


In [2]:
import os
import sys

In [3]:
%load_ext rpy2.ipython

In [4]:
%%R
library(dplyr)
library(tidyr)
library(ggplot2)


/home/nick/bin/anaconda3/envs/py35/lib/python3.5/site-packages/rpy2/rinterface/__init__.py:186: RRuntimeWarning: 
Attaching package: ‘dplyr’


  warnings.warn(x, RRuntimeWarning)
/home/nick/bin/anaconda3/envs/py35/lib/python3.5/site-packages/rpy2/rinterface/__init__.py:186: RRuntimeWarning: The following objects are masked from ‘package:stats’:

    filter, lag


  warnings.warn(x, RRuntimeWarning)
/home/nick/bin/anaconda3/envs/py35/lib/python3.5/site-packages/rpy2/rinterface/__init__.py:186: RRuntimeWarning: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


  warnings.warn(x, RRuntimeWarning)

In [5]:
if not os.path.isdir(workDir):
    os.makedirs(workDir)
%cd $workDir


/home/nick/notebook/SIPSim/dev/bac_genome10/HTSSIP

In [6]:
# Determining min/max BD that 
## min G+C cutoff
min_GC = 13.5
## max G+C cutoff
max_GC = 80
## max G+C shift
max_13C_shift_in_BD = 0.036


min_range_BD = min_GC/100.0 * 0.098 + 1.66    
max_range_BD = max_GC/100.0 * 0.098 + 1.66    

max_range_BD = max_range_BD + max_13C_shift_in_BD

print('Min BD: {}'.format(min_range_BD))
print('Max BD: {}'.format(max_range_BD))


Min BD: 1.67323
Max BD: 1.7744

Simulation

Indexing genomes

  • needed for in silico PCR to simulate amplicon-fragments

In [7]:
%%bash -s $nprocs
source activate SIPSim

SIPSim genome_index \
    ../genomes/genomes10.txt \
    --fp ../genomes/ \
    --np $1 \
    > index_log.txt
    
tail index_log.txt


0: 85.38%, 0:00:32.896777
0: 87.32%, 0:00:33.679537
0: 89.26%, 0:00:34.461325
0: 91.20%, 0:00:35.248926
0: 93.14%, 0:00:36.038003
0: 95.08%, 0:00:36.834498
0: 97.02%, 0:00:37.646834
0: 98.96%, 0:00:38.436043
Time used: 0:00:41.231442
Done.
Indexing: "Roseobacter_litoralis_Och_149"
Indexing: "Idiomarina_loihiensis_GSL_199"
Indexing: "Bacillus_cereus_F837_76"
Indexing: "Micrococcus_luteus_NCTC_2665"
Indexing: "Geobacter_lovleyi_SZ"
Indexing: "Cyanobium_gracile_PCC_6307"
Indexing: "Leuconostoc_citreum_KM20"
Indexing: "Escherichia_coli_APEC_O78"
Indexing: "Xanthomonas_axonopodis_Xac29-1"
Indexing: "Nitrosomonas_europaea_ATCC_19718"
#-- All genomes indexed --#

Making incorporator config file


In [30]:
# %%bash -s $nprocs
# source activate SIPSim

# SIPSim incorp_config_example \
#     --percTaxa 50 \
#     --percIncorpMean 50 \
#     --percIncorpSD 1 \
#     --n_reps 3 \
#     > incorp.config
    
# # checking output
# cat incorp.config

In [65]:
incorp_config = """
[1]
    # baseline: no incorporation
    treatment = control
    
    [[intraPopDist 1]]
        distribution = uniform
        
        [[[start]]]
            
            [[[[interPopDist 1]]]]
                distribution = uniform
                start = 0
                end = 0
        
        [[[end]]]
            
            [[[[interPopDist 1]]]]
                distribution = uniform
                start = 0
                end = 0
[2]
    # 'treatment' community: possible incorporation
    treatment = labeled
    max_perc_taxa_incorp = 50
    
    [[intraPopDist 1]]
        distribution = normal
        [[[mu]]]
            [[[[interPopDist 1]]]]
                start = 30
                distribution = uniform
                end = 90
        [[[sigma]]]
            [[[[interPopDist 1]]]]
                start = 1
                distribution = uniform
                end = 1
    

[3]
    # baseline: no incorporation
    treatment = control
    
    [[intraPopDist 1]]
        distribution = uniform
        
        [[[start]]]
            
            [[[[interPopDist 1]]]]
                distribution = uniform
                start = 0
                end = 0
        
        [[[end]]]
            
            [[[[interPopDist 1]]]]
                distribution = uniform
                start = 0
                end = 0
[4]
    # 'treatment' community: possible incorporation
    treatment = labeled
    max_perc_taxa_incorp = 50
    
    [[intraPopDist 1]]
        distribution = normal
        [[[mu]]]
            [[[[interPopDist 1]]]]
                start = 30
                distribution = uniform
                end = 90
        [[[sigma]]]
            [[[[interPopDist 1]]]]
                start = 1
                distribution = uniform
                end = 1
    

[5]
    # baseline: no incorporation
    treatment = control
    
    [[intraPopDist 1]]
        distribution = uniform
        
        [[[start]]]
            
            [[[[interPopDist 1]]]]
                distribution = uniform
                start = 0
                end = 0
        
        [[[end]]]
            
            [[[[interPopDist 1]]]]
                distribution = uniform
                start = 0
                end = 0
[6]
    # 'treatment' community: possible incorporation
    treatment = labeled
    max_perc_taxa_incorp = 50
    
    [[intraPopDist 1]]
        distribution = normal
        [[[mu]]]
            [[[[interPopDist 1]]]]
                start = 30
                distribution = uniform
                end = 90
        [[[sigma]]]
            [[[[interPopDist 1]]]]
                start = 1
                distribution = uniform
                end = 1

"""
with open('incorp.config', 'w') as outF:
    outF.write(incorp_config)

Simulating communities


In [66]:
%%bash 
source activate SIPSim

SIPSim communities \
    --config incorp.config \
    --abund_dist_p mean:4,sigma:1 \
    ../genomes/genomes10.txt \
    > comm.txt
    
head -n 5 comm.txt


library	taxon_name	rel_abund_perc	rank
1	Cyanobium_gracile_PCC_6307	38.674661093	1
1	Nitrosomonas_europaea_ATCC_19718	13.010951761	2
1	Idiomarina_loihiensis_GSL_199	10.533246482	3
1	Bacillus_cereus_F837_76	10.437157914	4

Simulating gradient fractions


In [67]:
%%bash -s $min_range_BD $max_range_BD
source activate SIPSim

SIPSim gradient_fractions \
    --BD_min $1 \
    --BD_max $2 \
    comm.txt \
    > fracs.txt  
    
head -n 5 fracs.txt


library	fraction	BD_min	BD_max	fraction_size
1	1	1.673	1.675	0.002
1	2	1.675	1.678	0.003
1	3	1.678	1.681	0.003
1	4	1.681	1.685	0.004

Simulating fragments


In [68]:
%%bash -s $nprocs 
source activate SIPSim

# creating amplicon fragments
SIPSim fragments \
    ../genomes/genomes10.txt \
    --fp ../genomes/ \
    --fr ../515Fm-927Rm.fna \
    --fld skewed-normal,9000,2500,-5 \
    --flr None,None \
    --np $1 \
    > genome10_ampFrag.pkl


Processing: "Roseobacter_litoralis_Och_149"
Processing: "Idiomarina_loihiensis_GSL_199"
Processing: "Bacillus_cereus_F837_76"
Processing: "Micrococcus_luteus_NCTC_2665"
Processing: "Geobacter_lovleyi_SZ"
Processing: "Cyanobium_gracile_PCC_6307"
Processing: "Leuconostoc_citreum_KM20"
Processing: "Escherichia_coli_APEC_O78"
Processing: "Xanthomonas_axonopodis_Xac29-1"
Processing: "Nitrosomonas_europaea_ATCC_19718"
  Genome name: Leuconostoc_citreum_KM20
  Genome name: Xanthomonas_axonopodis_Xac29-1
  Genome name: Roseobacter_litoralis_Och_149
  Genome name: Geobacter_lovleyi_SZ
  Genome length (bp): 1796284
  Number of amplicons: 4
  Number of fragments simulated: 10000
  Genome name: Idiomarina_loihiensis_GSL_199
  Genome length (bp): 2839759
  Number of amplicons: 4
  Number of fragments simulated: 10000
  Genome length (bp): 3917761
  Number of amplicons: 2
  Number of fragments simulated: 10000
  Genome name: Cyanobium_gracile_PCC_6307
  Genome length (bp): 4505211
  Number of amplicons: 1
  Number of fragments simulated: 10000
  Genome length (bp): 5153455
  Number of amplicons: 2
  Number of fragments simulated: 10000
  Genome name: Micrococcus_luteus_NCTC_2665
  Genome length (bp): 3342364
  Number of amplicons: 2
  Number of fragments simulated: 10000
  Genome name: Nitrosomonas_europaea_ATCC_19718
  Genome length (bp): 2501097
  Number of amplicons: 2
  Number of fragments simulated: 10000
  Genome length (bp): 2812094
  Number of amplicons: 1
  Number of fragments simulated: 10000
  Genome name: Escherichia_coli_APEC_O78
  Genome length (bp): 4798435
  Number of amplicons: 6
  Number of fragments simulated: 10000
  Genome name: Bacillus_cereus_F837_76
  Genome length (bp): 5222906
  Number of amplicons: 12
  Number of fragments simulated: 10000

Converting to KDE objects


In [69]:
%%bash -s $bandwidth
source activate SIPSim

SIPSim fragment_KDE \
    genome10_ampFrag.pkl \
    --bw $1 \
    > genome10_ampFrag_KDE.pkl

Checking fragment KDEs


In [70]:
%%bash -s $nprocs 
source activate SIPSim

# info on KDEs
SIPSim KDE_info \
    -s genome10_ampFrag_KDE.pkl \
    > genome10_ampFrag_info.txt


Loading KDEs...

In [71]:
%%R 
# loading
df = read.delim('genome10_ampFrag_info.txt', sep='\t')
df_kde1 = df %>%
    filter(KDE_ID == 1)
df_kde1 %>% head(n=3)

BD_GC50 = 0.098 * 0.5 + 1.66

In [72]:
%%R -w 500 -h 400
# plotting
p_amp = ggplot(df_kde1, aes(taxon_ID, median)) +
    geom_pointrange(aes(ymin=median-stdev, ymax=median+stdev)) +
    geom_hline(yintercept=BD_GC50, linetype='dashed', color='red', alpha=0.7) +
    labs(x='Median buoyant density') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=65, hjust=1)
    )
plot(p_amp)


Adding diffusion


In [73]:
%%bash -s $bandwidth $nprocs
source activate SIPSim

SIPSim diffusion \
    genome10_ampFrag_KDE.pkl \
    --bw $1 \
    --np $2 \
    > genome10_ampFrag_dif_KDE.pkl


Index size: 90508
Processing: Roseobacter_litoralis_Och_149
Processing: Leuconostoc_citreum_KM20
Processing: Idiomarina_loihiensis_GSL_199
Processing: Xanthomonas_axonopodis_Xac29-1
Processing: Micrococcus_luteus_NCTC_2665
Processing: Escherichia_coli_APEC_O78
Processing: Nitrosomonas_europaea_ATCC_19718
Processing: Bacillus_cereus_F837_76
Processing: Geobacter_lovleyi_SZ
Processing: Cyanobium_gracile_PCC_6307

Adding DBL effects


In [74]:
%%bash -s $DBL_scaling $nprocs
source activate SIPSim

SIPSim DBL \
    --comm comm.txt \
    --commx $1 \
    --np $2 \
    -o genome10_ampFrag_dif_DBL_KDE.pkl \
    genome10_ampFrag_dif_KDE.pkl \
    2> genome10_ampFrag_dif_DBL_KDE.log
    
# checking output    
head -n 5 genome10_ampFrag_dif_DBL_KDE.log
echo "..."
tail -n 5 genome10_ampFrag_dif_DBL_KDE.log


DBL_index file written: "DBL_index.txt"
Processing: Bacillus_cereus_F837_76
Processing: Escherichia_coli_APEC_O78
Processing: Micrococcus_luteus_NCTC_2665
Processing: Roseobacter_litoralis_Och_149
...
Processing: Nitrosomonas_europaea_ATCC_19718
Processing: Xanthomonas_axonopodis_Xac29-1
Processing: Cyanobium_gracile_PCC_6307
Processing: Leuconostoc_citreum_KM20
Processing: Geobacter_lovleyi_SZ

Adding isotope incorporation

Selecting incorporators


In [75]:
%%bash -s $nprocs
source activate SIPSim

SIPSim KDE_select_taxa \
    -f 0.5 \
    genome10_ampFrag_KDE.pkl \
    > incorporators.txt
    
tail incorporators.txt


Bacillus_cereus_F837_76
Geobacter_lovleyi_SZ
Nitrosomonas_europaea_ATCC_19718
Escherichia_coli_APEC_O78
Micrococcus_luteus_NCTC_2665
Loading KDEs...
Subsampled 5 taxa

Simulating incorp


In [76]:
%%bash -s $nprocs
source activate SIPSim

SIPSim isotope_incorp \
    --comm comm.txt \
    --shift genome10_ampFrag_BD-shift.txt \
    --taxa incorporators.txt \
    --np $1 \
    -o genome10_ampFrag_dif_DBL_incorp_KDE.pkl \
    genome10_ampFrag_dif_DBL_KDE.pkl \
    incorp.config \
    2> genome10_ampFrag_dif_DBL_incorp_KDE.log
    
# checking log
head -n 5 genome10_ampFrag_dif_DBL_incorp_KDE.log
echo "..."
tail -n 5 genome10_ampFrag_dif_DBL_incorp_KDE.log


Loading KDE object...
Processing library: 1
Processing: Bacillus_cereus_F837_76
Processing: Escherichia_coli_APEC_O78
Processing: Micrococcus_luteus_NCTC_2665
...
Processing: Xanthomonas_axonopodis_Xac29-1
Processing: Cyanobium_gracile_PCC_6307
Processing: Leuconostoc_citreum_KM20
Processing: Geobacter_lovleyi_SZ
File written: genome10_ampFrag_BD-shift.txt

Plotting BD shift

  • distribution of BD shifts for all fragments per genome

In [77]:
%%R -w 1000 -h 350
df = read.delim('genome10_ampFrag_BD-shift.txt', sep='\t') %>%
    mutate(library = library %>% as.character)

p = ggplot(df, aes(taxon, mean, fill=library)) +
    geom_boxplot(aes(ymin=min, lower=q25, middle=median, upper=q75, ymax=max)) +
    labs(y='BD shift') + 
    theme_bw() +
    theme(
        axis.text.x = element_text(angle=45, hjust=1)
    )
plot(p)


Simulating OTU table


In [ ]:
%%bash -s $nprocs
source activate SIPSim

SIPSim OTU_table \
    --abs 1e9 \
    --np $1 \
    genome10_ampFrag_dif_DBL_incorp_KDE.pkl \
    comm.txt \
    fracs.txt \
    > OTU_n3_abs1e9.txt \
    2> OTU_n3_abs1e9.log 
    
# checking log
head -n 3 OTU_n3_abs1e9.log 
echo '...'
tail -n 3 OTU_n3_abs1e9.log

Creating 'wide' table


In [ ]:
%%bash 
source activate SIPSim

SIPSim OTU_wide_long -w \
    OTU_n3_abs1e9.txt \
    > OTU_n3_abs1e9_w.txt

Dataset from 'raw' counts

Creating 'wide' OTU table


In [ ]:
%%bash 
source activate SIPSim

SIPSim OTU_wide_long -w \
    OTU_n3_abs1e9.txt \
    > OTU_n3_abs1e9_w.txt

Making metadata


In [ ]:
%%bash 
source activate SIPSim

SIPSim OTU_sample_data \
    OTU_n3_abs1e9.txt \
    > OTU_n3_abs1e9_meta.txt

Simulating qPCR data


In [ ]:
%%bash 
source activate SIPSim

SIPSim qSIP \
    OTU_n3_abs1e9.txt \
    OTU_n3_abs1e9.txt \
    > OTU_n3_abs1e9_qSIP.txt

Creating phyloseq object


In [ ]:
%%bash 
source activate SIPSim

# getting around `WARNING: ignoring environment value of R_HOME` to STDOUT error
unset R_HOME

# making phyloseq object
phyloseq_make.R OTU_n3_abs1e9_w.txt \
    -s OTU_n3_abs1e9_meta.txt \
   > OTU_n3_abs1e9.physeq

Simulating PCR


In [ ]:
%%bash
source activate SIPSim

SIPSim OTU_PCR \
    OTU_n3_abs1e9.txt \
    > OTU_n3_abs1e9_PCR.txt

Subsampling from the OTU table

  • simulating sequencing of the DNA pool

In [ ]:
%%bash -s $subsample_dist $subsample_mean $subsample_scale $subsample_min $subsample_max
source activate SIPSim

SIPSim OTU_subsample \
    --dist $1 \
    --dist_params mean:$2,sigma:$3 \
    --min_size $4 \
    --max_size $5 \
    OTU_n3_abs1e9_PCR.txt \
    > OTU_n3_abs1e9_PCR_subNorm.txt

Creating 'wide' OTU table


In [ ]:
%%bash 
source activate SIPSim

SIPSim OTU_wide_long -w \
    OTU_n3_abs1e9_PCR_subNorm.txt \
    > OTU_n3_abs1e9_PCR_subNorm_w.txt

Making metadata


In [ ]:
%%bash 
source activate SIPSim

SIPSim OTU_sample_data \
    OTU_n3_abs1e9_PCR_subNorm.txt \
    > OTU_n3_abs1e9_PCR_subNorm_meta.txt

Simulating qPCR data

  • needed for qSIP

In [ ]:
%%bash 
source activate SIPSim

SIPSim qSIP \
    OTU_n3_abs1e9.txt \
    OTU_n3_abs1e9_PCR_subNorm.txt \
    > OTU_n3_abs1e9_PCR_subNorm_qSIP.txt

Creating phyloseq object


In [ ]:
%%bash 
source activate SIPSim

# getting around `WARNING: ignoring environment value of R_HOME` to STDOUT error
unset R_HOME

# making phyloseq object
phyloseq_make.R OTU_n3_abs1e9_PCR_subNorm_w.txt \
    -s OTU_n3_abs1e9_PCR_subNorm_meta.txt \
   > OTU_n3_abs1e9_PCR_subNorm.physeq

In [ ]: