Running SIPSim pipeline to simulate priming_exp gradient dataset

  • Basing simulation params off of priming_exp dataset
    • Basing starting community diversity on mean percent abundances in all fraction samples for the gradient
    • Other parameters are 'default'

Setting variables


In [47]:
workDir = '/home/nick/notebook/SIPSim/dev/priming_exp/validation_sample/X12C.700.14_fracRichness-moreDif/'
genomeDir = '/home/nick/notebook/SIPSim/dev/priming_exp/genomes/'
allAmpFrags = '/home/nick/notebook/SIPSim/dev/bac_genome1210/validation/ampFrags.pkl'
otuTableFile = '/var/seq_data/priming_exp/data/otu_table.txt'
metaDataFile = '/var/seq_data/priming_exp/data/allsample_metadata_nomock.txt'
primerFile = '/home/nick/notebook/SIPSim/dev/515F-806R.fna'

cdhit_dir = '/home/nick/notebook/SIPSim/dev/priming_exp/CD-HIT/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figureDir = '/home/nick/notebook/SIPSim/figures/'

# total dataset files
#allAmpFrags = '/home/nick/notebook/SIPSim/dev/bac_genome1210/validation/ampFrags.pkl'
genomeAllDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'
genomeAllIndex = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/genome_index.txt'

# simulation params
comm_richness =  6606
seq_per_fraction = ['lognormal', 10.096, 1.116]

# for making genome_map file for genome fragment simulation
taxonMapFile = os.path.join(cdhit_dir, 'target_taxa.txt')
genomeFilterFile = os.path.join(cdhit_dir, 'genomeFile_seqID_filt.txt')
abundFile = os.path.join('/home/nick/notebook/SIPSim/dev/priming_exp/exp_info', 'X12C.700.14_frac_OTU.txt')

# misc
nprocs = 20

Init


In [22]:
import glob
import cPickle as pickle
import copy
from IPython.display import Image

In [23]:
%load_ext rpy2.ipython


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

In [24]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)

In [25]:
if not os.path.isdir(workDir):
    os.makedirs(workDir)

Creating a community file from the fraction relative abundances


In [26]:
%%R -i abundFile
# reading priming experiment OTU table
tbl.abund = read.delim(abundFile, sep='\t')
tbl.abund %>% head


      OTUId mean_perc_abund median_perc_abund max_perc_abund      sample
1     OTU.1     3.827532325       3.926317300    5.603168720 X12C.700.14
2    OTU.10     0.048631000       0.048182090    0.085456173 X12C.700.14
3   OTU.100     0.308850910       0.312166472    0.508460092 X12C.700.14
4  OTU.1000     0.013548909       0.013883827    0.024440914 X12C.700.14
5 OTU.10000     0.001464408       0.001464408    0.001464408 X12C.700.14
6  OTU.1001     0.005668025       0.005251645    0.016415420 X12C.700.14

In [27]:
%%R
tbl.comm = tbl.abund %>%
    rename('taxon_name' = OTUId,
           'rel_abund_perc' = mean_perc_abund) %>%
    select(taxon_name, rel_abund_perc) %>%
    mutate(library = '1',
           rank = row_number(-rel_abund_perc)) %>%
    arrange(rank)
    
tbl.comm %>% head


  taxon_name rel_abund_perc library rank
1      OTU.1       3.827532       1    1
2     OTU.68       2.088559       1    2
3      OTU.8       1.422617       1    3
4      OTU.3       1.315828       1    4
5      OTU.7       1.276686       1    5
6      OTU.2       1.264926       1    6

In [28]:
%%R
# rescaling rel_abund_perc so sum(rel_abund_perc) = 100
tbl.comm = tbl.comm %>%
    group_by(library) %>%
    mutate(total = sum(rel_abund_perc)) %>% 
    ungroup() %>%
    mutate(rel_abund_perc = rel_abund_perc * 100 / total) %>%
    select(library, taxon_name, rel_abund_perc, rank)
    
tbl.comm %>% head


Source: local data frame [6 x 4]

  library taxon_name rel_abund_perc rank
1       1      OTU.1       3.194891    1
2       1     OTU.68       1.743347    2
3       1      OTU.8       1.187477    3
4       1      OTU.3       1.098338    4
5       1      OTU.7       1.065667    5
6       1      OTU.2       1.055850    6

In [29]:
%%R -i comm_richness
# number of OTUs
n.OTUs = tbl.comm$taxon_name %>% unique %>% length
cat('Number of OTUs:', n.OTUs, '\n')

# assertion
cat('Community richness = number of OTUs?  ', comm_richness == n.OTUs, '\n')


Number of OTUs: 6606 
Community richness = number of OTUs?   TRUE 

In [30]:
%%R -i workDir

commFile = paste(c(workDir, 'comm.txt'), collapse='/')
write.table(tbl.comm, commFile, sep='\t', quote=F, row.names=F)

Plotting community distribution


In [31]:
%%R -i workDir

commFile = paste(c(workDir, 'comm.txt'), collapse='/')
comm = read.delim(commFile, sep='\t')
comm %>% head


  library taxon_name rel_abund_perc rank
1       1      OTU.1       3.194891    1
2       1     OTU.68       1.743347    2
3       1      OTU.8       1.187477    3
4       1      OTU.3       1.098338    4
5       1      OTU.7       1.065667    5
6       1      OTU.2       1.055850    6

In [32]:
%%R -w 900 -h 350

ggplot(comm, aes(rank, rel_abund_perc)) +
    geom_point() +
    labs(x='Rank', y='% relative abundance', title='Priming experiment community abundance distribution') +
    theme_bw() +
    theme(
        text = element_text(size=16)
        )


Simulating fragments

Making a genome index file to map genome fasta files to OTUs

  • Will be used for community simulation
  • Just OTUs with association to genomes

In [33]:
%%R -i taxonMapFile -i genomeFilterFile 

taxonMap = read.delim(taxonMapFile, sep='\t') %>%
    select(target_genome, OTU) %>%
    distinct()
taxonMap %>% nrow %>% print
taxonMap %>% head(n=3) %>% print

breaker = '----------------\n'
cat(breaker)

genomeFilter = read.delim(genomeFilterFile, sep='\t', header=F) 
genomeFilter %>% nrow %>% print
genomeFilter %>% head(n=3) %>% print

cat(breaker)

comm = read.delim(commFile, sep='\t') 
comm %>% nrow %>% print
comm %>% head(n=3) %>% print


[1] 236
                               target_genome      OTU
1 CP001738_Thermomonospora_curvata_DSM_43183 OTU.8540
2 CP001738_Thermomonospora_curvata_DSM_43183 OTU.9267
3 CP001738_Thermomonospora_curvata_DSM_43183 OTU.1457
----------------
[1] 187
                                                         V1
1                  CP003093_Pseudoxanthomonas_spadix_BD_a59
2                  CP000511_Mycobacterium_vanbaalenii_PYR_1
3 CP003344_Desulfitobacterium_dichloroeliminans_LMG_P_21439
                                                                                                            V2
1                  /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Pseudoxanthomonas_spadix_BD-a59.fasta
2                  /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Mycobacterium_vanbaalenii_PYR-1.fasta
3 /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Desulfitobacterium_dichloroeliminans_LMG_P-21439.fasta
----------------
[1] 6606
  library taxon_name rel_abund_perc rank
1       1      OTU.1       3.194891    1
2       1     OTU.68       1.743347    2
3       1      OTU.8       1.187477    3

In [34]:
%%R
taxonMap$OTU %>% table %>% sort(decreasing=T) %>% head


.
    OTU.1    OTU.10   OTU.101   OTU.102 OTU.10237  OTU.1035 
        1         1         1         1         1         1 

In [35]:
%%R

tbl.j = inner_join(taxonMap, genomeFilter, c('target_genome' = 'V1')) %>%
     rename('fasta_file' = V2) %>%
     select(OTU, fasta_file, target_genome)

tbl.j %>% head(n=3)


       OTU
1 OTU.8540
2 OTU.9267
3 OTU.1457
                                                                                     fasta_file
1 /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Thermomonospora_curvata_DSM_43183.fasta
2 /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Thermomonospora_curvata_DSM_43183.fasta
3 /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Thermomonospora_curvata_DSM_43183.fasta
                               target_genome
1 CP001738_Thermomonospora_curvata_DSM_43183
2 CP001738_Thermomonospora_curvata_DSM_43183
3 CP001738_Thermomonospora_curvata_DSM_43183

In [36]:
%%R
tbl.j$OTU %>% table %>% sort(decreasing=T) %>% head


.
    OTU.1    OTU.10   OTU.101   OTU.102 OTU.10237  OTU.1035 
        1         1         1         1         1         1 

In [37]:
%%R
tbl.j2 = inner_join(tbl.j, comm, c('OTU' = 'taxon_name')) 

n.target.genomes = tbl.j2$OTU %>% unique %>% length
cat('Number of target OTUs: ', n.target.genomes, '\n')
cat('--------', '\n')
tbl.j2 %>% head(n=3)


Number of target OTUs:  198 
-------- 
       OTU
1 OTU.8540
2 OTU.9267
3 OTU.1457
                                                                                     fasta_file
1 /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Thermomonospora_curvata_DSM_43183.fasta
2 /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Thermomonospora_curvata_DSM_43183.fasta
3 /home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/Thermomonospora_curvata_DSM_43183.fasta
                               target_genome library rel_abund_perc rank
1 CP001738_Thermomonospora_curvata_DSM_43183       1    0.002582583 4561
2 CP001738_Thermomonospora_curvata_DSM_43183       1    0.005045402 2191
3 CP001738_Thermomonospora_curvata_DSM_43183       1    0.011258918 1106

In [38]:
%%R -i workDir

outFile = paste(c(workDir, 'target_genome_index.txt'), collapse='/')
write.table(tbl.j2, outFile, sep='\t', quote=F, row.names=F, col.names=F)

Plotting community abundance distribution of target genomes


In [39]:
%%R -w 900 -h 350

ggplot(tbl.j2, aes(rank, rel_abund_perc)) +
    geom_point(size=3, shape='O', color='red') +
    labs(x='Rank', y='% relative abundance', title='Priming experiment community abundance distribution') +
    theme_bw() +
    theme(
        text = element_text(size=16)
        )


Simulating fragments of genomes that match priming_exp bulk OTUs


In [43]:
!cd $workDir; \
    SIPSim fragments \
    target_genome_index.txt \
    --fp $genomeDir \
    --fr $primerFile \
    --fld skewed-normal,5000,2000,-5 \
    --flr None,None \
    --nf 10000 \
    --np $nprocs \
    --tbl \
    2> ampFrags.log \
    > ampFrags.txt

Plotting fragment length distribution


In [44]:
%%R -i workDir

inFile = paste(c(workDir, 'ampFrags.txt'), collapse='/')

tbl = read.delim(inFile, sep='\t')
tbl %>% head(n=3)


  taxon_name                                 scaffoldID fragStart fragLength
1   OTU.8540 CP001738_Thermomonospora_curvata_DSM_43183   5586544       3830
2   OTU.8540 CP001738_Thermomonospora_curvata_DSM_43183    266123       3970
3   OTU.8540 CP001738_Thermomonospora_curvata_DSM_43183   2985740       2451
    fragGC
1 68.19843
2 67.83375
3 72.46022

In [50]:
%%R -w 950 -h 650

some.taxa = tbl$taxon_name %>% unique %>% head(n=20)

tbl.f = tbl %>% 
    filter(taxon_name %in% some.taxa)

ggplot(tbl.f, aes(fragGC, fragLength)) +
    stat_density2d() +
    labs(x='Fragment G+C', y='Fragment length (bp)') +
    facet_wrap(~ taxon_name, ncol=5) +
    theme_bw() +
    theme(
        text=element_text(size=16),
        axis.title.y=element_text(vjust=1)
        )



In [46]:
# re-running simulation with pickled file

!cd $workDir; \
    SIPSim fragments \
    target_genome_index.txt \
    --fp $genomeDir \
    --fr $primerFile \
    --fld skewed-normal,5000,2000,-5 \
    --flr None,None \
    --nf 10000 \
    --np $nprocs \
    2> ampFrags.log \
    > ampFrags.pkl

Simulating fragments of total dataset with a greater diffusion


In [48]:
!cd $workDir; \
    SIPSim fragments \
    $genomeAllIndex \
    --fp $genomeAllDir \
    --fr $primerFile \
    --fld skewed-normal,5000,2000,-5 \
    --flr None,None \
    --nf 10000 \
    --np $nprocs \
    2> ampFragsAll.log \
    > ampFragsAll.pkl

In [49]:
ampFragsAllFile = os.path.join(workDir, 'ampFragsAll.pkl')

Appending fragments from randomly selected genomes of total dataset (n=1210)

  • This is to obtain the richness of the bulk soil community
  • Random OTUs will be named after non-target OTUs in comm file

Making list of non-target OTUs


In [51]:
%%R -i workDir
# loading files

## target genome index (just OTUs with associated genome)
inFile = paste(c(workDir, 'target_genome_index.txt'), collapse='/')
tbl.target = read.delim(inFile, sep='\t', header=F)
colnames(tbl.target) = c('OTUId', 'fasta_file', 'genome_name')

## comm file of total community OTUs 
commFile = paste(c(workDir, 'comm.txt'), collapse='/')
tbl.comm = read.delim(commFile, sep='\t')

In [52]:
%%R
# just OTUs w/out an associated genome
tbl.j = anti_join(tbl.comm, tbl.target, c('taxon_name' = 'OTUId'))
n.nontarget.genomes = tbl.j$taxon_name %>% length
cat('Number of non-target genomes: ', n.nontarget.genomes, '\n')
cat('---------\n')
tbl.j %>% head(n=5)


Number of non-target genomes:  6408 
---------
  library taxon_name rel_abund_perc rank
1       1   OTU.9280   0.0008491831 6606
2       1   OTU.8991   0.0008491831 6605
3       1   OTU.8467   0.0008491831 6604
4       1   OTU.7687   0.0008491831 6603
5       1   OTU.7201   0.0008491831 6602

In [53]:
%%R -i comm_richness
# checking assumptions
cat('Target + nonTarget richness = total community richness?: ',
    n.target.genomes + n.nontarget.genomes == comm_richness, '\n')


Target + nonTarget richness = total community richness?:  TRUE 

In [54]:
%%R -i workDir
# writing out non-target OTU file
outFile = paste(c(workDir, 'comm_nonTarget.txt'), collapse='/')
write.table(tbl.j, outFile, sep='\t', quote=F, row.names=F)

Randomly selecting amplicon fragment length-GC KDEs from total genome pool


In [55]:
# List of non-target OTUs
inFile = os.path.join(workDir, 'comm_nonTarget.txt')
nonTarget = pd.read_csv(inFile, sep='\t')['taxon_name'].tolist()

print 'Number of non-target OTUs: {}'.format(len(nonTarget))
nonTarget[:4]


Number of non-target OTUs: 6408
Out[55]:
['OTU.9280', 'OTU.8991', 'OTU.8467', 'OTU.7687']

In [56]:
# loading amplicon fragments from full genome KDE dataset
inFile = os.path.join(workDir, 'ampFrags.pkl')
ampFrag_target = []
with open(inFile, 'rb') as iFH:
    ampFrag_target = pickle.load(iFH)
print 'Target OTU richness: {}'.format(len(ampFrag_target))


Target OTU richness: 198

In [57]:
# loading amplicon fragments from full genome KDE dataset
ampFrag_all = []
with open(allAmpFrags, 'rb') as iFH:
    ampFrag_all = pickle.load(iFH)
print 'Count of frag-GC KDEs for all genomes: {}'.format(len(ampFrag_all))


Count of frag-GC KDEs for all genomes: 1210

In [58]:
# random selection from list
#target_richness = len(ampFrag_target)

target_richness = len(ampFrag_target)
richness_needed = comm_richness - target_richness
print 'Number of random taxa needed to reach richness: {}'.format(richness_needed)

if richness_needed > 0:
    index = range(target_richness)
    index = np.random.choice(index, richness_needed)
    
    ampFrag_rand = []
    for i in index:
        sys.stderr.write('{},'.format(i))
        ampFrag_rand.append(copy.deepcopy(ampFrag_all[i]))
else:
    ampFrag_rand = []


127,196,70,29,4,84,121,40,24,111,131,3,185,178,103,19,25,112,179,74,47,28,196,114,112,55,98,108,46,85,155,79,182,102,195,0,22,39,146,72,160,5,40,30,51,71,115,11,137,38,188,124,158,114,55,164,73,121,145,134,148,193,162,128,40,168,38,177,32,10,59,134,19,74,35,118,114,123,179,127,2,32,64,108,2,81,132,99,134,46,111,190,36,149,157,81,161,93,145,53,135,153,195,69,160,12,109,127,164,145,158,7,139,31,56,137,177,28,28,161,25,160,174,90,98,1,187,189,43,141,46,165,3,89,183,46,58,39,68,84,187,61,153,148,186,134,163,102,31,16,8,81,71,86,18,13,128,133,29,45,61,182,77,22,54,116,22,77,102,83,105,56,189,118,177,172,167,191,9,115,72,43,122,113,95,13,117,71,29,187,1,139,30,126,30,138,110,174,31,37,82,29,197,138,84,192,13,135,85,83,164,172,102,150,70,61,179,186,105,153,53,76,52,24,100,66,161,192,132,63,88,166,149,182,106,171,189,175,94,116,20,78,197,141,115,121,130,191,96,23,117,66,120,178,42,85,33,143,140,157,52,62,147,74,183,52,90,196,84,75,168,52,136,26,167,34,10,51,42,33,181,172,64,108,19,155,137,18,156,137,96,73,95,107,31,190,113,96,193,172,26,178,8,12,115,68,164,115,196,196,131,88,4,140,80,136,125,177,15,185,140,65,81,147,183,41,126,23,132,129,66,104,2,83,60,161,32,139,163,171,103,183,97,27,33,2,124,50,86,4,191,6,145,160,190,111,147,177,50,68,65,25,34,165,116,15,54,25,36,5,107,166,95,31,119,77,122,163,32,93,106,82,86,50,152,148,52,90,96,72,42,55,194,95,168,3,94,88,50,107,161,91,65,130,30,117,135,155,29,176,101,142,175,60,104,84,36,75,99,75,105,186,15,158,95,160,167,113,189,173,157,40,63,32,57,173,64,83,59,115,6,5,88,188,183,34,18,197,3,161,158,169,13,126,170,156,99,17,85,78,60,47,180,77,25,66,147,96,188,16,188,12,14,113,6,22,45,115,100,19,159,67,178,165,1,156,159,158,110,121,180,120,113,3,87,175,169,121,81,175,54,106,136,38,189,61,63,31,11,78,43,74,24,112,158,110,29,54,115,174,50,152,88,155,63,73,59,158,46,197,51,192,115,77,34,19,120,12,177,117,0,14,141,178,131,191,4,172,90,120,194,103,99,119,26,57,88,133,187,49,66,26,87,177,178,4,115,123,168,43,16,44,68,123,44,64,157,130,101,154,195,34,10,10,161,177,101,90,77,161,141,41,145,121,133,129,57,104,168,176,66,184,67,167,36,44,113,71,103,62,86,174,95,67,163,85,175,93,33,11,102,37,33,92,160,18,55,75,39,123,142,66,158,35,159,150,89,173,40,4,90,182,128,192,69,52,144,22,91,14,27,108,156,187,38,144,101,191,18,29,154,49,105,112,153,156,44,155,113,24,170,166,114,37,175,61,150,142,187,156,46,76,166,86,83,82,166,17,6,100,30,166,116,112,162,197,154,29,174,50,155,25,52,146,27,70,156,116,167,72,99,179,27,154,74,40,13,155,79,126,34,97,130,127,8,106,173,139,36,149,184,173,95,185,55,9,150,68,105,192,191,195,31,119,65,196,104,69,66,71,79,103,187,5,79,139,64,186,138,23,40,101,96,133,103,121,55,126,22,113,140,71,155,3,106,174,11,181,94,18,122,84,33,39,99,100,21,39,33,5,179,150,136,148,101,68,104,96,94,117,131,38,62,21,88,96,119,36,19,189,123,71,146,178,40,122,192,106,71,28,148,3,145,94,71,127,126,121,66,102,194,176,39,81,99,77,83,86,73,132,9,183,1,129,92,125,30,191,79,191,120,165,59,39,28,100,44,93,114,65,47,195,105,65,3,96,80,190,54,78,4,100,140,53,10,46,82,145,84,133,3,188,19,25,91,144,2,157,106,134,40,185,185,171,99,109,105,21,108,101,45,18,77,130,11,122,115,119,123,83,99,13,191,110,13,56,12,32,141,49,146,129,101,171,62,127,34,148,185,98,61,104,131,187,169,90,87,105,20,45,162,105,29,107,11,136,113,190,52,8,138,75,46,42,87,134,197,174,42,134,35,118,151,35,22,165,51,140,58,150,163,163,172,140,161,126,136,29,92,131,196,8,116,164,18,37,77,6,23,188,96,26,174,20,121,159,127,117,153,152,154,69,194,67,135,33,192,117,183,178,33,179,169,20,100,191,34,159,26,95,75,52,42,75,164,130,23,168,44,132,197,194,155,153,112,181,6,181,180,140,161,43,149,41,6,177,136,134,105,158,75,182,144,122,179,12,87,127,18,120,190,44,90,111,181,73,38,190,24,18,32,16,3,184,60,191,47,141,194,36,51,145,11,175,125,151,102,186,24,162,2,116,98,39,177,175,45,147,72,88,26,100,116,95,26,51,99,152,29,100,49,89,74,1,68,22,175,194,116,135,60,167,89,82,155,15,166,194,104,10,89,62,106,141,176,19,102,91,159,131,65,144,72,108,40,181,119,78,14,114,17,23,192,190,134,164,129,57,98,121,51,171,50,162,69,132,171,67,41,67,134,71,68,124,77,0,76,45,92,130,26,95,135,133,173,65,90,79,96,105,16,157,123,162,163,183,10,8,160,33,170,14,73,134,176,190,172,0,164,29,116,84,25,57,2,85,46,166,177,19,12,166,46,151,50,192,195,195,27,100,114,23,27,1,60,104,85,170,156,40,167,92,58,0,140,93,180,41,84,192,46,114,42,114,116,76,5,99,148,138,48,20,176,178,60,26,181,55,80,36,123,100,192,39,23,22,3,162,126,104,188,181,196,10,56,61,67,3,151,110,189,118,172,74,196,111,20,15,99,66,38,36,169,149,176,2,60,17,143,106,145,6,150,153,1,93,70,71,189,15,41,95,107,158,87,120,14,21,87,189,92,48,24,124,188,185,107,155,138,29,13,167,183,74,58,31,25,177,120,197,161,28,152,150,173,30,119,179,7,86,81,66,67,9,41,35,178,77,166,62,161,102,116,77,135,109,72,96,161,143,96,21,103,94,47,126,179,167,41,103,130,31,99,183,28,91,125,100,68,49,84,167,157,76,99,107,45,195,8,79,11,97,73,58,129,43,179,161,67,51,119,1,157,40,148,56,93,121,60,162,177,92,46,96,164,113,123,16,23,47,107,133,114,179,141,119,118,185,50,11,191,28,124,146,93,26,192,125,136,166,193,72,148,130,93,82,60,194,65,177,182,50,70,6,189,159,116,116,156,161,148,48,80,39,78,163,113,65,42,188,22,191,163,186,54,56,21,125,175,26,111,39,72,117,111,73,157,197,138,194,147,121,164,47,136,119,15,113,32,74,196,117,167,156,22,96,65,33,31,155,110,79,79,78,52,173,124,91,89,188,102,97,161,68,0,51,106,92,149,116,176,174,102,61,60,177,102,37,51,142,35,189,180,109,149,133,170,90,13,173,187,116,128,185,149,63,39,38,187,188,28,41,145,68,157,148,24,151,142,79,45,65,17,91,135,22,115,58,114,9,34,123,181,149,150,132,127,122,152,3,68,186,22,192,188,46,67,139,182,165,134,38,66,192,135,73,111,165,9,136,65,136,81,124,97,65,14,121,21,85,192,8,121,184,159,171,153,24,131,123,188,18,167,94,82,184,60,52,105,74,99,192,178,143,183,145,147,159,30,128,95,135,116,186,73,54,125,197,34,138,91,166,41,159,24,190,144,126,63,30,82,110,93,42,142,55,169,187,96,194,19,98,26,37,183,25,38,70,55,48,111,151,161,145,135,16,113,124,83,47,21,55,11,131,22,179,36,36,48,19,126,123,23,39,188,69,153,6,46,76,162,44,39,8,185,161,192,99,182,15,116,96,186,72,119,47,39,187,193,34,11,186,125,56,71,162,11,162,48,61,69,83,23,69,148,135,8,86,23,130,31,185,33,159,116,161,115,185,61,163,197,95,125,11,123,124,118,66,54,188,160,8,67,121,95,134,5,92,169,117,74,187,133,59,79,50,133,168,36,13,131,68,93,156,194,86,76,183,100,188,168,28,88,134,5,185,20,106,173,104,49,25,3,187,32,28,10,2,107,81,134,151,179,97,45,107,101,124,177,51,173,115,160,5,187,87,146,120,16,107,110,0,118,47,101,105,150,133,171,6,79,52,114,61,124,170,152,29,188,65,120,3,163,179,151,72,11,160,162,132,86,97,161,145,91,113,181,69,10,64,174,48,176,14,164,47,146,98,143,134,50,25,145,121,112,195,101,197,26,3,9,146,27,25,130,177,102,168,118,137,41,95,73,187,90,167,138,78,141,55,122,192,14,172,118,105,166,152,18,85,144,175,95,167,43,182,55,22,27,160,2,24,119,146,127,98,55,132,64,168,122,37,21,109,155,39,69,192,190,64,84,78,184,116,174,53,128,2,137,59,53,97,104,145,124,115,123,27,50,43,124,52,170,103,59,19,130,126,76,114,105,103,182,34,90,27,93,162,31,141,108,1,182,106,92,132,168,41,69,71,187,3,135,197,31,170,7,20,179,102,24,100,183,113,194,173,194,128,115,94,147,188,44,151,29,155,28,19,111,197,72,69,19,156,92,90,34,7,52,179,173,72,133,55,78,134,60,149,102,84,89,29,165,33,36,149,39,183,9,121,174,168,105,47,100,92,54,0,52,4,154,69,55,172,44,2,191,33,141,54,185,44,135,182,185,52,136,187,79,179,25,119,142,139,113,86,123,8,188,70,162,128,3,6,40,109,26,108,173,145,19,112,4,125,56,4,155,165,112,132,54,144,32,51,192,17,72,103,60,22,68,95,102,174,189,181,91,74,50,102,15,137,17,189,135,21,0,153,65,9,71,90,102,39,83,136,73,8,5,4,79,177,96,87,21,87,110,38,147,86,37,4,132,24,85,122,85,36,105,178,152,193,189,123,146,134,90,36,72,161,111,134,168,157,36,118,197,16,145,66,74,50,181,99,60,90,15,120,141,183,3,143,108,93,49,20,46,100,147,5,6,93,12,55,133,11,113,86,137,12,176,157,92,184,101,18,170,13,129,152,189,84,152,132,15,119,126,148,147,118,54,128,159,168,136,21,14,92,11,116,66,105,130,113,139,128,19,121,1,37,31,122,179,34,112,80,71,132,93,75,48,54,61,2,50,65,124,47,180,15,124,18,59,31,12,79,27,162,82,196,76,106,85,58,154,129,101,129,52,36,122,173,102,23,66,63,23,22,187,157,50,41,18,9,12,133,14,191,178,81,0,62,183,99,120,125,182,15,184,189,96,117,0,13,121,145,154,169,176,165,5,40,91,105,30,131,98,52,45,124,178,176,17,158,84,177,157,170,72,70,123,6,194,197,49,188,75,73,14,30,197,192,197,153,18,74,57,163,89,107,83,53,133,67,195,131,98,100,177,70,191,64,22,187,194,172,102,120,66,182,5,153,111,144,23,185,61,99,13,0,83,90,120,158,116,129,73,167,127,92,145,31,106,158,29,77,78,180,127,60,54,42,31,158,141,93,179,91,143,134,31,35,153,137,71,3,79,82,44,13,146,61,119,51,45,167,78,190,150,184,154,41,52,136,50,85,70,47,49,85,157,84,165,23,38,52,121,90,99,188,177,73,170,183,121,132,161,36,152,86,14,95,161,35,46,132,170,129,86,62,130,112,125,43,113,98,108,46,170,176,57,91,144,36,84,65,147,128,144,79,63,72,139,75,115,4,180,118,28,91,65,37,26,128,177,136,80,23,69,117,46,23,170,15,144,182,36,7,117,4,184,134,136,121,78,155,77,30,113,176,142,58,1,58,107,171,189,13,39,94,150,187,105,191,54,100,153,117,117,136,74,140,81,35,146,192,46,165,180,164,23,141,187,60,45,152,189,83,160,29,7,176,20,139,140,104,177,134,54,152,48,42,49,28,21,98,182,78,7,180,89,21,51,117,138,41,20,53,189,61,140,143,154,64,28,144,38,111,131,167,62,40,26,80,173,7,110,103,117,108,80,110,143,0,85,79,49,122,39,151,178,134,142,189,25,181,92,57,131,79,130,194,165,165,197,113,44,141,174,83,103,127,127,150,118,104,145,163,54,18,122,29,53,153,68,0,112,139,126,50,93,44,117,93,129,94,174,75,58,4,72,108,75,91,154,16,17,165,141,82,103,25,120,139,74,81,166,96,129,23,25,182,88,7,8,97,190,165,41,122,173,51,175,38,154,131,43,80,197,93,138,10,8,90,192,116,16,36,128,179,5,8,172,167,149,124,172,118,101,143,10,188,56,59,102,144,124,63,128,89,173,4,34,45,28,109,17,72,87,95,118,151,102,111,119,81,129,66,7,108,41,177,71,164,120,10,161,180,133,164,85,94,47,190,121,194,125,15,153,135,111,74,139,41,28,168,163,99,14,24,108,0,50,25,102,67,54,63,110,26,173,63,36,193,193,54,124,91,14,113,180,179,31,47,11,87,186,45,45,30,25,176,92,163,103,62,118,122,8,106,85,38,80,153,22,2,54,28,65,22,131,157,77,177,12,196,15,148,102,180,109,104,143,133,1,110,129,9,147,173,6,60,189,103,166,7,104,122,26,163,131,192,71,175,14,115,99,13,65,93,89,88,126,72,54,95,164,187,12,166,150,20,64,6,160,115,42,180,117,59,14,107,144,114,99,157,70,82,150,25,156,16,124,193,135,111,52,88,65,187,186,195,22,111,137,28,118,76,28,35,159,40,12,100,177,129,139,142,119,132,17,7,23,94,21,60,115,74,102,7,11,41,45,187,129,154,78,153,193,169,59,181,41,47,55,157,82,54,69,177,151,148,41,103,189,43,196,189,155,36,17,44,63,117,17,182,183,185,177,70,2,108,170,27,4,26,10,164,182,195,148,166,98,114,157,190,106,83,37,135,133,100,110,111,67,137,111,109,107,67,195,141,34,7,157,162,68,5,174,1,164,66,74,53,100,138,9,105,67,187,172,17,58,79,12,144,164,110,104,4,113,103,19,87,118,181,5,61,22,29,119,18,181,97,10,152,78,23,7,68,135,115,2,155,73,33,117,129,121,155,68,44,173,36,197,116,154,173,13,42,152,104,25,128,12,131,146,35,37,52,3,140,10,13,0,196,43,57,6,190,26,70,79,107,143,142,80,180,72,78,88,127,157,115,114,163,163,57,183,58,197,86,52,104,142,115,3,16,108,5,164,1,93,94,177,114,117,93,174,22,75,159,153,9,93,38,49,17,36,52,92,70,144,167,134,14,143,119,111,150,135,89,50,62,3,41,120,27,23,33,113,24,52,56,98,196,25,8,172,18,77,135,75,16,180,146,197,20,182,37,123,51,175,78,50,33,109,16,194,126,82,197,16,176,17,41,182,195,27,146,105,104,190,112,67,154,191,139,106,30,57,75,64,120,81,127,171,134,76,79,33,178,30,178,8,192,38,126,17,177,49,165,162,69,90,111,119,195,40,54,68,138,78,58,26,65,117,129,129,57,52,30,110,76,6,155,141,88,186,20,12,142,174,176,26,35,157,176,93,123,143,90,63,62,22,176,154,193,35,80,53,12,42,48,117,62,26,15,125,52,123,158,84,155,190,108,84,67,91,154,33,114,20,108,47,179,89,192,152,191,194,106,168,190,194,53,137,70,57,118,158,9,112,114,187,73,54,182,134,162,140,183,86,10,112,152,9,89,133,133,107,160,188,161,29,162,32,169,162,15,59,189,194,60,56,49,10,194,38,16,115,83,154,166,0,121,7,8,26,65,54,57,162,148,96,30,69,145,88,17,5,78,132,43,29,171,129,32,104,161,56,131,162,135,33,177,99,31,150,122,8,176,190,53,25,119,95,36,46,159,166,69,91,10,54,160,54,137,150,88,152,164,47,149,142,101,112,118,12,127,91,109,188,21,6,48,183,101,6,95,44,55,127,19,166,113,184,74,93,159,88,156,82,118,88,159,164,100,141,130,114,162,81,124,38,128,130,137,174,106,37,32,140,82,37,125,73,188,115,193,38,7,32,41,14,9,45,90,163,125,21,116,48,58,111,176,139,112,20,188,0,26,37,47,102,172,179,39,83,175,100,128,49,181,103,42,106,183,136,86,101,95,95,3,111,95,14,18,78,9,13,124,175,79,63,156,194,161,77,116,15,89,163,85,189,13,73,78,32,187,41,195,140,81,131,2,31,14,72,101,47,63,148,92,29,105,138,44,180,68,128,54,2,60,180,4,81,136,53,143,143,54,124,90,86,13,125,98,137,98,195,55,119,155,104,37,15,161,175,195,43,98,120,23,128,66,7,38,184,144,47,196,30,175,84,62,50,52,162,6,166,88,96,57,147,78,45,86,107,11,89,49,84,46,131,191,119,108,83,157,166,146,43,146,22,16,158,76,81,98,126,6,81,152,99,40,190,7,113,132,129,143,56,39,167,99,6,19,55,45,81,147,75,191,78,0,104,42,55,9,143,187,71,51,52,102,164,191,193,27,30,115,113,159,172,173,62,38,80,162,43,101,82,52,133,1,79,87,160,15,93,152,133,29,110,92,80,152,132,150,48,66,19,2,73,112,76,37,162,143,76,152,2,149,176,139,46,15,187,28,41,68,33,70,183,85,127,21,102,46,162,106,45,80,41,59,93,119,72,161,71,117,34,37,128,191,159,76,3,185,31,168,59,1,196,25,60,64,147,125,132,21,184,25,169,71,159,4,27,28,80,55,50,10,123,193,70,106,193,94,70,118,76,58,33,193,161,41,23,190,133,150,72,192,16,102,4,4,53,9,174,182,43,18,156,95,92,3,17,154,141,111,5,76,119,151,19,108,186,110,150,123,110,173,122,114,122,54,174,140,169,34,157,128,152,161,192,125,155,36,180,148,118,170,74,98,115,92,15,121,30,114,116,112,101,51,168,143,50,154,103,26,23,174,153,190,111,26,136,75,125,78,8,108,49,78,91,40,193,144,114,176,184,189,176,194,94,189,79,8,180,101,195,192,41,63,21,162,104,48,171,11,79,183,167,166,31,127,120,9,183,68,178,96,172,106,139,109,123,19,64,50,101,115,144,183,9,126,33,96,160,146,96,132,139,86,167,141,79,125,20,20,40,83,141,17,88,83,128,26,84,113,140,96,174,29,30,65,185,166,124,96,63,116,119,92,161,0,158,100,168,143,31,2,124,49,126,83,131,3,44,88,84,34,113,107,95,167,44,94,155,167,11,105,136,62,105,193,7,9,188,116,160,57,188,195,97,22,45,21,75,30,116,128,161,40,57,128,128,92,116,107,78,132,189,187,15,69,74,159,45,90,41,133,60,28,89,55,185,66,49,69,110,167,70,73,86,59,68,76,41,79,158,191,42,29,42,117,168,147,2,7,162,56,25,55,183,125,92,56,65,164,105,10,122,89,89,147,58,20,53,72,157,81,197,91,148,114,52,123,142,9,193,127,84,116,137,117,21,41,36,166,6,50,15,57,110,62,137,60,70,51,25,175,68,84,61,16,21,194,175,169,195,23,79,0,142,128,157,144,100,113,77,76,40,147,123,168,15,176,37,46,105,29,105,179,91,19,126,13,37,59,61,72,8,7,130,17,11,177,97,166,4,96,38,63,51,107,42,52,191,69,17,179,121,115,127,162,133,95,14,183,64,175,156,51,40,24,171,65,30,76,192,100,170,94,70,16,73,66,61,149,91,124,155,73,178,51,7,118,74,170,77,153,77,175,147,42,71,167,66,117,126,27,114,126,8,101,81,68,90,175,78,185,117,184,172,71,185,72,165,106,44,121,24,82,122,183,152,21,134,48,65,0,187,13,28,144,25,40,141,150,60,106,49,167,9,44,181,2,91,36,71,160,41,125,57,17,150,86,74,91,68,125,163,32,40,196,52,111,109,195,89,124,159,86,32,135,121,124,8,125,13,183,120,66,28,4,8,188,15,65,173,55,28,52,60,72,138,145,146,164,110,65,102,171,114,34,62,183,99,77,133,31,171,6,85,31,169,158,162,4,22,120,10,34,28,21,14,120,86,49,51,119,177,84,149,60,59,176,94,96,180,184,110,118,48,119,26,35,131,187,76,177,104,128,134,131,63,74,93,189,91,30,130,177,43,92,114,90,153,142,91,127,41,196,84,1,57,84,110,20,178,66,158,78,142,103,118,160,97,58,41,93,156,44,109,140,31,16,46,51,79,164,95,128,133,112,157,91,73,5,6,136,127,127,50,86,60,35,96,89,124,97,73,173,74,48,13,80,137,84,22,67,73,29,115,82,68,97,167,145,34,127,102,48,133,60,192,62,47,142,5,75,139,41,171,163,35,47,163,184,118,86,174,102,94,197,32,57,124,58,24,32,167,61,109,189,16,155,74,60,67,81,132,134,134,89,109,155,16,103,31,175,21,151,50,34,143,58,56,34,109,97,145,114,66,24,140,193,123,155,40,44,129,83,114,69,82,185,106,32,57,31,187,38,3,86,31,130,45,14,141,133,86,49,190,96,10,65,42,110,5,147,195,128,167,66,113,193,50,191,153,134,196,144,15,84,71,171,8,124,193,115,12,115,69,157,15,65,132,103,53,88,61,50,1,147,124,156,30,123,23,113,24,178,103,175,96,187,55,196,21,58,45,79,158,159,149,57,38,26,179,165,47,96,87,5,189,124,49,84,146,175,182,41,183,142,179,17,190,58,42,48,37,93,24,80,129,15,81,175,136,30,10,137,149,37,5,156,100,114,77,20,6,70,71,33,130,66,70,126,56,184,2,96,155,40,57,44,194,64,188,160,22,121,149,84,167,20,88,1,20,62,78,173,123,21,95,147,95,183,16,183,46,17,122,167,148,2,165,175,153,108,4,165,12,90,44,142,27,102,92,20,16,68,127,181,70,32,31,27,140,80,94,160,87,22,107,145,95,143,182,111,197,16,66,74,21,173,96,67,137,95,33,171,132,87,179,146,126,81,87,1,56,145,38,193,22,68,163,174,123,7,57,60,83,161,169,5,57,122,195,21,70,94,170,143,190,13,101,41,103,70,152,48,38,0,44,113,118,158,140,169,118,87,114,187,91,53,129,172,13,161,54,3,151,100,147,92,57,82,190,56,179,122,107,1,87,189,36,163,143,43,32,98,149,38,14,155,96,45,109,133,174,188,129,182,139,63,152,6,54,120,170,110,93,91,53,57,90,4,114,89,57,158,30,134,195,35,88,153,172,82,96,40,192,177,145,156,21,82,118,156,137,65,121,179,62,27,145,63,90,133,115,166,121,69,105,49,165,27,92,35,91,92,69,15,4,123,27,131,51,114,186,85,71,8,134,166,91,154,172,154,114,108,28,1,168,47,32,155,15,71,174,179,48,156,48,68,114,70,87,153,127,158,117,178,102,72,85,109,32,167,182,79,134,39,126,189,189,10,98,190,72,5,194,73,93,84,3,27,112,113,94,94,27,86,167,27,134,81,147,151,20,93,134,13,42,190,160,3,50,48,154,145,78,71,162,164,0,69,34,103,103,162,56,49,34,177,149,5,8,16,154,23,1,176,24,153,167,31,185,108,71,71,160,138,50,194,23,41,138,103,103,40,160,13,178,70,27,117,112,118,111,166,142,123,120,128,153,52,120,176,28,26,160,96,84,117,159,144,155,105,159,30,188,150,41,81,105,82,51,186,134,178,10,166,192,174,41,140,19,128,96,38,185,102,120,148,115,127,156,79,106,176,160,45,71,69,152,159,56,184,158,31,23,166,53,61,22,28,183,76,75,56,60,101,136,97,25,124,122,8,66,162,127,129,36,50,187,172,84,193,187,45,102,144,119,48,187,194,179,116,10,67,177,128,5,154,170,158,3,186,141,194,114,66,111,42,189,52,161,6,54,106,129,35,73,94,177,92,112,33,180,118,44,168,18,192,12,10,79,191,195,53,148,91,68,133,85,42,132,173,15,172,187,31,145,104,85,128,21,83,195,82,75,43,183,80,109,179,23,149,87,94,189,22,118,96,99,162,139,116,79,168,181,62,27,136,153,33,47,138,2,170,125,3,168,128,25,72,27,16,124,100,34,176,45,148,191,130,135,85,197,7,36,115,50,102,116,119,64,190,49,89,21,28,140,166,35,11,53,71,56,156,30,112,160,169,46,63,52,197,133,94,65,94,32,102,16,127,75,27,38,89,151,102,86,188,61,118,84,90,151,132,180,27,142,196,11,111,73,140,113,97,64,115,132,5,7,18,3,185,71,1,180,144,187,33,152,124,79,166,177,127,42,62,181,67,75,192,16,92,167,156,184,150,23,163,38,21,178,28,33,75,113,180,165,119,10,177,43,115,79,161,3,69,151,65,145,27,31,8,76,96,24,75,38,98,62,1,22,65,17,135,131,38,103,13,38,146,96,90,187,62,4,1,184,14,197,102,135,86,90,83,190,120,97,146,130,115,151,37,93,169,111,113,19,175,101,34,67,135,118,179,86,18,51,175,144,127,101,112,123,185,92,101,185,122,147,5,51,47,147,39,58,47,50,109,158,48,68,116,173,149,68,154,86,36,172,22,24,105,55,29,122,11,55,43,158,108,94,91,107,112,121,134,184,186,37,132,173,57,65,94,162,110,130,78,47,175,49,97,78,30,116,34,60,104,127,71,147,156,58,125,84,133,105,194,191,93,183,115,142,82,122,104,38,130,88,84,143,77,178,51,190,25,146,101,175,11,32,175,117,90,54,70,149,114,31,77,79,48,3,125,37,10,117,43,78,7,197,141,157,112,103,151,1,164,164,82,149,86,82,76,196,7,130,174,14,64,52,90,148,17,167,106,114,119,145,21,196,37,20,90,15,65,40,6,185,1,119,72,183,180,16,57,80,24,156,184,69,54,25,124,186,18,15,36,44,22,82,79,76,1,86,161,1,3,21,119,177,0,154,87,155,36,117,149,20,109,157,134,48,185,141,64,70,92,68,91,192,140,169,112,112,128,137,127,16,24,94,70,90,10,31,75,184,125,149,92,56,116,34,50,192,8,38,18,36,33,168,16,151,132,195,142,111,160,94,151,158,148,117,144,4,17,157,120,110,103,76,155,46,31,57,80,22,24,75,76,47,187,32,61,30,23,125,54,130,131,167,85,68,33,96,101,10,32,44,31,9,35,62,90,196,29,75,76,73,166,125,68,94,63,66,56,82,21,75,114,159,192,61,173,133,11,81,126,149,185,155,133,186,66,179,23,43,91,173,164,96,50,45,148,71,152,22,150,102,137,38,74,156,57,0,193,21,87,13,176,104,149,118,170,112,50,181,185,147,86,171,172,92,112,155,5,183,172,27,135,114,129,103,100,53,131,170,36,145,44,26,146,38,135,110,19,5,21,116,73,110,0,59,20,112,147,123,91,177,178,16,56,87,157,192,126,72,19,86,70,173,33,82,45,156,48,49,48,52,21,69,186,93,181,167,115,111,41,23,50,38,195,177,15,149,19,11,129,52,127,72,81,197,35,59,155,48,124,51,152,135,104,112,70,116,135,15,112,11,49,100,177,139,2,116,148,158,66,62,42,135,101,78,109,59,52,63,118,133,121,179,103,94,161,105,13,165,8,159,161,32,101,26,152,81,66,31,83,65,34,178,29,141,108,85,130,180,7,107,53,182,188,182,58,95,109,76,7,162,93,95,119,65,169,74,164,176,119,109,141,34,36,175,35,90,103,170,100,159,135,157,78,50,58,97,96,55,22,84,1,24,3,131,172,185,122,36,186,100,72,49,115,76,136,106,11,2,14,188,72,137,15,183,61,149,161,73,137,118,195,175,38,54,80,195,173,10,179,181,105,155,197,150,14,39,118,146,140,9,41,175,181,3,3,17,173,104,143,173,109,196,94,52,101,81,57,197,183,170,148,146,0,48,117,192,125,36,116,50,53,114,160,197,74,190,189,99,2,52,7,97,3,195,39,185,161,53,150,109,111,99,68,79,157,166,120,122,33,132,53,120,47,96,158,194,18,27,162,155,191,168,118,13,101,146,31,189,1,118,78,161,134,178,106,92,46,87,194,131,169,126,165,194,9,16,20,170,165,73,33,196,51,122,153,55,170,121,1,119,138,135,39,126,42,16,121,162,162,71,181,189,137,99,117,158,144,55,102,126,43,46,170,5,57,29,35,135,158,75,90,2,171,169,19,162,115,97,170,71,132,41,74,11,42,53,168,92,174,185,73,99,58,32,24,63,116,83,118,13,116,13,121,35,127,73,98,62,117,142,88,125,112,119,8,69,0,146,160,81,10,25,113,42,45,45,190,143,83,184,127,13,32,53,68,65,82,88,154,65,97,115,159,161,38,182,171,3,104,129,118,66,182,53,172,27,189,103,166,50,28,48,14,177,47,183,41,31,136,117,182,100,98,189,15,18,140,
Number of random taxa needed to reach richness: 6408

In [59]:
# renaming randomly selected KDEs by non-target OTU-ID
for i in range(len(ampFrag_rand)):
    ampFrag_rand[i][0] = nonTarget[i]

In [ ]:
# appending random taxa to target taxa and writing
outFile = os.path.join(workDir, 'ampFrags_wRand.pkl')

with open(outFile, 'wb') as oFH:
    x = ampFrag_target + ampFrag_rand
    print 'Number of taxa in output: {}'.format(len(x))
    pickle.dump(x, oFH)


Number of taxa in output: 6606

Converting fragments to kde object


In [ ]:
!cd $workDir; \
    SIPSim fragment_kde \
    ampFrags_wRand.pkl \
    > ampFrags_wRand_kde.pkl

Adding diffusion


In [ ]:
!cd $workDir; \
    SIPSim diffusion \
    ampFrags_wRand_kde.pkl \
    --np $nprocs \
    > ampFrags_wRand_kde_dif.pkl

Making an incorp config file


In [ ]:
!cd $workDir; \
    SIPSim incorpConfigExample \
    --percTaxa 0 \
    --percIncorpUnif 100 \
    > PT0_PI100.config

Adding isotope incorporation to BD distribution


In [ ]:
!cd $workDir; \
    SIPSim isotope_incorp \
    ampFrags_wRand_kde_dif.pkl \
    PT0_PI100.config \
    --comm comm.txt \
    --np $nprocs \
    > ampFrags_wRand_kde_dif_incorp.pkl

Calculating BD shift from isotope incorporation


In [ ]:
!cd $workDir; \
    SIPSim BD_shift \
    ampFrags_wRand_kde_dif.pkl \
    ampFrags_wRand_kde_dif_incorp.pkl \
    --np $nprocs \
    > ampFrags_wRand_kde_dif_incorp_BD-shift.txt

Simulating gradient fractions


In [ ]:
!cd $workDir; \
    SIPSim gradient_fractions \
    comm.txt \
    > fracs.txt

Simulating an OTU table


In [ ]:
!cd $workDir; \
    SIPSim OTU_table \
    ampFrags_wRand_kde_dif_incorp.pkl \
    comm.txt \
    fracs.txt \
    --abs 1e9 \
    --np $nprocs \
    > OTU_abs1e9.txt

Plotting taxon abundances


In [ ]:
%%R -i workDir
setwd(workDir)

# loading file
tbl = read.delim('OTU_abs1e9.txt', sep='\t')

In [ ]:
%%R
## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66

In [ ]:
%%R -w 800 -h 300
# plotting absolute abundances

tbl.s = tbl %>%
    group_by(library, BD_mid) %>%
    summarize(total_count = sum(count))

## plot
p = ggplot(tbl.s, aes(BD_mid, total_count)) +
    geom_area(stat='identity', alpha=0.3, position='dodge') +
    geom_histogram(stat='identity') +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme( 
        text = element_text(size=16) 
    )
p

In [ ]:
%%R -w 800 -h 300
# plotting number of taxa at each BD

tbl.nt = tbl %>%
    filter(count > 0) %>%
    group_by(library, BD_mid) %>%
    summarize(n_taxa = n())

## plot
p = ggplot(tbl.nt, aes(BD_mid, n_taxa)) +
    geom_area(stat='identity', alpha=0.3, position='dodge') +
    geom_histogram(stat='identity') +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Number of taxa') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

In [ ]:
%%R -w 800 -h 250
# plotting relative abundances

## plot
p = ggplot(tbl, aes(BD_mid, count, fill=taxon)) +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p + geom_area(stat='identity', position='dodge', alpha=0.5)

In [ ]:
%%R -w 800 -h 250

p + geom_area(stat='identity', position='fill')

Subsampling from the OTU table


In [ ]:
dist,loc,scale = seq_per_fraction

!cd $workDir; \
    SIPSim OTU_subsample \
    --dist $dist \
    --dist_params mean:$loc,sigma:$scale \
    --walk 2 \
    --min_size 10000 \
    --max_size 200000 \
    OTU_abs1e9.txt \
    > OTU_abs1e9_sub.txt

Testing/Plotting seq count distribution of subsampled fraction samples


In [ ]:
%%R -h 300 -i workDir
setwd(workDir)

tbl = read.csv('OTU_abs1e9_sub.txt', sep='\t') 

tbl.s = tbl %>% 
    group_by(library, fraction) %>%
    summarize(total_count = sum(count)) %>%
    ungroup() %>%
    mutate(library = as.character(library))

ggplot(tbl.s, aes(total_count)) +
    geom_density(fill='blue')

In [ ]:
%%R -h 300 -w 600
setwd(workDir)

tbl.s = tbl %>%
    group_by(fraction, BD_min, BD_mid, BD_max) %>%
    summarize(total_count = sum(count)) 

ggplot(tbl.s, aes(BD_mid, total_count)) +
    geom_point() +
    geom_line() +
    labs(x='Buoyant density', y='Total sequences') +
    theme_bw() +
    theme(
        text = element_text(size=16)
        )

Getting list of target taxa


In [ ]:
%%R -i workDir

inFile = paste(c(workDir, 'target_genome_index.txt'), collapse='/')

tbl.target = read.delim(inFile, sep='\t', header=F)
colnames(tbl.target) = c('OTUId', 'genome_file', 'genome_ID', 'X', 'Y', 'Z')
tbl.target = tbl.target %>% distinct(OTUId)


cat('Number of target OTUs: ', tbl.target$OTUId %>% unique %>% length, '\n')
cat('----------\n')
tbl.target %>% head(n=3)

Plotting abundance distributions


In [ ]:
%%R -w 800 -h 250
# plotting relative abundances

tbl = tbl %>% 
    group_by(fraction) %>%
    mutate(rel_abund = count / sum(count))


## plot
p = ggplot(tbl, aes(BD_mid, count, fill=taxon)) +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p + geom_area(stat='identity', position='dodge', alpha=0.5)

In [ ]:
%%R -w 800 -h 250

p = ggplot(tbl, aes(BD_mid, rel_abund, fill=taxon)) +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p + geom_area(stat='identity')

Abundance distribution of just target taxa


In [ ]:
%%R

targets = tbl.target$OTUId %>% as.vector %>% unique 

tbl.f = tbl %>%
    filter(taxon %in% targets)

tbl.f %>% head

In [ ]:
%%R -w 800 -h 250
# plotting absolute abundances

## plot
p = ggplot(tbl.f, aes(BD_mid, count, fill=taxon)) +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p + geom_area(stat='identity', position='dodge', alpha=0.5)

In [ ]:
%%R -w 800 -h 250
# plotting relative abundances

p = ggplot(tbl.f, aes(BD_mid, rel_abund, fill=taxon)) +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p + geom_area(stat='identity')

Plotting 'true' taxon abundance distribution (from priming exp dataset)


In [ ]:
%%R -i metaDataFile
# loading priming_exp metadata file

meta = read.delim(metaDataFile, sep='\t')
meta %>% head(n=4)

In [ ]:
%%R -i otuTableFile
# loading priming_exp OTU table 

tbl.otu.true = read.delim(otuTableFile, sep='\t') %>%
    select(OTUId, starts_with('X12C.700.14')) 
tbl.otu.true %>% head(n=3)

In [ ]:
%%R
# editing table
tbl.otu.true.w = tbl.otu.true %>%
    gather('sample', 'count', 2:ncol(tbl.otu.true)) %>%
    mutate(sample = gsub('^X', '', sample)) %>%
    group_by(sample) %>%
    mutate(rel_abund = count / sum(count)) %>%
    ungroup() %>%
    filter(count > 0)
tbl.otu.true.w %>% head(n=5)

In [ ]:
%%R
tbl.true.j = inner_join(tbl.otu.true.w, meta, c('sample' = 'Sample'))
tbl.true.j %>% as.data.frame %>% head(n=3)

In [ ]:
%%R -w 800 -h 300  -i workDir
# plotting number of taxa at each BD

tbl = read.csv('OTU_abs1e9_sub.txt', sep='\t') 

tbl.nt = tbl %>%
    filter(count > 0) %>%
    group_by(library, BD_mid) %>%
    summarize(n_taxa = n())

## plot
p = ggplot(tbl.nt, aes(BD_mid, n_taxa)) +
    geom_area(stat='identity', alpha=0.5) +
    geom_point() +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Number of taxa') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )
p

In [ ]:
%%R -w 700 -h 350

tbl.true.j.s = tbl.true.j %>%
    filter(count > 0) %>%
    group_by(sample, Density) %>%
    summarize(n_taxa = sum(count > 0))

ggplot(tbl.true.j.s, aes(Density, n_taxa)) +
    geom_area(stat='identity', alpha=0.5) +
    geom_point() +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density', y='Number of taxa') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )

Plotting total counts for each sample


In [ ]:
%%R -h 300 -w 600
tbl.true.j.s = tbl.true.j %>%
    group_by(sample, Density) %>%
    summarize(total_count = sum(count)) 

ggplot(tbl.true.j.s, aes(Density, total_count)) +
    geom_point() +
    geom_line() +
    labs(x='Buoyant density', y='Total sequences') +
    theme_bw() +
    theme(
        text = element_text(size=16)
        )

Plotting abundance distribution of target OTUs


In [ ]:
%%R
tbl.true.j.f = tbl.true.j %>%
    filter(OTUId %in% targets) %>%
    arrange(OTUId, Density) %>%
    group_by(sample)
tbl.true.j.f %>% head(n=3) %>% as.data.frame

In [ ]:
%%R -w 800 -h 250
# plotting relative abundances

## plot
ggplot(tbl.true.j.f, aes(Density, rel_abund, fill=OTUId)) +
    geom_area(stat='identity') +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    labs(x='Buoyant density') +
    theme_bw() +
    theme( 
        text = element_text(size=16),
        legend.position = 'none'
    )

Combining true and simulated OTU tables for target taxa


In [ ]:
%%R
tbl.f.e = tbl.f %>%
    mutate(library = 'simulation') %>%
    rename('density' = BD_mid) %>%
    select(-BD_min, -BD_max)

tbl.true.e = tbl.true.j.f %>% 
    select('taxon' = OTUId,
           'fraction' = sample,
           'density' = Density,
           count, rel_abund) %>%
    mutate(library = 'true') 
           
           
tbl.sim.true = rbind(tbl.f.e, tbl.true.e) %>% as.data.frame
tbl.f.e = data.frame()
tbl.true.e = data.frame()

tbl.sim.true %>% head(n=3)

In [ ]:
%%R
# check
cat('Number of target taxa: ', tbl.sim.true$taxon %>% unique %>% length, '\n')

Abundance distributions of each target taxon


In [ ]:
%%R -w 900 -h 3500

tbl.sim.true.f = tbl.sim.true %>%
    ungroup() %>%
    filter(density >= 1.6772) %>%
    filter(density <= 1.7603) %>%
    group_by(taxon) %>%
    mutate(mean_rel_abund = mean(rel_abund))  %>%
    ungroup()

tbl.sim.true.f$taxon = reorder(tbl.sim.true.f$taxon, -tbl.sim.true.f$mean_rel_abund)

ggplot(tbl.sim.true.f, aes(density, rel_abund, color=library)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    facet_wrap(~ taxon, ncol=4, scales='free_y')

In [ ]:


In [ ]:


In [ ]:


In [ ]: