In [6]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome10/smallFragTest/'
SIPSimExe = '/home/nick/notebook/SIPSim/SIPSim'
In [2]:
import os
import sys
import numpy as np
import pandas as pd
In [3]:
%load_ext rpy2.ipython
%pylab inline
In [5]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [7]:
import os
# making config file
config = """
[library 1]
# baseline: no incorp
[[intraPopDist 1]]
distribution = uniform
weight = 1
[[[start]]]
[[[[interPopDist 1]]]]
distribution = uniform
start = 0
end = 0
[[[end]]]
[[[[interPopDist 1]]]]
distribution = uniform
start = 0
end = 0
[library 2]
# split intra-populations
## to get some taxa with split; use inter-pop mixture for 2nd intra-pop mu, where mixture is highly uneven
[[intraPopDist 1]]
distribution = normal
weight = 0.5
[[[mu]]]
[[[[interPopDist 1]]]]
distribution = normal
mu = 90
sigma = 2
[[[sigma]]]
[[[[interPopDist 1]]]]
distribution = normal
mu = 5
sigma = 2
[[intraPopDist 2]]
distribution = normal
weight = 0.5
[[[mu]]]
[[[[interPopDist 1]]]]
distribution = normal
mu = 5
sigma = 2
[[[sigma]]]
[[[[interPopDist 1]]]]
distribution = normal
mu = 5
sigma = 2
[library 3]
# split inter-pop distribution (some approx. full; others none)
[[intraPopDist 1]]
distribution = normal
weight = 1
[[[mu]]]
[[[[interPopDist 1]]]]
distribution = normal
mu = 90
sigma = 2
# these taxa in the community get no incorp
[[[[interPopDist 2]]]]
distribution = uniform
start = 0
end = 0
[[[sigma]]]
[[[[interPopDist 1]]]]
distribution = normal
mu = 5
sigma = 2
"""
outfile = os.path.join(workDir, 'genome10_n3.config')
outf = open(outfile, 'wb')
outf.write(config)
outf.close()
!cd $workDir; \ $SIPSimExe isoIncorp ../genome10_comm_n3.txt genome10_n3.config > genome10_comm_n3_incorp.txt
In [57]:
!cd $workDir; \
$SIPSimExe fragGC ../genomes/genomes10.txt \
--fp ../genomes/ --fr ../515Fm-927Rm.fna \
--fld uniform,500,10000 \
--flr 500,None \
--np 10 > genome10_ampFragGC_unif5-100.txt
In [58]:
%%R -i workDir
setwd(workDir)
tbl = read.delim('genome10_ampFragGC_unif5-100.txt', sep='\t')
tbl$fragLength = abs(tbl$fragEnd - tbl$fragStart) + 1
In [59]:
%%R -i workDir -h 300 -w 550
ggplot(tbl, aes(fragLength)) +
geom_histogram(binwidth=100) +
theme(
text = element_text(size=16)
)
In [60]:
%%R -i workDir -w 900
ggplot(tbl, aes(taxon_name, fragLength)) +
geom_violin() +
theme(
text = element_text(size=16),
axis.text.x = element_text(angle=90, hjust=1)
)
In [69]:
!cd $workDir; \
$SIPSimExe OTU_table \
genome10_ampFragGC_unif5-100.txt \
../genome10_comm_n3.txt \
genome10_comm_n3_incorp.txt \
../genome10_comm_n3_fracs.txt \
--abs_abund 1e8 \
--log genome10_OTU_abnd1e8_log.txt \
> genome10_unif5-100_OTU_abnd1e8.txt
In [62]:
%%R -i workDir
# loading file
infile = paste(c(workDir, 'genome10_OTU_abnd1e8_log.txt'), collapse='/')
tbl = read.delim(infile, sep='\t')
# reformat
tbl$taxon = gsub("(.*?_.*?)_(.+)", "\\1\n\\2", tbl$taxon)
tbl$fragment_length = tbl$fragment_length / 1000
In [63]:
%%R -w 1000 -h 700
# plot
tbl.l1 = tbl %>% filter(library == 1)
ggplot(tbl.l1, aes(fragment_length, fragment_GC)) +
geom_density2d() +
facet_wrap( ~ taxon) +
labs(x="fragment length [kb]", y="fragment G+C") +
theme(
text = element_text(size=18),
axis.text.x = element_text(angle=90)
)
In [75]:
%%R -i workDir
# loading file
infile = paste(c(workDir, 'genome10_unif5-100_OTU_abnd1e8.txt'), collapse='/')
tbl = read.delim(infile, sep='\t')
In [76]:
%%R
# formatting table
tbl$BD_min = gsub('-.+', '', tbl$fractions)
tbl$BD_min = as.numeric(tbl$BD_min)
tbl$BD_max = gsub('.+-', '', tbl$fractions)
tbl$BD_max = as.numeric(tbl$BD_max)
In [77]:
%%R
# summarizing counts (should be approx. total abundance)
tbl %>%
group_by(library) %>%
summarize(sum(count))
In [88]:
%%R -w 800
# plotting absolute abundances
## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66
## plot
ggplot(tbl, aes(BD_min, count, fill=taxon, group=taxon)) +
geom_area(stat='identity', alpha=0.5, position='dodge') +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
facet_grid(library ~ .) +
labs(x='Buoyant density') +
theme_bw() +
theme( text = element_text(size=16) )
In [89]:
%%R -w 800
# plotting relative abundances
ggplot(tbl, aes(BD_min, count, fill=taxon, group=taxon)) +
geom_area(stat='identity', alpha=0.8, position='fill') +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
facet_grid(library ~ .) +
labs(x='Buoyant density') +
theme_bw() +
theme( text = element_text(size=16) )
In [90]:
%%R -w 800
# plotting absolute abundances on log scale
## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66
## plot
ggplot(tbl, aes(BD_min, count, color=taxon, group=taxon)) +
geom_point(alpha=0.5) +
geom_line() +
geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
scale_y_log10() +
facet_grid(library ~ .) +
labs(x='Buoyant density') +
theme( text = element_text(size=16) )
Notes:
In [ ]: