In [1]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/validation/'
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/genomes/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figDir = '/home/nick/notebook/SIPSim/figures/'
bandwidth=0.2
nprocs = 3
In [2]:
import os
import glob
import nestly
from IPython.display import Image
import dill
import numpy as np
import pandas as pd
import scipy.stats as stats
In [3]:
%load_ext rpy2.ipython
In [4]:
%%R
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(gridExtra)
In [5]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
%cd $workDir
In [33]:
# skewed-normal
!SIPSim fragments \
$genomeDir/genome_index.txt \
--fp $genomeDir \
--fr ../../515F-806R.fna \
--fld skewed-normal,9000,2500,-5 \
--flr None,None \
--nf 10000 \
--np $nprocs \
--tbl \
2> ampFrags_real.log \
> ampFrags_real.txt
In [34]:
# uniform (small)
!SIPSim fragments \
$genomeDir/genome_index.txt \
--fp $genomeDir \
--fr ../../515F-806R.fna \
--fld uniform,1000,2000 \
--flr None,None \
--nf 10000 \
--np $nprocs \
--tbl \
2> ampFrags_sm.log \
> ampFrags_sm.txt
In [35]:
# uniform (largs)
!SIPSim fragments \
$genomeDir/genome_index.txt \
--fp $genomeDir \
--fr ../../515F-806R.fna \
--fld uniform,50000,51000 \
--flr None,None \
--nf 10000 \
--np $nprocs \
--tbl \
2> ampFrags_lg.log \
> ampFrags_lg.txt
In [6]:
%%R -i workDir
setwd(workDir)
files = c('ampFrags_real.txt',
'ampFrags_sm.txt',
'ampFrags_lg.txt')
tbl.l = list()
for(f in files){
tbl.l[[f]] = read.delim(f, sep='\t')
}
tbl = do.call(rbind, tbl.l)
tbl %>% head(n=3)
In [7]:
%%R
tbl$fld = gsub('\\.txt.+', '', rownames(tbl))
tbl %>% head(n=3)
In [8]:
%%R
fld.revalue = c('ampFrags_real' = 'skewed normal (mean=9 kb)',
'ampFrags_sm' = 'uniform (1-2 kb)',
'ampFrags_lg' = 'uniform (50-51 kb)')
taxon.revalue = c('Clostridium_ljungdahlii_DSM_13528' = 'Clostridium_ljungdahlii\nDSM_13528',
'Streptomyces_pratensis_ATCC_33331' = 'Streptomyces_pratensis\nATCC_33331')
tbl.f = tbl %>%
mutate(taxon_name = revalue(taxon_name, taxon.revalue),
fld = revalue(fld, fld.revalue)) %>%
mutate(taxon_name = gsub('_', ' ', taxon_name))
tbl.f %>% head(n=3)
In [22]:
%%R -w 800 -h 350
ggplot(tbl.f %>% mutate(fragLength = fragLength / 1000),
aes(fragGC, fragLength, color=fld)) +
stat_density2d() +
scale_color_discrete(name='') +
labs(x='Fragment G+C', y='Fragment length (kb)',
title='An example of fragment simulation from reference genomes') +
facet_grid(. ~ taxon_name) +
theme_bw() +
theme(
text=element_text(size=16),
title=element_text(size=13),
axis.title.y=element_text(vjust=1)#,
# legend.position = 'bottom'
)
In [23]:
%%R -i figDir
outFile = paste(c(figDir, 'genome3_fragKDE.pdf'), collapse='/')
ggsave(outFile, width=10, height=4)
cat('File written', outFile, '\n')
In [43]:
!SIPSim fragment_KDE \
ampFrags_real.txt \
> ampFrags_real_kde.pkl
!SIPSim fragment_KDE \
ampFrags_sm.txt \
> ampFrags_sm_kde.pkl
!SIPSim fragment_KDE \
ampFrags_lg.txt \
> ampFrags_lg_kde.pkl
In [51]:
!SIPSim diffusion \
ampFrags_real_kde.pkl \
--bw $bandwidth \
--np $nprocs \
> ampFrags_real_kde_dif.pkl
!SIPSim diffusion \
ampFrags_sm_kde.pkl \
--bw $bandwidth \
--np $nprocs \
> ampFrags_sm_kde_dif.pkl
!SIPSim diffusion \
ampFrags_lg_kde.pkl \
--bw $bandwidth \
--np $nprocs \
> ampFrags_lg_kde_dif.pkl
In [52]:
%%bash
n=100000
SIPSim KDE_sample -n $n ampFrags_real_kde.pkl > ampFrags_real_kde.txt
SIPSim KDE_sample -n $n ampFrags_real_kde_dif.pkl > ampFrags_real_kde_dif.txt
SIPSim KDE_sample -n $n ampFrags_sm_kde.pkl > ampFrags_sm_kde.txt
SIPSim KDE_sample -n $n ampFrags_sm_kde_dif.pkl > ampFrags_sm_kde_dif.txt
SIPSim KDE_sample -n $n ampFrags_lg_kde.pkl > ampFrags_lg_kde.txt
SIPSim KDE_sample -n $n ampFrags_lg_kde_dif.pkl > ampFrags_lg_kde_dif.txt
In [24]:
%%R -i workDir
kde.files = c('ampFrags_real_kde.txt',
'ampFrags_sm_kde.txt',
'ampFrags_lg_kde.txt')
kde.dif.files = c('ampFrags_real_kde_dif.txt',
'ampFrags_sm_kde_dif.txt',
'ampFrags_lg_kde_dif.txt')
load.tables = function(files, data.id){
tbl.l = list()
for(f in files){
F = file.path(workDir, f)
x = read.delim(F, sep='\t')
x = mutate(x,
data = data.id,
fld = gsub('_kde(_dif)*.txt', '', f))
tbl.l[[f]] = x
}
tbl = do.call(rbind, tbl.l) %>%
gather('taxon_name','BD', 2:4)
return(tbl)
}
tbl1 = load.tables(kde.files, 'no diffusion')
tbl2 = load.tables(kde.dif.files, 'diffusion')
tbl = rbind(tbl1, tbl2)
tbl %>% head(n=3)
In [33]:
%%R
fld.revalue = c('ampFrags_real' = 'skewed normal\n(mean = 9 kb)',
'ampFrags_sm' = 'uniform\n(1-2 kb)',
'ampFrags_lg' = 'uniform\n(50-51 kb)')
taxon.revalue = c('Clostridium_ljungdahlii_DSM_13528' = 'Clostridium_ljungdahlii\nDSM_13528',
'Streptomyces_pratensis_ATCC_33331' = 'Streptomyces_pratensis\nATCC_33331')
tbl.f = tbl %>%
mutate(taxon_name = revalue(taxon_name, taxon.revalue),
fld = revalue(fld, fld.revalue)) %>%
mutate(taxon_name = gsub('_', ' ', taxon_name)) %>%
mutate(data = ifelse(data=='diffusion', 'Gaussian BD', 'Discrete BD'))
In [34]:
%%R -w 700 -h 400
tbl.f$fld = factor(tbl.f$fld,
levels=c('uniform\n(1-2 kb)','skewed normal\n(mean = 9 kb)','uniform\n(50-51 kb)'))
x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))
ggplot(tbl.f, aes(BD, fill=data)) +
geom_density(alpha=0.25) +
scale_x_continuous(limits=c(1.68, 1.74)) +
facet_grid(fld ~ taxon_name, scales='free_y') +
scale_fill_discrete('') +
labs(x = x.lab, y='Probability density') +
theme_bw() +
theme(
text=element_text(size=16),
axis.title.y = element_text(vjust=1),
axis.text.x = element_text(angle=50, hjust=1)
)
In [36]:
%%R -i figDir
outFile = paste(c(figDir, 'genome3_fragKDE_dif.pdf'), collapse='/')
ggsave(outFile, width=10, height=5.71)
cat('File written:', outFile, '\n')
In [57]:
# fraction of fragments in DBL
frac_abs = 0.001
!SIPSim DBL \
ampFrags_real_kde_dif.pkl \
--frac_abs $frac_abs \
--bw $bandwidth \
--np $nprocs \
> ampFrags_real_kde_dif_DBL.pkl
!SIPSim DBL \
ampFrags_sm_kde_dif.pkl \
--frac_abs $frac_abs \
--bw $bandwidth \
--np $nprocs \
> ampFrags_sm_kde_dif_DBL.pkl
!SIPSim DBL \
ampFrags_lg_kde_dif.pkl \
--frac_abs $frac_abs \
--bw $bandwidth \
--np $nprocs \
> ampFrags_lg_kde_dif_DBL.pkl
In [58]:
# viewing DBL logs
g_path = os.path.join(workDir, '*_DBL.log')
files = glob.glob(g_path)
for f in files:
!tail $f
In [59]:
%%bash -s "$workDir"
n=100000
SIPSim KDE_sample -n $n ampFrags_real_kde_dif_DBL.pkl > ampFrags_real_kde_dif_DBL.txt
SIPSim KDE_sample -n $n ampFrags_sm_kde_dif_DBL.pkl > ampFrags_sm_kde_dif_DBL.txt
SIPSim KDE_sample -n $n ampFrags_lg_kde_dif_DBL.pkl > ampFrags_lg_kde_dif_DBL.txt
In [37]:
%%R -i workDir
kde.dif.files = c('ampFrags_real_kde_dif.txt',
'ampFrags_sm_kde_dif.txt',
'ampFrags_lg_kde_dif.txt')
kde.DBL.files = c('ampFrags_real_kde_dif_DBL.txt',
'ampFrags_sm_kde_dif_DBL.txt',
'ampFrags_lg_kde_dif_DBL.txt')
load.tables = function(files, data.id){
tbl.l = list()
for(f in files){
F = file.path(workDir, f)
x = read.delim(F, sep='\t') %>%
mutate(data = data.id,
fld = gsub('_kde_dif(_DBL)*.txt', '', f))
if (!any('libID' %in% colnames(x))){
x$libID = '1'
}
tbl.l[[f]] = x
}
tbl = do.call(rbind, tbl.l) %>%
gather('taxon_name','BD', 2:4)
return(tbl)
}
tbl1 = load.tables(kde.dif.files, 'diffusion')
tbl2 = load.tables(kde.DBL.files, 'diffusion_wDBL')
tbl = rbind(tbl1, tbl2)
tbl %>% head(n=3)
In [38]:
%%R
fld.revalue = c('ampFrags_real' = 'skewed normal\n(mean = 9 kb)',
'ampFrags_sm' = 'uniform\n(1-2 kb)',
'ampFrags_lg' = 'uniform\n(50-51 kb)')
taxon.revalue = c('Clostridium_ljungdahlii_DSM_13528' = 'Clostridium_ljungdahlii\nDSM_13528',
'Streptomyces_pratensis_ATCC_33331' = 'Streptomyces_pratensis\nATCC_33331')
tbl.f = tbl %>%
mutate(taxon_name = revalue(taxon_name, taxon.revalue),
fld = revalue(fld, fld.revalue)) %>%
mutate(taxon_name = gsub('_', ' ', taxon_name))
In [39]:
%%R -w 700 -h 400
tbl.f$fld = factor(tbl.f$fld,
levels=c('uniform\n(1-2 kb)','skewed normal\n(mean = 9 kb)','uniform\n(50-51 kb)'))
p1 = ggplot(tbl.f, aes(BD, fill=data)) +
geom_density(alpha=0.25) +
scale_x_continuous(limits=c(1.68, 1.74)) +
facet_grid(fld ~ taxon_name, scales='free_y') +
labs(x = 'Buoyant density') +
theme_bw() +
theme(
text=element_text(size=16),
axis.title.y = element_text(vjust=1),
axis.text.x = element_text(angle=50, hjust=1)
)
p1
In [40]:
%%R -w 700 -h 400
p2 = p1 + scale_y_log10()
In [64]:
# getting file listing taxon names
!SIPSim KDE_info \
-t ampFrags.pkl \
> taxon_names.txt
!head taxon_names.txt
In [65]:
# making a community file
comm_params = 'mean:-9.5,sigma:0.9'
!SIPSim communities \
--abund_dist_p $comm_params \
taxon_names.txt \
> comm.txt
!head comm.txt
In [66]:
# fraction of fragments in DBL
!SIPSim DBL \
ampFrags_real_kde_dif.pkl \
--frac_abs $frac_abs \
--comm comm.txt \
--bw $bandwidth \
--np $nprocs \
> ampFrags_real_kde_dif_DBL-comm.pkl
!SIPSim DBL \
ampFrags_sm_kde_dif.pkl \
--frac_abs $frac_abs \
--comm comm.txt \
--bw $bandwidth \
--np $nprocs \
> ampFrags_sm_kde_dif_DBL-comm.pkl
!SIPSim DBL \
ampFrags_lg_kde_dif.pkl \
--frac_abs $frac_abs \
--comm comm.txt \
--bw $bandwidth \
--np $nprocs \
> ampFrags_lg_kde_dif_DBL-comm.pkl
In [ ]: