In [1]:
    
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/validation/'
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/genomes/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
figDir = '/home/nick/notebook/SIPSim/figures/'
bandwidth=0.2
nprocs = 3
    
In [2]:
    
import os
import glob
import nestly
from IPython.display import Image
import dill
import numpy as np                                                              
import pandas as pd                                                             
import scipy.stats as stats
    
In [3]:
    
%load_ext rpy2.ipython
    
In [4]:
    
%%R
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(gridExtra)
    
    
In [5]:
    
if not os.path.isdir(workDir):
    os.makedirs(workDir)
    
%cd $workDir
    
    
In [33]:
    
# skewed-normal
!SIPSim fragments \
    $genomeDir/genome_index.txt \
    --fp $genomeDir \
    --fr ../../515F-806R.fna \
    --fld skewed-normal,9000,2500,-5 \
    --flr None,None \
    --nf 10000 \
    --np $nprocs \
    --tbl \
    2> ampFrags_real.log \
    > ampFrags_real.txt
    
In [34]:
    
# uniform (small)
!SIPSim fragments \
    $genomeDir/genome_index.txt \
    --fp $genomeDir \
    --fr ../../515F-806R.fna \
    --fld uniform,1000,2000 \
    --flr None,None \
    --nf 10000 \
    --np $nprocs \
    --tbl \
    2> ampFrags_sm.log \
    > ampFrags_sm.txt
    
In [35]:
    
# uniform (largs)
!SIPSim fragments \
    $genomeDir/genome_index.txt \
    --fp $genomeDir \
    --fr ../../515F-806R.fna \
    --fld uniform,50000,51000 \
    --flr None,None \
    --nf 10000 \
    --np $nprocs \
    --tbl \
    2> ampFrags_lg.log \
    > ampFrags_lg.txt
    
In [6]:
    
%%R -i workDir
setwd(workDir)
files = c('ampFrags_real.txt',
          'ampFrags_sm.txt',
          'ampFrags_lg.txt')
tbl.l = list()
for(f in files){
    tbl.l[[f]] = read.delim(f, sep='\t')
    }
tbl = do.call(rbind, tbl.l)
tbl %>% head(n=3)
    
    
In [7]:
    
%%R
tbl$fld = gsub('\\.txt.+', '', rownames(tbl))
tbl %>% head(n=3)
    
    
In [8]:
    
%%R 
fld.revalue = c('ampFrags_real' = 'skewed normal (mean=9 kb)',
                  'ampFrags_sm' = 'uniform (1-2 kb)',
                  'ampFrags_lg' = 'uniform (50-51 kb)')
taxon.revalue = c('Clostridium_ljungdahlii_DSM_13528' = 'Clostridium_ljungdahlii\nDSM_13528',
                  'Streptomyces_pratensis_ATCC_33331' = 'Streptomyces_pratensis\nATCC_33331')
tbl.f = tbl %>%
    mutate(taxon_name = revalue(taxon_name, taxon.revalue),
           fld = revalue(fld, fld.revalue)) %>%
    mutate(taxon_name = gsub('_', ' ', taxon_name))
tbl.f %>% head(n=3)
    
    
In [22]:
    
%%R -w 800 -h 350
ggplot(tbl.f %>% mutate(fragLength = fragLength / 1000), 
       aes(fragGC, fragLength, color=fld)) +
    stat_density2d() +
    scale_color_discrete(name='') +
    labs(x='Fragment G+C', y='Fragment length (kb)',
         title='An example of fragment simulation from reference genomes') +
    facet_grid(. ~ taxon_name) +
    theme_bw() +
    theme(
        text=element_text(size=16),
        title=element_text(size=13),
        axis.title.y=element_text(vjust=1)#,
#        legend.position = 'bottom'
        )
    
    
In [23]:
    
%%R -i figDir
outFile = paste(c(figDir, 'genome3_fragKDE.pdf'), collapse='/')
ggsave(outFile, width=10, height=4)
cat('File written', outFile, '\n')
    
    
In [43]:
    
!SIPSim fragment_KDE \
    ampFrags_real.txt \
    > ampFrags_real_kde.pkl
    
!SIPSim fragment_KDE \
    ampFrags_sm.txt \
    > ampFrags_sm_kde.pkl
    
!SIPSim fragment_KDE \
    ampFrags_lg.txt \
    > ampFrags_lg_kde.pkl
    
In [51]:
    
!SIPSim diffusion \
    ampFrags_real_kde.pkl \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_real_kde_dif.pkl    
    
!SIPSim diffusion \
    ampFrags_sm_kde.pkl \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_sm_kde_dif.pkl    
    
!SIPSim diffusion \
    ampFrags_lg_kde.pkl \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_lg_kde_dif.pkl
    
    
In [52]:
    
%%bash 
n=100000
SIPSim KDE_sample -n $n ampFrags_real_kde.pkl > ampFrags_real_kde.txt
SIPSim KDE_sample -n $n ampFrags_real_kde_dif.pkl > ampFrags_real_kde_dif.txt
SIPSim KDE_sample -n $n ampFrags_sm_kde.pkl > ampFrags_sm_kde.txt
SIPSim KDE_sample -n $n ampFrags_sm_kde_dif.pkl > ampFrags_sm_kde_dif.txt
SIPSim KDE_sample -n $n ampFrags_lg_kde.pkl > ampFrags_lg_kde.txt
SIPSim KDE_sample -n $n ampFrags_lg_kde_dif.pkl > ampFrags_lg_kde_dif.txt
    
In [24]:
    
%%R -i workDir
kde.files = c('ampFrags_real_kde.txt',
              'ampFrags_sm_kde.txt',
              'ampFrags_lg_kde.txt')
kde.dif.files = c('ampFrags_real_kde_dif.txt',
                  'ampFrags_sm_kde_dif.txt',
                  'ampFrags_lg_kde_dif.txt')
load.tables = function(files, data.id){
    tbl.l = list()
    for(f in files){
        F = file.path(workDir, f)
        x = read.delim(F, sep='\t')
        x = mutate(x, 
                   data = data.id,
                   fld = gsub('_kde(_dif)*.txt', '', f))
        tbl.l[[f]] = x
        }
    
    tbl = do.call(rbind, tbl.l) %>%
        gather('taxon_name','BD', 2:4)
    return(tbl)
    }
tbl1 = load.tables(kde.files, 'no diffusion')
tbl2 = load.tables(kde.dif.files, 'diffusion')
tbl = rbind(tbl1, tbl2)
tbl %>% head(n=3)
    
    
In [33]:
    
%%R 
fld.revalue = c('ampFrags_real' = 'skewed normal\n(mean = 9 kb)',
                'ampFrags_sm' = 'uniform\n(1-2 kb)',
                'ampFrags_lg' = 'uniform\n(50-51 kb)')
taxon.revalue = c('Clostridium_ljungdahlii_DSM_13528' = 'Clostridium_ljungdahlii\nDSM_13528',
                  'Streptomyces_pratensis_ATCC_33331' = 'Streptomyces_pratensis\nATCC_33331')
tbl.f = tbl %>%
    mutate(taxon_name = revalue(taxon_name, taxon.revalue),
           fld = revalue(fld, fld.revalue)) %>%
    mutate(taxon_name = gsub('_', ' ', taxon_name)) %>%
    mutate(data = ifelse(data=='diffusion', 'Gaussian BD', 'Discrete BD'))
    
In [34]:
    
%%R -w 700 -h 400
tbl.f$fld = factor(tbl.f$fld, 
                   levels=c('uniform\n(1-2 kb)','skewed normal\n(mean = 9 kb)','uniform\n(50-51 kb)'))
x.lab = expression(paste('Buoyant density (g ml' ^ '-1', ')'))
ggplot(tbl.f, aes(BD, fill=data)) +
    geom_density(alpha=0.25) +
    scale_x_continuous(limits=c(1.68, 1.74)) +
    facet_grid(fld ~ taxon_name, scales='free_y') +    
    scale_fill_discrete('') +
    labs(x = x.lab, y='Probability density') +
    theme_bw() +
    theme(
        text=element_text(size=16),
        axis.title.y = element_text(vjust=1),
        axis.text.x = element_text(angle=50, hjust=1)
        )
    
    
In [36]:
    
%%R -i figDir 
outFile = paste(c(figDir, 'genome3_fragKDE_dif.pdf'), collapse='/')
ggsave(outFile, width=10, height=5.71) 
cat('File written:', outFile, '\n')
    
    
In [57]:
    
# fraction of fragments in DBL
frac_abs = 0.001
!SIPSim DBL \
    ampFrags_real_kde_dif.pkl \
    --frac_abs $frac_abs \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_real_kde_dif_DBL.pkl    
    
!SIPSim DBL \
    ampFrags_sm_kde_dif.pkl \
    --frac_abs $frac_abs \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_sm_kde_dif_DBL.pkl    
    
!SIPSim DBL \
    ampFrags_lg_kde_dif.pkl \
    --frac_abs $frac_abs \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_lg_kde_dif_DBL.pkl
    
    
In [58]:
    
# viewing DBL logs
g_path = os.path.join(workDir, '*_DBL.log')
files = glob.glob(g_path)
for f in files:
    !tail $f
    
    
In [59]:
    
%%bash -s "$workDir"
n=100000
SIPSim KDE_sample -n $n ampFrags_real_kde_dif_DBL.pkl > ampFrags_real_kde_dif_DBL.txt
SIPSim KDE_sample -n $n ampFrags_sm_kde_dif_DBL.pkl > ampFrags_sm_kde_dif_DBL.txt
SIPSim KDE_sample -n $n ampFrags_lg_kde_dif_DBL.pkl > ampFrags_lg_kde_dif_DBL.txt
    
In [37]:
    
%%R -i workDir
kde.dif.files = c('ampFrags_real_kde_dif.txt',
                  'ampFrags_sm_kde_dif.txt',
                  'ampFrags_lg_kde_dif.txt')
kde.DBL.files = c('ampFrags_real_kde_dif_DBL.txt',
                  'ampFrags_sm_kde_dif_DBL.txt',
                  'ampFrags_lg_kde_dif_DBL.txt')
load.tables = function(files, data.id){
    tbl.l = list()
    for(f in files){
        F = file.path(workDir, f)
        x = read.delim(F, sep='\t') %>%
                mutate(data = data.id,
                       fld = gsub('_kde_dif(_DBL)*.txt', '', f))
        if (!any('libID' %in% colnames(x))){
            x$libID = '1'
        }
        tbl.l[[f]] = x
        }
    
    tbl = do.call(rbind, tbl.l) %>%
        gather('taxon_name','BD', 2:4)
    return(tbl)
    }
tbl1 = load.tables(kde.dif.files, 'diffusion')
tbl2 = load.tables(kde.DBL.files, 'diffusion_wDBL')
tbl = rbind(tbl1, tbl2)
tbl %>% head(n=3)
    
    
In [38]:
    
%%R 
fld.revalue = c('ampFrags_real' = 'skewed normal\n(mean = 9 kb)',
                'ampFrags_sm' = 'uniform\n(1-2 kb)',
                'ampFrags_lg' = 'uniform\n(50-51 kb)')
taxon.revalue = c('Clostridium_ljungdahlii_DSM_13528' = 'Clostridium_ljungdahlii\nDSM_13528',
                  'Streptomyces_pratensis_ATCC_33331' = 'Streptomyces_pratensis\nATCC_33331')
tbl.f = tbl %>%
    mutate(taxon_name = revalue(taxon_name, taxon.revalue),
           fld = revalue(fld, fld.revalue)) %>%
    mutate(taxon_name = gsub('_', ' ', taxon_name))
    
In [39]:
    
%%R -w 700 -h 400
tbl.f$fld = factor(tbl.f$fld, 
                   levels=c('uniform\n(1-2 kb)','skewed normal\n(mean = 9 kb)','uniform\n(50-51 kb)'))
p1 = ggplot(tbl.f, aes(BD, fill=data)) +
    geom_density(alpha=0.25) +
    scale_x_continuous(limits=c(1.68, 1.74)) +
    facet_grid(fld ~ taxon_name, scales='free_y') +    
    labs(x = 'Buoyant density') +
    theme_bw() +
    theme(
        text=element_text(size=16),
        axis.title.y = element_text(vjust=1),
        axis.text.x = element_text(angle=50, hjust=1)
        )
p1
    
    
In [40]:
    
%%R -w 700 -h 400
p2 = p1 + scale_y_log10()
    
In [64]:
    
# getting file listing taxon names
!SIPSim KDE_info \
    -t ampFrags.pkl \
    > taxon_names.txt
    
!head taxon_names.txt
    
    
In [65]:
    
# making a community file
comm_params = 'mean:-9.5,sigma:0.9'
!SIPSim communities \
    --abund_dist_p $comm_params \
    taxon_names.txt \
    > comm.txt
    
!head comm.txt
    
    
In [66]:
    
# fraction of fragments in DBL
!SIPSim DBL \
    ampFrags_real_kde_dif.pkl \
    --frac_abs $frac_abs \
    --comm comm.txt \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_real_kde_dif_DBL-comm.pkl    
    
!SIPSim DBL \
    ampFrags_sm_kde_dif.pkl \
    --frac_abs $frac_abs \
    --comm comm.txt \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_sm_kde_dif_DBL-comm.pkl    
    
!SIPSim DBL \
    ampFrags_lg_kde_dif.pkl \
    --frac_abs $frac_abs \
    --comm comm.txt \
    --bw $bandwidth \
    --np $nprocs \
    > ampFrags_lg_kde_dif_DBL-comm.pkl
    
    
In [ ]: