Goal

Validation of fragment simulation.

  • Settings to test:
    • fragment length distribution
    • number of fragments

Setting variables


In [4]:
workDir = '/home/nick/notebook/SIPSim/dev/Ecoli/'
genomeDir = '/home/nick/notebook/SIPSim/dev/Ecoli/genomes/'

Init


In [6]:
import glob
import nestly

In [3]:
%load_ext rpy2.ipython

In [42]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: grid

Setting up nestly


In [71]:
# building tree structure
nest = nestly.Nest()

## varying params
nest.add('fld', 
         ['uniform,1000,2000',
          'uniform,20000,20500',
          'uniform,50000,51000',
          'normal,9000,2500',
          'normal,12000,500',
          'skewed-normal,9000,2500,-5'])

## set params
nest.add('np', [5], create_dir=False)

## input/output files
nest.add('fileName', ['ampFrags'], create_dir=False)
nest.add('genome_index', [os.path.join(genomeDir, 'genome_index.txt')], create_dir=False)
nest.add('genome_dir', [genomeDir], create_dir=False)
nest.add('primers', [os.path.join(workDir, '../', '515F-806R.fna')], create_dir=False)

# building directory tree
buildDir = os.path.join(workDir, 'frag_sim')
nest.build(buildDir)

In [72]:
bashFile = os.path.join(workDir, 'SIPSimRun.sh')

In [73]:
%%writefile $bashFile
#!/bin/bash

# simulating fragments
SIPSim fragments \
    {genome_index} \
    --fp {genome_dir} \
    --fr {primers} \
    --fld {fld} \
    --flr None,None \
    --nf 10000 \
    --np {np} \
    --tbl \
    2> {fileName}.log \
    > {fileName}.txt


Overwriting /home/nick/notebook/SIPSim/dev/Ecoli/SIPSimRun.sh

In [74]:
!chmod 775 $bashFile

In [75]:
!cd $workDir; \
    nestrun -j 4 --template-file $bashFile -d frag_sim


2015-06-30 14:57:01,422 * INFO * Template: ./SIPSimRun.sh
2015-06-30 14:57:01,424 * INFO * [161035] Started ./SIPSimRun.sh in frag_sim/normal,12000,500
2015-06-30 14:57:01,426 * INFO * [161036] Started ./SIPSimRun.sh in frag_sim/uniform,50000,51000
2015-06-30 14:57:01,428 * INFO * [161038] Started ./SIPSimRun.sh in frag_sim/normal,9000,2500
2015-06-30 14:57:01,429 * INFO * [161040] Started ./SIPSimRun.sh in frag_sim/uniform,20000,20500
2015-06-30 14:57:04,211 * INFO * [161038] frag_sim/normal,9000,2500 Finished with 0
2015-06-30 14:57:04,212 * INFO * [161171] Started ./SIPSimRun.sh in frag_sim/skewed-normal,9000,2500,-5
2015-06-30 14:57:04,540 * INFO * [161035] frag_sim/normal,12000,500 Finished with 0
2015-06-30 14:57:04,542 * INFO * [161187] Started ./SIPSimRun.sh in frag_sim/uniform,1000,2000
2015-06-30 14:57:05,639 * INFO * [161040] frag_sim/uniform,20000,20500 Finished with 0
2015-06-30 14:57:06,203 * INFO * [161187] frag_sim/uniform,1000,2000 Finished with 0
2015-06-30 14:57:06,754 * INFO * [161171] frag_sim/skewed-normal,9000,2500,-5 Finished with 0
2015-06-30 14:57:09,076 * INFO * [161036] frag_sim/uniform,50000,51000 Finished with 0

In [76]:
!cd $workDir; \
    nestagg delim -t -k fld -d frag_sim -o frag_sim/ampFrags.txt ampFrags.txt

Plotting fragments


In [77]:
%%R -i workDir
setwd(workDir)

tbl = read.delim('frag_sim/ampFrags.txt', sep='\t')
tbl %>% head


    taxon_name
1 Ecoli_O157H7
2 Ecoli_O157H7
3 Ecoli_O157H7
4 Ecoli_O157H7
5 Ecoli_O157H7
6 Ecoli_O157H7
                                                                                                    scaffoldID
1 AE005174|Escherichia coli O157:H7 EDL933, complete genome. Escherichia coli O157:H7 EDL933, complete genome.
2 AE005174|Escherichia coli O157:H7 EDL933, complete genome. Escherichia coli O157:H7 EDL933, complete genome.
3 AE005174|Escherichia coli O157:H7 EDL933, complete genome. Escherichia coli O157:H7 EDL933, complete genome.
4 AE005174|Escherichia coli O157:H7 EDL933, complete genome. Escherichia coli O157:H7 EDL933, complete genome.
5 AE005174|Escherichia coli O157:H7 EDL933, complete genome. Escherichia coli O157:H7 EDL933, complete genome.
6 AE005174|Escherichia coli O157:H7 EDL933, complete genome. Escherichia coli O157:H7 EDL933, complete genome.
  fragStart fragLength   fragGC              fld
1   3979080      12505 50.36403 normal,12000,500
2   1334405      11727 45.31423 normal,12000,500
3     52632      11810 51.63858 normal,12000,500
4   2717544      11406 51.03563 normal,12000,500
5    253448      11302 52.42899 normal,12000,500
6   4821613      11877 50.89699 normal,12000,500

In [78]:
%%R -w 950 -h 450

ggplot(tbl, aes(fragGC, fragLength)) +
    geom_point(shape='O') +
    labs(x='Fragment G+C', y='Fragment length (bp)') +
    facet_wrap(~ fld) +
    theme_bw() +
    theme(
        text=element_text(size=16),
        axis.title.y=element_text(vjust=1)
        )



In [82]:
%%R -w 950 -h 450

ggplot(tbl, aes(fragLength)) +
    geom_histogram(binwidth=100) +
    labs(x='Fragment G+C', y='Fragment length (bp)') +
    facet_wrap(~ fld) +
    theme_bw() +
    theme(
        text=element_text(size=16),
        axis.title.y=element_text(vjust=1)
        )



In [ ]: