Goal

  • simulating amplicon fragments for genomes in non-singleton OTUs

Setting variables


In [1]:
import os

workDir = '/var/seq_data/ncbi_db/genome/Jan2016/ampFrags/'
genomeDir = '/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_rn/'
ampliconFile = '/var/seq_data/ncbi_db/genome/Jan2016/rnammer_aln/otusn_map_nonSingle.txt'

Init


In [2]:
%load_ext rpy2.ipython
%load_ext pushnote

In [3]:
%%R
library(dplyr)
library(tidyr)
library(ggplot2)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:stats’:

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [4]:
if not os.path.isdir(workDir):
    os.makedirs(workDir)
    
%cd $workDir


/var/seq_data/ncbi_db/genome/Jan2016/ampFrags

In [5]:
# simlink amplicon OTU map file
tmp = os.path.join(workDir, '../', ampliconFile)
!ln -s -f $tmp .

Indexing genomes


In [6]:
!head -n 3 $ampliconFile


rRNA_NZ_CP009129_Planococcus_sp__PAMC_21323__Planococcus_sp__PAMC_21323_424242-425779_DIR+	OTU.1	100	253	0	0	1	253	1	253	*	*	Planococcus_sp_PAMC_21323.fna	39
rRNA_NZ_CP009129_Planococcus_sp__PAMC_21323__Planococcus_sp__PAMC_21323_836724-838261_DIR+	OTU.1	100	253	0	0	1	253	1	253	*	*	Planococcus_sp_PAMC_21323.fna	39
rRNA_NZ_CP009129_Planococcus_sp__PAMC_21323__Planococcus_sp__PAMC_21323_3114313-3115850_DIR+	OTU.1	100	253	0	0	1	253	1	253	*	*	Planococcus_sp_PAMC_21323.fna	39

In [7]:
!cut -f 13 $ampliconFile | head


Planococcus_sp_PAMC_21323.fna
Planococcus_sp_PAMC_21323.fna
Planococcus_sp_PAMC_21323.fna
Planococcus_sp_PAMC_21323.fna
Planococcus_sp_PAMC_21323.fna
Plautia_stali_symbiont.fna
Pluralibacter_gergoviae.fna
Planococcus_sp_PAMC_21323.fna
Plautia_stali_symbiont.fna
Plautia_stali_symbiont.fna
cut: write error: Broken pipe

symlinking all genomes of interest


In [8]:
!cut -f 13 $ampliconFile | \
    sort -u | \
    perl -pe 's|^|../bac_complete_rn/|' | \
    xargs -I % ln -s -f % .

In [9]:
!cut -f 13 $ampliconFile | sort -u | wc -l
!find . -name "*.fna" | wc -l


3217
3217

In [10]:
!ls -thlc 2>/dev/null | head -n 4


total 3.1M
lrwxrwxrwx 1 nick seq_data_users  67 Feb 14 10:26 Zymomonas_mobilis_subsp_mobilis_NRRL_B-12526.fna -> ../bac_complete_rn/Zymomonas_mobilis_subsp_mobilis_NRRL_B-12526.fna
lrwxrwxrwx 1 nick seq_data_users  75 Feb 14 10:26 Zymomonas_mobilis_subsp_mobilis_str_CP4_NRRL_B-14023.fna -> ../bac_complete_rn/Zymomonas_mobilis_subsp_mobilis_str_CP4_NRRL_B-14023.fna
lrwxrwxrwx 1 nick seq_data_users  69 Feb 14 10:26 Zymomonas_mobilis_subsp_mobilis_ZM4_ATCC_31821.fna -> ../bac_complete_rn/Zymomonas_mobilis_subsp_mobilis_ZM4_ATCC_31821.fna

Making genome -> genome_file index


In [13]:
!cut -f 13 $ampliconFile | perl -pe 's/(.+).fna/\$1\t\$1\.fna/' | sort -u > genome_index.txt
!wc -l genome_index.txt
!head genome_index.txt


3217 genome_index.txt
Acaryochloris_marina_MBIC11017	Acaryochloris_marina_MBIC11017.fna
Acetobacterium_woodii_DSM_1030	Acetobacterium_woodii_DSM_1030.fna
Acetobacter_pasteurianus_386B	Acetobacter_pasteurianus_386B.fna
Acetobacter_pasteurianus	Acetobacter_pasteurianus.fna
Acetobacter_pasteurianus_IFO_3283-01-42C	Acetobacter_pasteurianus_IFO_3283-01-42C.fna
Acetobacter_pasteurianus_IFO_3283-01	Acetobacter_pasteurianus_IFO_3283-01.fna
Acetobacter_pasteurianus_IFO_3283-03	Acetobacter_pasteurianus_IFO_3283-03.fna
Acetobacter_pasteurianus_IFO_3283-07	Acetobacter_pasteurianus_IFO_3283-07.fna
Acetobacter_pasteurianus_IFO_3283-12	Acetobacter_pasteurianus_IFO_3283-12.fna
Acetobacter_pasteurianus_IFO_3283-22	Acetobacter_pasteurianus_IFO_3283-22.fna

Index genomes


In [14]:
!SIPSim genome_index \
    genome_index.txt \
    --fp . --np 26 \
    > index_log.txt \
    2> index_log_err.txt

In [17]:
!find . -name "*sqlite3.db" | wc -l


3217

Simulating fragments


In [18]:
# copy primer file
!cp /home/nick/notebook/SIPSim/dev/515F-806R.fna ../

In [ ]:
!SIPSim fragments \
    genome_index.txt \
    --fp $workDir \
    --fr ../515F-806R.fna \
    --fld skewed-normal,9000,2500,-5 \
    --flr None,None \
    --nf 10000 \
    --np 20 \
    2> ../ampFrags.log \
    > ../ampFrags.pkl

In [21]:
!ls -thlc ../ampFrags.pkl


-rw-rw-r-- 1 nick seq_data_users 1.5G Feb 14 15:01 ../ampFrags.pkl

Converting to a KDE


In [23]:
!SIPSim fragment_KDE ../ampFrags.pkl > ../ampFrags_KDE.pkl

In [25]:
!ls -thlc ../ampFrags_KDE.pkl


-rw-rw-r-- 1 nick seq_data_users 464M Feb 15 08:03 ../ampFrags_KDE.pkl

In [ ]: