Description:

  • Downloading the E. coli O157:H7 genome (the dataset needed here) and indexing it for SIPSim

Setting variables


In [1]:
# Path to the SIPSim executable; shell cells refer to it as $SIPSimExe.
SIPSimExe = '/home/nick/notebook/SIPSim/SIPSim'
# Working directory for this notebook; the 'genomes' sub-directory is
# created underneath it.
workDir = '/home/nick/notebook/SIPSim/dev/Ecoli/'

Init


In [2]:
import os,sys
import numpy as np
import pandas as pd
from ggplot import *
import matplotlib.pyplot as plt

In [3]:
# Load the rpy2 IPython extension.
# NOTE(review): presumably needed for R cells elsewhere in the notebook;
# none are visible in this section -- confirm it is still required.
%load_ext rpy2.ipython
# Select the inline matplotlib backend so figures render in cell output.
%matplotlib inline

In [4]:
# Create the working directory if it does not exist yet.
# os.makedirs() is used instead of os.mkdir() because workDir is a nested
# path: os.mkdir() raises OSError when any parent directory is missing,
# whereas os.makedirs() creates the intermediate directories as well.
if not os.path.isdir(workDir):
    os.makedirs(workDir)

In [9]:
# Directory (under workDir) that holds the genome FASTA and index files.
genomeDir = os.path.join(workDir, 'genomes')
# os.makedirs() rather than os.mkdir(): this also creates workDir when it is
# absent, so the cell does not depend on the earlier mkdir cell having run.
if not os.path.isdir(genomeDir):
    os.makedirs(genomeDir)

Downloading genome


In [10]:
# Download the genome: seqDB_tools reads ../accession.txt (relative to
# genomeDir) on stdin and its stdout is saved as Ecoli_O157H7.fna.
# `&&` instead of `;` so that nothing is read or written when the cd fails
# (otherwise the output file would be created in the wrong directory).
!cd $genomeDir && \
    seqDB_tools accession-GI2fasta < ../accession.txt > Ecoli_O157H7.fna


Starting batch: 1
Starting trial: 1

--------------------- WARNING ---------------------
MSG: No whitespace allowed in FASTA ID [AE005174|Escherichia coli O157:H7 EDL933, complete genome.]
---------------------------------------------------

--------------------- WARNING ---------------------
MSG: No whitespace allowed in FASTA ID [AE005174|Escherichia coli O157:H7 EDL933, complete genome.]
---------------------------------------------------

Genome info


In [12]:
# Report summary statistics for the downloaded genome; with these flags the
# output is a header line plus total sequence length and total GC.
# `&&` instead of `;` so the command is skipped when the cd fails.
!cd $genomeDir && \
    seq_tools fasta_info --tl --tgc --header Ecoli_O157H7.fna


total_seq_length	total_GC
5528445	50.38

Indexing genome


In [13]:
# Build genome_index.txt: a list of all genome files and their associated names.
#   1. find:  every file under genomeDir whose name ends in "fna"
#   2. perl:  strip the leading directory part (everything up to the last "/")
#   3. perl:  split off the final extension and rewrite the line from the
#             captured name and extension (intended: name<TAB>file name)
# NOTE(review): the `\$` escapes presumably stop IPython from interpolating
# $1/$2 before the shell sees them -- confirm before changing the expression.
# `&&` instead of `;` so that a failed cd cannot make find scan, and the
# index be written into, the wrong directory.
!cd $genomeDir && \
    find . -name "*fna" | \
    perl -pe 's/.+\///' | \
    perl -pe 's/(.+)(\.[^.]+)/\$1\t\$1\$2/' > genome_index.txt

In [ ]:
# Index the genomes listed in genome_index.txt with SIPSim; `--fp .` passes
# the current directory (genomeDir after the cd) as the path argument.
# `&&` instead of `;` so the indexing is not started from the wrong directory
# when the cd fails.
!cd $genomeDir && \
    $SIPSimExe indexGenomes genome_index.txt --fp .


Indexing: "Ecoli_O157H7"
0
0: 1.81%, 0:00:00.885690
0: 3.62%, 0:00:01.596740
0: 5.43%, 0:00:02.329085

In [ ]: