Description:

  • Getting the needed dataset: downloading the E. coli O157:H7 EDL933 genome (accession AE005174) and indexing it with SIPSim

Setting variables


In [1]:
# Working directory for this notebook; a later cell creates it if it is missing
# and the genome directory is placed underneath it.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
workDir = '/home/nick/notebook/SIPSim/dev/Ecoli/'
# Path to the SIPSim executable that the shell cells invoke via $SIPSimExe.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
SIPSimExe = '/home/nick/notebook/SIPSim/SIPSim'

Init


In [2]:
import os,sys
import numpy as np
import pandas as pd
from ggplot import *
import matplotlib.pyplot as plt

In [3]:
# Load the rpy2 IPython extension.
# NOTE(review): no R cells are visible in this part of the notebook -- confirm it is still needed.
%load_ext rpy2.ipython
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Create the working directory if it does not exist yet.
# os.makedirs (instead of os.mkdir) also creates any missing parent
# directories, so the cell does not raise OSError on a machine where the
# intermediate folders of workDir are absent. The isdir guard keeps the cell
# safe to re-run.
if not os.path.isdir(workDir):
    os.makedirs(workDir)

In [9]:
# Directory (inside workDir) where the genome FASTA file and the genome index
# list are written by the shell cells.
genomeDir = os.path.join(workDir, 'genomes')
# os.makedirs (instead of os.mkdir) creates workDir as well when it is missing,
# so this cell does not fail with OSError if it is run on its own. The isdir
# guard keeps the cell safe to re-run.
if not os.path.isdir(genomeDir):
    os.makedirs(genomeDir)

Downloading genome


In [10]:
# Fetch the genome as FASTA into genomeDir: seqDB_tools reads accession.txt
# (located one level above genomeDir) on stdin and its stdout is saved as
# Ecoli_O157H7.fna.
# `&&` instead of `;`: if the cd fails, the download must not run and the
# redirect must not create/overwrite a file in whatever directory we are in.
!cd $genomeDir && \
    seqDB_tools accession-GI2fasta < ../accession.txt > Ecoli_O157H7.fna


Starting batch: 1
Starting trial: 1

--------------------- WARNING ---------------------
MSG: No whitespace allowed in FASTA ID [AE005174|Escherichia coli O157:H7 EDL933, complete genome.]
---------------------------------------------------

--------------------- WARNING ---------------------
MSG: No whitespace allowed in FASTA ID [AE005174|Escherichia coli O157:H7 EDL933, complete genome.]
---------------------------------------------------

Genome info


In [12]:
# Report summary statistics for the downloaded genome (the output columns are
# total_seq_length and total_GC).
# `&&` instead of `;`: only run seq_tools when the cd into genomeDir succeeded,
# so a missing directory gives a clear cd error instead of a misleading
# "file not found" from the wrong location.
!cd $genomeDir && \
    seq_tools fasta_info --tl --tgc --header Ecoli_O157H7.fna


total_seq_length	total_GC
5528445	50.38

Indexing genome


In [13]:
# Build genome_index.txt: a list of all genome files and their associated names.
#   1. find    -- list every file under genomeDir whose name ends in "fna"
#   2. perl #1 -- strip the leading directory part (everything up to the last "/")
#   3. perl #2 -- split "name.ext" and emit the name, a tab, then the file name
# NOTE(review): the `\$` escapes inside the perl expression are kept exactly as
# written; how they reach perl depends on IPython's `$` variable expansion --
# verify against the generated file before changing them.
# `&&` instead of `;`: if the cd fails, do not scan and write genome_index.txt
# in an unrelated directory.
!cd $genomeDir && \
    find . -name "*fna" | \
    perl -pe 's/.+\///' | \
    perl -pe 's/(.+)(\.[^.]+)/\$1\t\$1\$2/' > genome_index.txt

In [14]:
# Index the genomes listed in genome_index.txt with SIPSim; `--fp .` points
# SIPSim at the current directory (genomeDir) for the genome files.
# `&&` instead of `;`: only start indexing when the cd into genomeDir
# succeeded, since both genome_index.txt and `--fp .` are relative to it.
!cd $genomeDir && \
    $SIPSimExe indexGenomes genome_index.txt --fp .


Indexing: "Ecoli_O157H7"
0
0: 1.81%, 0:00:00.885690
0: 3.62%, 0:00:01.596740
0: 5.43%, 0:00:02.329085
0: 7.24%, 0:00:03.076365
0: 9.04%, 0:00:03.836068
0: 10.85%, 0:00:04.601638
0: 12.66%, 0:00:05.373581
0: 14.47%, 0:00:06.146811
0: 16.28%, 0:00:06.922546
0: 18.09%, 0:00:07.701694
0: 19.90%, 0:00:08.481591
0: 21.71%, 0:00:09.262435
0: 23.51%, 0:00:10.041371
0: 25.32%, 0:00:10.826290
0: 27.13%, 0:00:11.612463
0: 28.94%, 0:00:12.398390
0: 30.75%, 0:00:13.183941
0: 32.56%, 0:00:13.945560
0: 34.37%, 0:00:14.730293
0: 36.18%, 0:00:15.517682
0: 37.99%, 0:00:16.308253
0: 39.79%, 0:00:17.096117
0: 41.60%, 0:00:17.888693
0: 43.41%, 0:00:18.681459
0: 45.22%, 0:00:19.476137
0: 47.03%, 0:00:20.270224
0: 48.84%, 0:00:21.065014
0: 50.65%, 0:00:21.858216
0: 52.46%, 0:00:22.654421
0: 54.26%, 0:00:23.445138
0: 56.07%, 0:00:24.242237
0: 57.88%, 0:00:25.041500
0: 59.69%, 0:00:25.841211
0: 61.50%, 0:00:26.642007
0: 63.31%, 0:00:27.443267
0: 65.12%, 0:00:28.243499
0: 66.93%, 0:00:29.047837
0: 68.74%, 0:00:29.852749
0: 70.54%, 0:00:30.658524
0: 72.35%, 0:00:31.465123
0: 74.16%, 0:00:32.273945
0: 75.97%, 0:00:33.081771
0: 77.78%, 0:00:33.890411
0: 79.59%, 0:00:34.702737
0: 81.40%, 0:00:35.515972
0: 83.21%, 0:00:36.327419
0: 85.02%, 0:00:37.138094
0: 86.82%, 0:00:37.951932
0: 88.63%, 0:00:38.765231
0: 90.44%, 0:00:39.581108
0: 92.25%, 0:00:40.395221
0: 94.06%, 0:00:41.212436
0: 95.87%, 0:00:42.029935
0: 97.68%, 0:00:42.847313
0: 99.49%, 0:00:43.665670
Time used: 0:00:46.529657
Done.
#-- All genomes indexed --#

In [ ]: