In [1]:
import pyparanoid.genomedb as gdb
In [2]:
### The pyparanoid.genomedb module contains functions for downloading and
### organizing genomic data from Ensembl, NCBI, and local sources.
### Protein fasta data is stored in the subfolder 'pep', and metadata is stored
### in the flat file 'genome_metadata.txt'.
### Folders are also created for DNA fasta and Genbank files, but these are not
### populated from Ensembl or NCBI sources, to save disk space.
### Initialize a folder for the genome database:
gdb.setupdirs("../test_genomedb")
In [3]:
### The only required argument is the path to the folder for genomic data.
### Only amino-acid (AA) fasta files are downloaded, along with some metadata.
### By default, only 10 complete genomes that haven't already been downloaded
### are fetched per call.
### The first time a new release of EnsemblBacteria is downloaded, a metadata
### file named release-XX.txt is created to facilitate future downloads. This
### lengthy step is not repeated on later calls.
gdb.download_Ensembl_files("../test_genomedb")
In [4]:
### Genomes can also be selected by name: anything whose genus or species
### matches one of the comma-separated names is downloaded - spelling counts!
### Setting maxgen=None downloads every genome that fits the criteria, and
### setting complete=False downloads draft genomes as well.
gdb.download_Ensembl_files(
    "../test_genomedb",
    maxgen=None,
    names="syringae,fluorescens",
    complete=False
)
In [5]:
### Taxonomy IDs can be used instead of names;
### consult http://bacteria.ensembl.org/species.html to look them up.
### Passing an integer as maxgen caps the number of genomes downloaded,
### which can be useful for avoiding time-out errors.
gdb.download_Ensembl_files(
    "../test_genomedb",
    maxgen=5,
    taxids="178900,178901",
    complete=False
)
In [6]:
### Files can also be downloaded from the NCBI RefSeq database.
### Species names and taxids are each given as comma-separated strings.
### 'cpus' sets how many threads are used to download genomes (default = 1).
gdb.download_Refseq_files(
    "../test_genomedb",
    cpus=4,
    names="herbaspirillum,azospirillum",
    taxids="294,178900,178901"
)
In [7]:
### After a genomic database folder has been initialized with Ensembl or NCBI
### data, in-house genomes annotated with Prokka can be added to it.
### Arguments, in order: path to the genomedb, path to the Prokka folder, and a
### species id. The species id must not already be in use in the genomedb.
gdb.add_Prokka_genome(
    "../test_genomedb",
    "../../assemblies/WCS365_prokka",
    "pseudomonas_sp_wcs365"
)
In [8]:
### The optional 'taxid' argument attaches an NCBI taxonomy code to the genome.
### When it is omitted, the taxid defaults to "2" (Bacteria).
gdb.add_Prokka_genome(
    "../test_genomedb",
    "../../assemblies/CH267_prokka",
    "pseudomonas_sp_ch267",
    taxid="294"
)
In [9]:
### Taxonomic information can also be retrieved for each genome in the database.
### It is stored in the flat file 'tax_info.txt'.
gdb.get_taxonomy("../test_genomedb")
In [16]:
### The previous methods for downloading data only download protein fasta
### files. To download DNA fasta files and genbank files there are format-specific
### commands.
### The first argument for both is an array containing the unique strain names of the species
### to download. These are the prefix before 'pep.fa' or the "species" field of the
### genome_metadata.txt files.
### As an example, to make the strain names array:
!cut -f 3 ../test_genomedb/genome_metadata.txt | head -n 10 > "test_list.txt"
strains = [line.rstrip() for line in open("test_list.txt",'r') if line.rstrip() not in ["species"]]
strains
Out[16]:
In [17]:
### Download DNA fasta files for the strain names listed in 'strains';
### the second argument is the path to the genome database folder.
gdb.download_dna_files(strains,"../test_genomedb")
In [18]:
### Download Genbank files for the strain names listed in 'strains';
### the second argument is the path to the genome database folder.
gdb.download_genbank_files(strains,"../test_genomedb")
In [ ]: