Goal:

  • Download most up-to-date version of NCBI 'complete' genomes

Setting variables


In [17]:
workDir = '/var/seq_data/ncbi_db/genome/Jan2016/'
proksFile = 'proks_complete.txt'
taxFile = 'proks_complete_tax.txt'

Init


In [2]:
import os
%load_ext rpy2.ipython
%load_ext pushnote

In [3]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(genomes)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:stats’:

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: XML

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘XML’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tools’:

    toHTML


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: RCurl

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: bitops

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘RCurl’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tidyr’:

    complete


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: GenomicRanges

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: BiocGenerics

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: parallel

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘BiocGenerics’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:stats’:

    xtabs


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
    do.call, duplicated, eval, evalq, Filter, Find, get, intersect,
    is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax,
    pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int,
    rownames, sapply, setdiff, sort, table, tapply, union, unique,
    unlist, unsplit


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: S4Vectors

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: stats4

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘S4Vectors’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:dplyr’:

    rename


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: IRanges

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘IRanges’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tidyr’:

    expand


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:dplyr’:

    collapse, desc, slice


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: GenomeInfoDb

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: Biostrings

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: XVector

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘genomes’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:GenomeInfoDb’:

    species


  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [55]:
if not os.path.isdir(workDir):
    os.makedirs(workDir)

%cd $workDir


/var/seq_data/ncbi_db/genome/Jan2016

Loading list of complete prok genomes


In [19]:
%%R -i workDir -i proksFile 

F = file.path(workDir, proksFile)

df.proks.complete = read.delim(F, sep='\t')

# checking join
df.proks.complete %>% nrow %>% print
df.proks.complete %>% head(n=3)


[1] 4732
     pid                           name          status   released  taxid
1  12997 Acaryochloris marina MBIC11017 Complete Genome 2007-10-16 329726
2  60713 Acetobacterium woodii DSM 1030 Complete Genome 2012-02-14 931626
3 242487       Acetobacter pasteurianus Complete Genome 2015-07-21    438
   bioproject          group              subgroup    size      gc
1  PRJNA12997  Cyanobacteria Oscillatoriophycideae 8.36160 46.9889
2  PRJNA60713     Firmicutes            Clostridia 4.04478 39.3000
3 PRJNA242487 Proteobacteria   Alphaproteobacteria 2.80615 53.3000
         refseq      insdc
1   NC_009925.1 CP000828.1
2   NC_016894.1 CP002987.1
3 NZ_CP012111.1 CP012111.1
                                                                                               plasmid.refseq
1 NC_009930.1,NC_009928.1,NC_009927.1,NC_009931.1,NC_009933.1,NC_009934.1,NC_009932.1,NC_009926.1,NC_009929.1
2                                                                                                        <NA>
3                                                                                                        <NA>
                                                                                       plasmid.insdc
1 CP000842.1,CP000840.1,CP000839.1,CP000843.1,CP000845.1,CP000846.1,CP000844.1,CP000838.1,CP000841.1
2                                                                                               <NA>
3                                                                                               <NA>
   wgs scaffolds genes proteins   modified                             center
1 <NA>        10  7469     7187 2015-07-30              Washington University
2 <NA>         1  3649     3521 2015-08-18 Georg-August-University Goettingen
3 <NA>         1  2688     2535 2015-12-09      Zhejiang Gongshang University
     biosample        assembly reference ftp   pubmed
1 SAMN02604308 GCA_000018105.1      REFR  NA 18252824
2 SAMN02603267 GCA_000247605.1      REFR  NA 22479398
3 SAMN02709032 GCA_001183745.1      <NA>  NA     <NA>

In [20]:
%%R -i workDir -i taxFile

F = file.path(workDir, taxFile)

df.tax = read.delim(F, sep='\t') %>%
    distinct(taxid)
df.proks.complete = dplyr::inner_join(df.proks.complete, df.tax, c('taxid' = 'taxid'))

# checking join
df.proks.complete %>% nrow %>% print
df.proks.complete %>% nrow %>% print
df.proks.complete %>% head(n=3)


[1] 4732
[1] 4732
     pid                           name          status   released  taxid
1  12997 Acaryochloris marina MBIC11017 Complete Genome 2007-10-16 329726
2  60713 Acetobacterium woodii DSM 1030 Complete Genome 2012-02-14 931626
3 242487       Acetobacter pasteurianus Complete Genome 2015-07-21    438
   bioproject          group              subgroup    size      gc
1  PRJNA12997  Cyanobacteria Oscillatoriophycideae 8.36160 46.9889
2  PRJNA60713     Firmicutes            Clostridia 4.04478 39.3000
3 PRJNA242487 Proteobacteria   Alphaproteobacteria 2.80615 53.3000
         refseq      insdc
1   NC_009925.1 CP000828.1
2   NC_016894.1 CP002987.1
3 NZ_CP012111.1 CP012111.1
                                                                                               plasmid.refseq
1 NC_009930.1,NC_009928.1,NC_009927.1,NC_009931.1,NC_009933.1,NC_009934.1,NC_009932.1,NC_009926.1,NC_009929.1
2                                                                                                        <NA>
3                                                                                                        <NA>
                                                                                       plasmid.insdc
1 CP000842.1,CP000840.1,CP000839.1,CP000843.1,CP000845.1,CP000846.1,CP000844.1,CP000838.1,CP000841.1
2                                                                                               <NA>
3                                                                                               <NA>
   wgs scaffolds genes proteins   modified                             center
1 <NA>        10  7469     7187 2015-07-30              Washington University
2 <NA>         1  3649     3521 2015-08-18 Georg-August-University Goettingen
3 <NA>         1  2688     2535 2015-12-09      Zhejiang Gongshang University
     biosample        assembly reference ftp   pubmed superkingdom
1 SAMN02604308 GCA_000018105.1      REFR  NA 18252824     Bacteria
2 SAMN02603267 GCA_000247605.1      REFR  NA 22479398     Bacteria
3 SAMN02709032 GCA_001183745.1      <NA>  NA     <NA>     Bacteria
          phylum               class            order           family
1  Cyanobacteria                <NA>    Chroococcales             <NA>
2     Firmicutes          Clostridia    Clostridiales   Eubacteriaceae
3 Proteobacteria Alphaproteobacteria Rhodospirillales Acetobacteraceae
           genus               species
1  Acaryochloris  Acaryochloris marina
2 Acetobacterium Acetobacterium woodii
3    Acetobacter                  <NA>

Just Bacteria


In [21]:
%%R
df.bac.complete = df.proks.complete %>%
    filter(superkingdom == 'Bacteria')

df.bac.complete %>% nrow


[1] 4498

Phylum representation


In [22]:
%%R -w 800
df.bac.complete.s = df.bac.complete %>%
    group_by(phylum) %>%
    summarize(n = n()) %>%
    filter(! is.na(n), n > 0)

ggplot(df.bac.complete.s, aes(phylum, n)) +
    geom_bar(stat='identity') +
    scale_y_log10() +
    labs(y = 'Number of genomes') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=60, hjust=1)
    )


removing what are really phage/plasmid genomes


In [23]:
%%R
cat('Pre-filter:', df.bac.complete %>% nrow, '\n')

to.rm = c("Thermoanaerobacterium saccharolyticum JW/SL-YS485",
          "Streptococcus salivarius 57.I")

df.bac.complete = df.bac.complete %>%
    filter(! name %in% to.rm)

cat('Post-filter:', df.bac.complete %>% nrow, '\n')


Pre-filter: 4498 
Post-filter: 4496 

Sequence download


In [56]:
%%R -i workDir

outFile = file.path(workDir, 'bac_complete.txt')
write.table(df.bac.complete, outFile, sep='\t', quote=FALSE, row.names=FALSE)

In [ ]:
!seqDB_tools accession-GI2fasta \
    -a 11 -n 2 -f 12 -header -o bac_complete \
    < bac_complete.txt \
    2> bac_complete.log

In [ ]:
%pushnote genome download complete

Getting list of empty genome files


In [38]:
fileSizes = !ls -tlc *.fna | perl -pe 's/[ \t]+/ /g' 

outFile = 'empty_genome_files.txt'
with open(outFile, 'wb') as outFH:
    for x in fileSizes:
        xx = x.split(' ')
        if xx[4] == '0':
            xx[-1] = xx[-1].replace('_', ' ').rstrip('.fna')
            outFH.write(xx[-1] + '\n')

# status
!printf 'Number of empty genome files: '     
!wc -l $outFile
!head $outFile


Number of empty genome files: 13 empty_genome_files.txt
Proteus mirabilis
Pseudomonas chlororaphis subsp aurantiac
Pseudomonas putida BIRD-1
Pseudothermotoga elfii DSM 9442 NBRC 107921
Pseudomonadaceae bacterium B4199
Pseudomonas syringae pv syringae B728
Pseudomonas stutzeri DSM 4166
Pseudomonas sp CCOS 191
Pseudomonas aeruginosa PA1R
Pusillimonas sp T7-7

Deleting empty files


In [48]:
fileSizes = !ls -tlc bac_complete/*.fna | perl -pe 's/[ \t]+/ /g' 

for x in fileSizes:
    xx = x.split(' ')
    if float(xx[4]) < 100000.0:
        os.remove(xx[-1])

Checking output


In [49]:
genomeDir = os.path.join(workDir, 'bac_complete')
%cd $genomeDir


/var/seq_data/ncbi_db/genome/Jan2016/bac_complete

In [50]:
# number of genomes downloaded
!printf "Number of bacterial genomes: "
!find . -name "*.fna" | wc -l


Number of bacterial genomes: 0

In [51]:
# file size
!echo "Genome file size distribution (bytes):"
!ls -tlc *.fna | \
    perl -pe 's/ +/\t/g' | \
    cut -f 5 | NY_misc_perl stats_descriptive


Genome file size distribution (bytes):
ls: cannot access *.fna: No such file or directory

In [45]:
# checking for non-bacterial genomes
!find . -name "*fna" | xargs -P 20 egrep "phage|virus|phage"


^C

In [ ]:
# deleting non-bacterial genomes
!rm -f ./Clostridium_perfringens_SM101.fna \
    ./Chlamydophila_pneumoniae_AR39.fna \
    ./Enterococcus_faecalis_62.fna

In [ ]:
# number of genomes downloaded
!printf "Number of bacterial genomes: "
!find . -name "*.fna" | wc -l

Renaming genomes


In [ ]:
genomeDirRn = genomeDir + '_rn'
genomeDirRn

In [ ]:
# renameing
!find . -name "*.fna" | \
    SIPSim genome_rename -n 26 --prefix $genomeDirRn -

In [ ]: