Goal

  • Testing options for genome reference dataset download

In [1]:
%load_ext rpy2.ipython

In [5]:
%%R
# Attach plotting / data-wrangling packages first, then genomes.
# Order matters: per the startup messages of this cell, attaching genomes and
# its dependencies masks several dplyr/tidyr functions (tidyr: complete, expand;
# dplyr: combine, intersect, setdiff, union, rename, collapse, desc, slice).
# Call those with an explicit dplyr:: / tidyr:: prefix if they are needed later.
library(ggplot2)
library(dplyr)
library(tidyr)
library(genomes)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: XML

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘XML’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tools’:

    toHTML


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: RCurl

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: bitops

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘RCurl’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tidyr’:

    complete


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: GenomicRanges

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: BiocGenerics

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: parallel

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘BiocGenerics’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:stats’:

    xtabs


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
    do.call, duplicated, eval, evalq, Filter, Find, get, intersect,
    is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax,
    pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int,
    rownames, sapply, setdiff, sort, table, tapply, union, unique,
    unlist, unsplit


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: S4Vectors

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: stats4

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘S4Vectors’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:dplyr’:

    rename


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: IRanges

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘IRanges’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tidyr’:

    expand


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:dplyr’:

    collapse, desc, slice


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: GenomeInfoDb

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: Biostrings

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: XVector

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘genomes’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:GenomeInfoDb’:

    species


  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [10]:
%%R
# Load the `proks` dataset into the R session and print its overview
# (total projects, counts by assembly status, most recent submissions).
data(list = 'proks')
summary(proks)


$`Total genomes`
[1] 27570 genome projects on Sep 04, 2014

$`By status`
                     Total
Contig               13074
Scaffold             10718
Gapless Chromosome    3053
Chromosome             373
Chromosome with gaps   343
Complete                 9

$`Recent submissions`
  released   name                      status  
1 2014-09-02 Altuibacter lentus        Scaffold
2 2014-09-02 Bacillus cereus ATCC 4342 Scaffold
3 2014-09-02 Bacillus licheniformis    Scaffold
4 2014-09-02 Bacillus megaterium       Scaffold
5 2014-09-02 Paenibacillus macerans    Scaffold


In [11]:
%%R
# Refresh `proks`. In the run recorded here the call reports that proks was
# updated (project IDs added and removed), and the following summary shows the
# new snapshot, so `proks` is modified without an explicit reassignment.
# NOTE(review): presumably this queries a remote source, so results depend on
# the run date -- confirm, and consider caching the refreshed table.
update(proks)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: proks has been successfully updated

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 29217 new project IDs added

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 235 old project IDs removed

  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [12]:
%%R
# Re-print the overview to check what the update changed
# (snapshot date, total project count, status categories).
summary(proks)


$`Total genomes`
[1] 57093 genome projects on Jan 06, 2016

$`By status`
                Total
Contig          34539
Scaffold        17035
Complete Genome  4649
Chromosome        870

$`Recent submissions`
  released   name                       status         
1 2015-12-30 Listeria monocytogenes     Complete Genome
2 2015-12-30 Listeria monocytogenes     Complete Genome
3 2015-12-30 Listeria monocytogenes     Complete Genome
4 2015-12-30 Tenacibaculum dicentrarchi Complete Genome
5 2015-12-29 Acinetobacter johnsonii    Contig         


In [23]:
%%R -w 600 -h 300
# Histogram of G+C content over every entry in `proks` (bins of width 1);
# the dashed red vertical line marks 50 on the G+C axis.
ggplot(data = proks, mapping = aes(x = gc)) +
    geom_histogram(binwidth = 1) +
    geom_vline(xintercept = 50, linetype = "dashed", colour = "red", alpha = 0.7) +
    labs(x = "G+C") +
    theme(text = element_text(size = 16))



In [17]:
%%R
# Keep only the rows whose assembly status is 'Complete Genome',
# working on a plain data.frame copy of `proks`, then preview the first rows.
proks.complete = filter(as.data.frame(proks), status == 'Complete Genome')
head(proks.complete)


     pid                                     name          status   released
1  12997           Acaryochloris marina MBIC11017 Complete Genome 2007-10-16
2  60713           Acetobacterium woodii DSM 1030 Complete Genome 2012-02-14
3 242487                 Acetobacter pasteurianus Complete Genome 2015-07-21
4 214045            Acetobacter pasteurianus 386B Complete Genome 2013-08-01
5  31129     Acetobacter pasteurianus IFO 3283-01 Complete Genome 2009-08-27
6  31141 Acetobacter pasteurianus IFO 3283-01-42C Complete Genome 2009-08-27
    taxid  bioproject          group              subgroup    size      gc
1  329726  PRJNA12997  Cyanobacteria Oscillatoriophycideae 8.36160 46.9889
2  931626  PRJNA60713     Firmicutes            Clostridia 4.04478 39.3000
3     438 PRJNA242487 Proteobacteria   Alphaproteobacteria 2.80615 53.3000
4 1266844   PRJEB1172 Proteobacteria   Alphaproteobacteria 3.07865 52.8532
5  634452  PRJDA31129 Proteobacteria   Alphaproteobacteria 3.34025 53.0701
6  634458  PRJDA31141 Proteobacteria   Alphaproteobacteria 3.24799 53.1587
         refseq      insdc
1   NC_009925.1 CP000828.1
2   NC_016894.1 CP002987.1
3 NZ_CP012111.1 CP012111.1
4   NC_021991.1 HF677570.1
5   NC_013209.1 AP011121.1
6   NC_017150.1 AP011163.1
                                                                                               plasmid.refseq
1 NC_009926.1,NC_009929.1,NC_009927.1,NC_009928.1,NC_009930.1,NC_009934.1,NC_009931.1,NC_009933.1,NC_009932.1
2                                                                                                        <NA>
3                                                                                                        <NA>
4                       NC_021992.1,NC_021976.1,NC_021993.1,NC_021978.1,NC_021979.1,NZ_HF677572.1,NC_021977.1
5                                     NC_013210.1,NC_013212.1,NC_013213.1,NC_013214.1,NC_013211.1,NC_013215.1
6                                     NC_017105.1,NC_017104.1,NC_017151.1,NC_017107.1,NC_017106.1,NC_017152.1
                                                                                       plasmid.insdc
1 CP000838.1,CP000841.1,CP000839.1,CP000840.1,CP000842.1,CP000846.1,CP000843.1,CP000845.1,CP000844.1
2                                                                                               <NA>
3                                                                                               <NA>
4                       HF677573.1,HF677571.1,HF677575.1,HF677576.1,HF677577.1,HF677572.1,HF677574.1
5                                  AP011122.1,AP011124.1,AP011125.1,AP011126.1,AP011123.1,AP011127.1
6                                  AP011165.1,AP011164.1,AP011166.1,AP011168.1,AP011167.1,AP011169.1
   wgs scaffolds genes proteins   modified
1 <NA>        10  7469     7187 2015-07-30
2 <NA>         1  3649     3521 2015-08-18
3 <NA>         1  2688     2535 2015-12-09
4 <NA>         8  2866     2740 2015-08-18
5 <NA>         7  3148     3019 2015-08-19
6 <NA>         7  3070     2955 2015-08-19
                                                 center    biosample
1                                 Washington University SAMN02604308
2                    Georg-August-University Goettingen SAMN02603267
3                         Zhejiang Gongshang University SAMN02709032
4                            Vrije Universiteit Brussel SAMEA3139047
5 Acetobacter pasteurianus genome sequencing consortium         <NA>
6 Acetobacter pasteurianus genome sequencing consortium         <NA>
         assembly reference ftp   pubmed
1 GCA_000018105.1      REFR  NA 18252824
2 GCA_000247605.1      REFR  NA 22479398
3 GCA_001183745.1      <NA>  NA     <NA>
4 GCA_000723785.1      <NA>  NA 23902333
5 GCA_000010825.1      REFR  NA 19638423
6 GCA_000010945.1      <NA>  NA 19638423

In [21]:
%%R -w 600 -h 300
# Histogram of G+C content restricted to `proks.complete` (bins of width 1);
# the dashed red vertical line marks 50 on the G+C axis.
ggplot(data = proks.complete, mapping = aes(x = gc)) +
    geom_histogram(binwidth = 1) +
    geom_vline(xintercept = 50, linetype = "dashed", colour = "red", alpha = 0.7) +
    labs(x = "G+C") +
    theme(text = element_text(size = 16))



In [29]:
%%R -w 900 -h 500
# Boxplot of G+C content per taxonomic group, using `proks.complete` only.
# x is the `group` column and y is `gc`, so each axis is labelled accordingly
# (previously the x axis was labelled 'G+C' although it shows the groups).
# The dashed red horizontal line marks 50 on the G+C axis.
ggplot(proks.complete, aes(group, gc)) +
    geom_boxplot() +
    geom_hline(yintercept=50, linetype='dashed', color='red', alpha=0.7) +
    labs(x='Taxonomic group', y='G+C') +
    theme(
        text = element_text(size=16),
        # rotate the group names so the long labels do not overlap
        axis.text.x = element_text(angle=60, hjust=1)
    )