In [1]:
%load_ext rpy2.ipython
In [5]:
%%R
# Attach the plotting / data-wrangling packages first, then `genomes`.
# NOTE(review): per the startup messages this cell prints, attaching `genomes`
# pulls in RCurl, BiocGenerics, S4Vectors and IRanges, which mask
# tidyr's `complete` and `expand` and dplyr's `combine`, `intersect`,
# `setdiff`, `union`, `rename`, `collapse`, `desc` and `slice`.
# Qualify those names (e.g. dplyr::rename) if they are needed later.
library(ggplot2)
library(dplyr)
library(tidyr)
library(genomes)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: XML
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning:
Attaching package: ‘XML’
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tools’:
toHTML
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: RCurl
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: bitops
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning:
Attaching package: ‘RCurl’
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tidyr’:
complete
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: GenomicRanges
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: BiocGenerics
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: parallel
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning:
Attaching package: ‘BiocGenerics’
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:stats’:
xtabs
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
do.call, duplicated, eval, evalq, Filter, Find, get, intersect,
is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax,
pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int,
rownames, sapply, setdiff, sort, table, tapply, union, unique,
unlist, unsplit
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: S4Vectors
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: stats4
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning:
Attaching package: ‘S4Vectors’
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:dplyr’:
rename
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: IRanges
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning:
Attaching package: ‘IRanges’
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:tidyr’:
expand
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: GenomeInfoDb
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: Biostrings
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: XVector
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning:
Attaching package: ‘genomes’
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following object is masked from ‘package:GenomeInfoDb’:
species
res = super(Function, self).__call__(*new_args, **new_kwargs)
In [10]:
%%R
# Load the `proks` dataset into the R session.
data("proks")
# Print its summary; this is the cell's last expression, so it is displayed.
summary(proks)
$`Total genomes`
[1] 27570 genome projects on Sep 04, 2014
$`By status`
Total
Contig 13074
Scaffold 10718
Gapless Chromosome 3053
Chromosome 373
Chromosome with gaps 343
Complete 9
$`Recent submissions`
released name status
1 2014-09-02 Altuibacter lentus Scaffold
2 2014-09-02 Bacillus cereus ATCC 4342 Scaffold
3 2014-09-02 Bacillus licheniformis Scaffold
4 2014-09-02 Bacillus megaterium Scaffold
5 2014-09-02 Paenibacillus macerans Scaffold
In [11]:
%%R
# Refresh the `proks` table. The call's messages report project IDs being
# added and removed, and the summary in the next cell shows a newer date and
# a larger total, so `proks` is changed even though nothing is assigned here.
# NOTE(review): presumably this fetches the current listing from a remote
# source, so results depend on the date the cell is run -- confirm, and
# consider caching the updated table for reproducibility.
update(proks)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: proks has been successfully updated
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 29217 new project IDs added
res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 235 old project IDs removed
res = super(Function, self).__call__(*new_args, **new_kwargs)
In [12]:
%%R
# Summarise `proks` again so the totals and status breakdown can be compared
# with the summary printed right after data(proks).
summary(proks)
$`Total genomes`
[1] 57093 genome projects on Jan 06, 2016
$`By status`
Total
Contig 34539
Scaffold 17035
Complete Genome 4649
Chromosome 870
$`Recent submissions`
released name status
1 2015-12-30 Listeria monocytogenes Complete Genome
2 2015-12-30 Listeria monocytogenes Complete Genome
3 2015-12-30 Listeria monocytogenes Complete Genome
4 2015-12-30 Tenacibaculum dicentrarchi Complete Genome
5 2015-12-29 Acinetobacter johnsonii Contig
In [23]:
%%R -w 600 -h 300
# Histogram of the `gc` column over every entry in `proks` (bin width 1).
# The dashed red vertical line marks gc = 50 as a visual reference.
ggplot(data = proks, mapping = aes(x = gc)) +
    geom_histogram(binwidth = 1) +
    geom_vline(xintercept = 50, colour = 'red', linetype = 'dashed', alpha = 0.7) +
    xlab('G+C') +
    theme(text = element_text(size = 16))
In [17]:
%%R
# Keep only the rows whose status is exactly 'Complete Genome'.
# `proks` is converted to a data frame before filtering
# (presumably to drop its package-specific class -- confirm).
proks.complete <- filter(as.data.frame(proks), status == 'Complete Genome')

# Preview the first rows of the subset.
head(proks.complete)
pid name status released
1 12997 Acaryochloris marina MBIC11017 Complete Genome 2007-10-16
2 60713 Acetobacterium woodii DSM 1030 Complete Genome 2012-02-14
3 242487 Acetobacter pasteurianus Complete Genome 2015-07-21
4 214045 Acetobacter pasteurianus 386B Complete Genome 2013-08-01
5 31129 Acetobacter pasteurianus IFO 3283-01 Complete Genome 2009-08-27
6 31141 Acetobacter pasteurianus IFO 3283-01-42C Complete Genome 2009-08-27
taxid bioproject group subgroup size gc
1 329726 PRJNA12997 Cyanobacteria Oscillatoriophycideae 8.36160 46.9889
2 931626 PRJNA60713 Firmicutes Clostridia 4.04478 39.3000
3 438 PRJNA242487 Proteobacteria Alphaproteobacteria 2.80615 53.3000
4 1266844 PRJEB1172 Proteobacteria Alphaproteobacteria 3.07865 52.8532
5 634452 PRJDA31129 Proteobacteria Alphaproteobacteria 3.34025 53.0701
6 634458 PRJDA31141 Proteobacteria Alphaproteobacteria 3.24799 53.1587
refseq insdc
1 NC_009925.1 CP000828.1
2 NC_016894.1 CP002987.1
3 NZ_CP012111.1 CP012111.1
4 NC_021991.1 HF677570.1
5 NC_013209.1 AP011121.1
6 NC_017150.1 AP011163.1
plasmid.refseq
1 NC_009926.1,NC_009929.1,NC_009927.1,NC_009928.1,NC_009930.1,NC_009934.1,NC_009931.1,NC_009933.1,NC_009932.1
2 <NA>
3 <NA>
4 NC_021992.1,NC_021976.1,NC_021993.1,NC_021978.1,NC_021979.1,NZ_HF677572.1,NC_021977.1
5 NC_013210.1,NC_013212.1,NC_013213.1,NC_013214.1,NC_013211.1,NC_013215.1
6 NC_017105.1,NC_017104.1,NC_017151.1,NC_017107.1,NC_017106.1,NC_017152.1
plasmid.insdc
1 CP000838.1,CP000841.1,CP000839.1,CP000840.1,CP000842.1,CP000846.1,CP000843.1,CP000845.1,CP000844.1
2 <NA>
3 <NA>
4 HF677573.1,HF677571.1,HF677575.1,HF677576.1,HF677577.1,HF677572.1,HF677574.1
5 AP011122.1,AP011124.1,AP011125.1,AP011126.1,AP011123.1,AP011127.1
6 AP011165.1,AP011164.1,AP011166.1,AP011168.1,AP011167.1,AP011169.1
wgs scaffolds genes proteins modified
1 <NA> 10 7469 7187 2015-07-30
2 <NA> 1 3649 3521 2015-08-18
3 <NA> 1 2688 2535 2015-12-09
4 <NA> 8 2866 2740 2015-08-18
5 <NA> 7 3148 3019 2015-08-19
6 <NA> 7 3070 2955 2015-08-19
center biosample
1 Washington University SAMN02604308
2 Georg-August-University Goettingen SAMN02603267
3 Zhejiang Gongshang University SAMN02709032
4 Vrije Universiteit Brussel SAMEA3139047
5 Acetobacter pasteurianus genome sequencing consortium <NA>
6 Acetobacter pasteurianus genome sequencing consortium <NA>
assembly reference ftp pubmed
1 GCA_000018105.1 REFR NA 18252824
2 GCA_000247605.1 REFR NA 22479398
3 GCA_001183745.1 <NA> NA <NA>
4 GCA_000723785.1 <NA> NA 23902333
5 GCA_000010825.1 REFR NA 19638423
6 GCA_000010945.1 <NA> NA 19638423
In [21]:
%%R -w 600 -h 300
# Same G+C histogram as for the full table, restricted to `proks.complete`
# (entries with status 'Complete Genome'). Bin width 1; dashed red line at 50.
proks.complete %>%
    ggplot(aes(x = gc)) +
    geom_histogram(binwidth = 1) +
    geom_vline(
        xintercept = 50,
        linetype = 'dashed',
        colour = 'red',
        alpha = 0.7
    ) +
    xlab('G+C') +
    theme(text = element_text(size = 16))
In [29]:
%%R -w 900 -h 500
# Boxplots of G+C for the complete genomes, one box per value of the `group`
# column (e.g. Proteobacteria, Firmicutes). The dashed red horizontal line
# marks gc = 50 as a visual reference.
ggplot(proks.complete, aes(x = group, y = gc)) +
    geom_boxplot() +
    geom_hline(yintercept = 50, linetype = 'dashed', color = 'red', alpha = 0.7) +
    # G+C is mapped to y here; the x axis carries the group names, so label
    # each axis accordingly (the original put 'G+C' on the x axis).
    labs(x = 'Group', y = 'G+C') +
    theme(
        text = element_text(size = 16),
        # Rotate the group names so the long labels do not overlap.
        axis.text.x = element_text(angle = 60, hjust = 1)
    )
Content source: nick-youngblut/SIPSim
Similar notebooks: