filtering reads
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140331_allBac/amp454-L215-N1k/GC_dist
screen -S read_filter -L bash -c "grinder_readFilter.pl -r ../amp454-L215-N1k_reads.fa > amp454-L215-N1k_reads_filt.fa"
* overlap window = 100bp
Number of reads pre-filtering: 1091000
Number remaining post-filtering: 4105
getting GC distribution (sliding window)
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140331_allBac/amp454-L215-N1k/GC_dist
screen -S GC_dist -L bash -c "GC_dist.pl -r amp454-L215-N1k_reads_filt.fa -g ../../prokBac-CompRep.fna -a -t 12 > prokBac-CompRep_16S_GC-byWindow.txt"
* took ~1 min
In [5]:
%load_ext rmagic
%load_ext rpy2.ipython
In [6]:
%%R
require(ggplot2)
require(data.table)
require(gridExtra)
In [7]:
%%R
tbl = fread("~/notebook/deltaGC_notes/data/amplicon/140331_allBac/amp454-L215-N1k/GC_dist/prokBac-CompRep_16S_GC-byWindow.txt",
header=T)
## setting position around center
tbl[,pos_local:=(pos_local - 10000)]
In [8]:
%%R
# plotting interquartile ranges of values at each relative position
## median & stdev by pos_local
tbl.median = tbl[, median(GC), by="pos_local"]
setkey(tbl.median, pos_local)
tbl.stdev = tbl[, sd(GC), by="pos_local"]
setkey(tbl.stdev, pos_local)
# merging
tbl.sum = merge(tbl.median, tbl.stdev)
colnames(tbl.sum) = c('pos_local', 'median', 'stdev')
# ymin & ymax
tbl.sum$ymin = tbl.sum$median - tbl.sum$stdev
tbl.sum$ymax = tbl.sum$median + tbl.sum$stdev
In [23]:
%%R -w 12 -h 4 -u in
p = ggplot(tbl.sum, aes(x=pos_local, y=median, ymin=ymin, ymax=ymax))+
geom_pointrange() +
labs(y='Median G+C content (%)', x='Nucleotide position relative to the middle of the 16S rRNA gene (bp)') +
theme(
text = element_text(size=16),
axis.title.x = element_text(vjust=0),
axis.title.y = element_text(vjust=1),
axis.text.x = element_text(color='black'),
axis.text.y = element_text(color='black')
)
#ggsave(p,
# file="~/notebook/deltaGC_notes/data/amplicon/140331_allBac/amp454-L215-N1k/GC_dist/prokBac-CompRep_16S_GC-byWindow_sd.pdf",
# width=12, height=4, units='in')
print(p)
#dev.off()
xxx=1
In [35]:
%%R
print(apply(tbl, 2, class))
In [70]:
%%R -w 1000 -h 400
# parsing out select taxa to show GC content
tbl$genome = gsub("\n", " ", tbl$genome)
## unique list of genomes
u.genome = unique(tbl$genome)
## selecting examples
examples = c(
"CP001666 Clostridium ljungdahlii DSM 13528, complete genome.",
"CP004009 Escherichia coli APEC O78, complete genome.",
"AP009493 Streptomyces griseus subsp. griseus NBRC 13350 DNA, complete genome.",
"CP000413 Lactobacillus gasseri ATCC 33323, complete genome."
)
q = paste(examples, collapse="|");
tbl.p = tbl[grep(q, tbl$genome),]
tbl.p$read_num = as.character(tbl.p$read_num)
## editing names
new.names = c(
"C. ljungdahlii",
"E. coli",
"S. griseus",
"L. gasseri"
)
for(i in 1:length(examples)){
tbl.p$genome = gsub(examples[i], new.names[i], tbl.p$genome)
}
## ordering
tbl.p$genome = factor(tbl.p$genome, levels=new.names)
## plotting
p.ex = ggplot(tbl.p, aes(pos_local, GC, group=readID, color=read_num)) +
geom_line(alpha=0.5, size=0.7) +
facet_grid(genome ~ .) +
scale_y_continuous(limits=c(15,85)) +
scale_color_discrete(name='Copy\nnumber') +
labs(y='G+C content (%)', x='Nucleotide position relative to the middle of the 16S rRNA gene (bp)') +
theme(
text = element_text(size=16),
strip.text = element_text(face="italic"),
axis.title.x = element_text(vjust=0),
axis.title.y = element_text(vjust=1),
axis.text.x = element_text(color='black'),
axis.text.y = element_text(color='black')
)
## adding arrows to L gasseri plot
arrow.tbl = data.frame(genome='L. gasseri', readID="1", read_num="1", x=c(-6300, -5000), xend=c(-6300, -5000), y=82, yend=55)
p.ex = p.ex + geom_segment(data=arrow.tbl,
arrow = arrow(length = unit(0.15,"cm")),
aes(x=x, xend=xend, y=y, yend=yend, genome=genome)
)
#ggsave(p.ex,
# file="~/notebook/deltaGC_notes/data/amplicon/140331_allBac/amp454-L215-N1k/GC_dist/prokBac-CompRep_16S_GC-byWindow_Ex.pdf",
# width=12, height=5, units='in')
print(p.ex)
In [74]:
%%R -w 13 -h 9 -u in
# combining the plots
blankPanel = grid.rect(gp=gpar(col="white"))
## making plot
#tiff(file="~/notebook/deltaGC_notes/data/amplicon/140331_allBac/amp454-L215-N1k/GC_dist/prokBac-CompRep_16S_GC-byWindow_comb.tiff",
# width=13, height=9, units='in', res=300)
#svg(file="~/notebook/deltaGC_notes/data/amplicon/140331_allBac/amp454-L215-N1k/GC_dist/prokBac-CompRep_16S_GC-byWindow_comb.svg",
# width=13, height=9)
grid.arrange(arrangeGrob(p, blankPanel, ncol=2, widths=c(8.9,1)), p.ex, ncol=1, heights=c(1.3,2))
#dev.off()
xxx=1
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140411_allBac/amp454-L215-N1k/140428_rrn_GCstats
Getting all rnammer ssu hits
egrep -v "^#" /var/seq_data/ncbi_db/genome/rnammer/bacteria_ssu/*gff | perl -pe 's/:/\t/' | grep -f <(grep ">" ../amp454-L215-N1k_reads.fa | perl -pe 's/.+description="|"//g;s/$/_rrn.gff/' | sort -u) > rnammer_bac-amp454.txt
Calculating stats
$ rnammer_rrn_GCstats.pl -g <(perl -pe 's/_rrn.gff//' rnammer_bac-amp454.txt) -f /var/seq_data/ncbi_db/genome/prok-bac-genomes/ --debug | less
gene min mean median max stdev
5s_rRNA 41.667 57.83 57.522 71.552 5.078
23s_rRNA 22.421 52.703 52.508 66.031 3.029
ITS 9.375 44.256 44.136 73.42 8.816
16s_rRNA 35.178 54.704 54.525 65.911 2.863
Filtering reads
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140403_allArc/amp454-L215-N1k/GC_dist
grinder_readFilter.pl -r ../amp454-L215-N1k_reads.fa > amp454-L215-N1k_reads_filt.fa
* overlap window = 100bp
Number of reads pre-filtering: 101000
Number remaining post-filtering: 175
Getting GC distribution (sliding window)
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140403_allArc/amp454-L215-N1k/GC_dist
GC_dist.pl -r amp454-L215-N1k_reads_filt.fa -g ../../prokArc-CompRep.fna -a -t 12 > prokArc-CompRep_16S_GC-byWindow.txt
In [1]:
%load_ext rmagic
In [2]:
%%R
require(ggplot2)
require(data.table)
In [3]:
%%R
tbl = fread("~/notebook/deltaGC_notes/data/amplicon/140403_allArc/amp454-L215-N1k/GC_dist/prokArc-CompRep_16S_GC-byWindow.txt",
header=T)
## setting position around center
tbl[,pos_local:=(pos_local - 10000)]
In [6]:
%%R -w 900 -h 300
# plotting interquartile ranges of values at each relative position
## median & stdev by pos_local
tbl.median = tbl[, median(GC), by="pos_local"]
setkey(tbl.median, pos_local)
tbl.stdev = tbl[, sd(GC), by="pos_local"]
setkey(tbl.stdev, pos_local)
# merging
tbl.sum = merge(tbl.median, tbl.stdev)
colnames(tbl.sum) = c('pos_local', 'median', 'stdev')
# ymin & ymax
tbl.sum$ymin = tbl.sum$median - tbl.sum$stdev
tbl.sum$ymax = tbl.sum$median + tbl.sum$stdev
# plotting
p = ggplot(tbl.sum, aes(x=pos_local, y=median, ymin=ymin, ymax=ymax))+
geom_pointrange() +
labs(y='Median GC content (%)', x='Nucleotide position relative to the middle of the 16S rRNA gene') +
theme( text = element_text(size=18))
ggsave(file="~/notebook/deltaGC_notes/data/amplicon/140403_allArc/amp454-L215-N1k/GC_dist/prokArc-CompRep_16S_GC-byWindow_sd.tiff",
width=12, height=4, units='in', dpi=300)
print(p)
xxx=1
Getting all rnammer ssu hits
egrep -v "^#" /var/seq_data/ncbi_db/genome/rnammer/archaea_ssu/*gff | perl -pe 's/:/\t/' | grep -f <(grep ">" ../amp454-L215-N1k_reads.fa | perl -pe 's/.+description="|"//g;s/$/_rrn.gff/' | sort -u) > rnammer_bac-amp454.txt
Calculating GC content stats
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140403_allArc/amp454-L215-N1k/140428_rrn_GCstats
$ rnammer_rrn_GCstats.pl -g <(perl -pe 's/_rrn.gff//' rnammer_arc-amp454.txt) -f /var/seq_data/ncbi_db/genome/prok-arc-genomes/ --debug | less
In [ ]: