workflow
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140411_allBac/amp454-L215-N1k
Grinder run
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140409_allBac/amp454-L215-N1k
mkdir read_files
for file in `find ~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps/ -name "*fasta" | perl -pe 's/.+\///'`
do
grinder -reference_file ~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps/$file \
-abundance_model uniform -forward_reverse 515Fm-927Rm.fna -rd 215 normal 0 \
-tr 1000 -copy_bias 0 -length_bias 0 -unidirectional 1 -bn read_files/$file
done
* had to use for loop because some genomes did not produce any amplicons
* 1092 of the 1163 genomes had amplicons
* all have 1000 reads
Combining read files
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140409_allBac/amp454-L215-N1k
cat $(find read_files -name "*fa" | grep -v "Azospirillum_brasilense_Sp245.fasta-reads.fa") | seqIO.pl -i fasta -o fasta > amp454-L215-N1k_reads.fa
* 1091000 sequences extracted
* seqIO.pl used to make line wrapping consistent
Grinder run
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140409_allBac/mg454-L215-N1k
mkdir read_files
screen -S grinder_batch -L bash -c \
"find /home/nick/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps/ -name '*fasta' | \
perl -pe 's/.+\///' | xargs -n 1 -I % -P 28 \
grinder -reference_file /home/nick/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps/% \
-total_reads 1000 -bn read_files/% -read_dist 215 -length_bias 0 -unidirectional 1"
* 1162 genomes produced shotgun reads
* all have 1000 reads
Combining read files
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140409_allBac/mg454-L215-N10k
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140411_allBac/mg454-L215-N10k
grinder_descByFileName.pl ../../140409_allBac/mg454-L215-N10k/read_files/ > mg454-L215-N10k_reads.fa
* combining read files
* correcting multiple chromosome problem; description is now the same for each taxon
grep -f <(find ~/notebook/deltaGC_notes/data/amplicon/140409_allBac/amp454-L215-N1k/read_files/ -name "*fa" | perl -pe 's/.+\/|\.fasta-read.+//g; s/\*/\\*/') mg454-L215-N10k_reads.fa | fasta_selectSeqs.pl -f mg454-L215-N10k_reads.fa -l - | seqIO.pl -i fasta -o fasta > mg454-L215-N10k_reads_e.fa
* 10910000 sequences
* 10000 sequences per genome
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140411_allBac/amp454-L215-N1k
deltaGC.pl -a -g /home/nick/notebook/deltaGC_notes/data/genome_data/prokBac-CompRep.fna -r amp454-L215-N1k_reads.fa -range 4000-20000 -mean 15000 -sd f -t 12 > skewnorm_m15k_r4k-20k.txt
deltaGC.pl -a -g /home/nick/notebook/deltaGC_notes/data/genome_data/prokBac-CompRep.fna -r amp454-L215-N1k_reads.fa -range 4000-11500 -mean 10000 -sd f -df 50 50 -t 12 -i > skewnorm_m10k_r4k-11.5k.txt
binning by buoyant density
deltaGCBin.pl skewnorm_m15k_r4k-20k.txt > skewnorm_m15k_r4k-20k_bin.txt
deltaGCBin.pl skewnorm_m10k_r4k-11.5k.txt > skewnorm_m10k_r4k-11.5k_bin.txt
In [1]:
%load_ext rmagic
%load_ext rpy2.ipython
In [ ]:
In [ ]:
In [30]:
%%R
require(ggplot2)
require(reshape)
require(data.table)
require(gridExtra)
require(colorspace)
In [3]:
%%R
# loading amplicon data
base.dir = "~/notebook/deltaGC_notes/data/amplicon/140411_allBac/amp454-L215-N1k";
files = c(
'skewnorm_m15k_r4k-20k',
'skewnorm_m10k_r4k-11.5k'
)
tbl.l = list()
for(i in files){
in.file = paste(c(i, "txt"), collapse=".")
in.file = paste(c(base.dir, in.file), collapse="/")
tbl.l[[i]] = fread(in.file, header=T, sep="\t")
tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
}
tbl.bin.l = list()
for(i in files){
in.file = paste(c(i, "bin.txt"), collapse="_")
in.file = paste(c(base.dir, in.file), collapse="/")
tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
}
# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps_GC/prok-bac-genomes_Ngaps_GC.txt",
header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]
## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )
## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]
## merging tables
for(i in names(tbl.bin.l)){
tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}
## merging lists
tbl.amp = do.call(rbind, tbl.l)
tbl.amp$data = rep('amplicon\nfragments', nrow(tbl.amp))
tbl.l = list()
tbl.bin.amp = do.call(rbind, tbl.bin.l)
tbl.bin.amp$data = rep('amplicon\nfragments', nrow(tbl.bin.amp))
tbl.bin.l = list()
In [4]:
%%R
# loading shotgun read data
base.dir = "~/notebook/deltaGC_notes/data/mg-reads/140411_allBac/mg454-L215-N10k";
files = list(
'skewnorm_m15k_r4k-20k',
'skewnorm_m10k_r4k-11.5k'
)
tbl.l = list()
for(i in files){
in.file = paste(c(i, "txt"), collapse=".")
in.file = paste(c(base.dir, in.file), collapse="/")
tbl.l[[i]] = fread(in.file, header=T, sep="\t")
tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
}
tbl.bin.l = list()
for(i in files){
in.file = paste(c(i, "bin.txt"), collapse="_")
in.file = paste(c(base.dir, in.file), collapse="/")
tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
}
# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps_GC/prok-bac-genomes_Ngaps_GC.txt",
header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]
## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )
## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]
## merging tables
for(i in names(tbl.bin.l)){
tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}
## merging lists
tbl.mg = do.call(rbind, tbl.l)
tbl.mg$data = rep('shotgun\nfragments', nrow(tbl.mg))
tbl.l = list()
tbl.bin.mg = do.call(rbind, tbl.bin.l)
tbl.bin.mg$data = rep('shotgun\nfragments', nrow(tbl.bin.mg))
tbl.bin.l = list()
In [5]:
%%R
## merging amp & mg tables
tbl = rbind(tbl.amp, tbl.mg)
tbl.amp = vector() # clearing memory
tbl.mg = vector()
tbl.bin = rbind(tbl.bin.amp, tbl.bin.mg)
tbl.bin.amp = vector()
tbl.bin.mg = vector()
In [6]:
%%R
# editing $file names
tbl[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]
tbl.bin[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl.bin[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]
In [7]:
%%R
# simulating label incorporation by moving fragment buoyant density
## 100% 15N
tbl[, label_inc_15N := fragment_buoyant_density + 0.016]
## 100% 13C
tbl[, label_inc_13C := fragment_buoyant_density + 0.036]
In [8]:
%%R
# parsing out rows & columns needed for fragment density plot
tbl.p = subset(tbl, file=="Fragment size\ndistribution: 12.5kb")
tbl.p = tbl.p[, c('data','fragment_buoyant_density', 'label_inc_15N', 'label_inc_13C'), with=F]
## editing data column
tbl.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
## melting
tbl.m = melt(tbl.p, id.vars=c('data'))
In [9]:
%%R -w 170 -h 300
## making the density plots for amplicons & shotgun sequences
make_density_plot <- function(tbl, x.axis=TRUE){
p = ggplot(tbl, aes(value, group=variable, fill=variable,
alpha=variable, linetype=variable)) +
geom_density() +
scale_linetype_manual(values=c('solid', 'dashed', 'dashed')) +
scale_fill_manual(values=c('blue', 'red', 'green')) +
scale_alpha_manual(values=c(0.7, 0.15, 0.15)) +
scale_x_reverse(limits=c(1.77,1.66)) +
coord_flip() +
facet_grid(data ~ .) +
labs(y='Kernel density') +
theme(
text = element_text(size=22),
axis.title.y = element_blank(),
axis.title.x = element_text( vjust=1 ),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
strip.text.y = element_text(size=22),
legend.position = 'none'
)
# removing x axis title if needed
if(x.axis==FALSE){ # no x-axis, top plot
p = p + theme( axis.title.x = element_blank() )
p = p + theme( plot.margin = unit(c(1,1,0,0), "lines") ) # top, right, bottom, left
} else {
p = p + theme( plot.margin = unit(c(0,1,1,0), "lines") ) # top, right, bottom, left
}
return(p)
}
### amplicon
p.dense.amp = make_density_plot( tbl.m[tbl.m$data=='amplicon-fragments', ], x.axis=FALSE)
#print(p.dense.amp)
### shotgun
p.dense.shotgun = make_density_plot( tbl.m[tbl.m$data=='shotgun-fragments', ] )
#print(p.dense.shotgun)
xxx=1
In [10]:
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin, file=="Fragment size\ndistribution: 12.5kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome_GC_rank', 'bin_min', 'frag_count'), with=F]
## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
In [31]:
%%R -h 250 -w 900
# making heatmaps of buoyant density distributions
make_density_heatmap <- function(tbl, x.axis=TRUE){
## heatmap colors: black only for lowest value
cols = c(
rep("#000000", 1),
rep("darkblue", 99),
rep("#1F1FFF", 100),
rep("#3F3FFF", 100),
rep("#5F5FFF", 100),
rep("#7F7FFF", 100),
rep("#9F9FFF", 100),
rep("#BFBFFF", 100),
rep("#DFDFFF", 100),
rep("#FFFFFF", 100)
)
## black and white
cols = c(
rep("#FFFFFF", 1),
rep("#E6E6E6", 99),
rep("#D9D9D9", 100),
rep("#CCCCCC", 100),
rep("#BEBEBE", 100),
rep("#AEAEAE", 100),
rep("#9D9D9D", 100),
rep("#888888", 100),
rep("#6F6F6F", 100),
rep("#4D4D4D", 100)
)
# rev(heat_hcl)
cols=c(
rep("#FFFFFF", 1),
rep("#E5D961", 99),
rep("#E8C842", 100),
rep("#E9B62D", 100),
rep("#EAA428", 100),
rep("#E89132", 100),
rep("#E57E41", 100),
rep("#E06B50", 100),
rep("#DA565E", 100),
rep("#D33F6A", 100)
)
cols = c('white', rev(heat_hcl(99)) )
## heatmap plotting
p.heat = ggplot(tbl, aes(genome_GC_rank, bin_min, fill=frag_count)) +
geom_tile() +
scale_alpha_continuous( guide=FALSE ) +
scale_fill_gradientn( colours=cols, name='Count' ) +
scale_x_continuous( expand = c(0,0) ) +
scale_y_reverse( expand = c(0,0), limits=c(1.77,1.66) ) +
labs( x='Genome (ordered by genome G+C content)', y='Fragment buoyant density') +
#facet_grid(data ~ .) +
theme(
text = element_text(size=22),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
legend.justification=c(0,0),
legend.position=c(0,0),
legend.text = element_text(size=14),
plot.margin = unit(c(0,0,0,1), "lines") # top, right, bottom, left
)
# removing x axis title if needed
if(x.axis==FALSE){ # no x-axis; top plot
p.heat = p.heat + theme( axis.title.x = element_blank() )
p.heat = p.heat + theme( plot.margin = unit(c(1,0,0,1), "lines") ) # top, right, bottom, left
} else {
p.heat = p.heat + theme( plot.margin = unit(c(0,0,1,1), "lines") ) # top, right, bottom, left
}
return(p.heat)
}
### amplicon
p.heat.amp = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
#print(p.heat.amp)
### shotgun
p.heat.shotgun = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ] )
#print(p.heat.shotgun)
xxx=1
In [ ]:
%%R -h 8 -w 16 -u in
# combined plots
## combined y axis
y.axis.t = expression(paste('Fragment buoyant density (g ', ml^-1, ')'))
y.axis.t = textGrob(y.axis.t, vjust = 1, rot=90, gp=gpar(cex = 1.9))
## plotting
tiff( "~/notebook/deltaGC_notes/data/amp-mg/figures/dense-htmp_12.5kb/dense-htmp_12.5kb.tiff",
width=17, height=8, units='in', res=300 )
grid.arrange(p.heat.amp, p.dense.amp, p.heat.shotgun, p.dense.shotgun,
nrow=2, ncol=2, widths=c(7,1), left = y.axis.t)
dev.off()
xxx=1
In [ ]:
In [13]:
%%R
# parsing out rows & columns needed for fragment density plot
tbl.p = subset(tbl, file=="Fragment size\ndistribution: 6kb")
tbl.p = tbl.p[, c('data','fragment_buoyant_density', 'label_inc_15N', 'label_inc_13C'), with=F]
## editing data column
tbl.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
## melting
tbl.m = melt(tbl.p, id.vars=c('data'))
In [14]:
%%R -w 170 -h 300
# making density plots
### amplicon
p.dense.amp = make_density_plot( tbl.m[tbl.m$data=='amplicon-fragments', ], x.axis=FALSE)
print(p.dense.amp)
### shotgun
p.dense.shotgun = make_density_plot( tbl.m[tbl.m$data=='shotgun-fragments', ] )
print(p.dense.shotgun)
In [15]:
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin, file=="Fragment size\ndistribution: 6kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome_GC_rank', 'bin_min', 'frag_count'), with=F]
## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
In [16]:
%%R -h 4 -w 16 -u in
# making density heatmaps
### amplicon
p.heat.amp = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
print(p.heat.amp)
### shotgun
p.heat.shotgun = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ] )
print(p.heat.shotgun)
In [17]:
%%R -h 8 -w 16 -u in
# combined plots
## combined y axis
y.axis.t = expression(paste('Fragment buoyant density (g ', ml^-1, ')'))
y.axis.t = textGrob(y.axis.t, vjust = 1, rot=90, gp=gpar(cex = 1.75))
## plotting
#tiff( "~/notebook/deltaGC_notes/data/amp-mg/figures/dense-htmp_6kb/dense-htmp_6kb.tiff", width=17, height=8, units='in', res=300 )
grid.arrange(p.heat.amp, p.dense.amp, p.heat.shotgun, p.dense.shotgun,
nrow=2, ncol=2, widths=c(7,1), left = y.axis.t)
#dev.off()
xxx=1
In [222]:
%%R
# simulating label incorporation by moving fragment buoyant density
tbl[, label_inc_no := fragment_buoyant_density]
tbl[, label_inc_N := fragment_buoyant_density + 0.016]
tbl[, label_inc_C := fragment_buoyant_density + 0.036]
# melting table
require(reshape)
tbl.m = melt(tbl[, c('file', 'data', 'label_inc_no', 'label_inc_N', 'label_inc_C'), with=F],
id.vars=c('file', 'data'))
In [223]:
%%R
# making histograms for each fragment size
## getting unique file|data categories
u.cat = as.matrix(unique(tbl[, c('file','data'), with=F]))
breaks = seq(1.660, 1.781, 0.001)
hist.l = list(list(list()))
for(i in 1:nrow(u.cat)){
r = u.cat[i,]
tmp = subset(tbl, file == r[1] & data == r[2] )
hist.l[[r[1]]][[r[2]]][['no']] = hist( tmp$label_inc_no, breaks=breaks )
hist.l[[r[1]]][[r[2]]][['N']] = hist( tmp$label_inc_N, breaks=breaks )
hist.l[[r[1]]][[r[2]]][['C']] = hist( tmp$label_inc_C, breaks=breaks )
}
#print(hist.l)
xxx=1
In [224]:
%%R
# calculating intersect of histograms
require(HistogramTools)
for(file in names(hist.l)){
if(file == ''){next}
for(data in names(hist.l[[file]])){
if(data == ''){next}
hist.no = hist.l[[file]][[data]][['no']]
hist.N = hist.l[[file]][[data]][['N']]
hist.C = hist.l[[file]][[data]][['C']]
no.N = sum(IntersectHistograms(hist.no, hist.N)$counts) / sum(hist.no$counts) * 100
no.C = sum(IntersectHistograms(hist.no, hist.C)$counts) / sum(hist.no$counts) * 100
print(c(file, data, no.N, no.C))
}
}
In [226]:
%%R -w 7 -h 4 -u in
# q5-q95 range
tbl[, q05.q95 := quantile(fragment_buoyant_density, 0.95) - quantile(fragment_buoyant_density,0.05), by='file,data,genome']
# dereplicating by file,data,genome,q05.q95
tbl.s = tbl[, sum(1), by='file,data,genome,q05.q95']
# plotting
p = ggplot( tbl.s, aes(data, q05.q95) ) +
geom_boxplot() +
labs(x='fragment simulation', y='Buoyant density range\n(95% of all fragments)') +
facet_grid(. ~ file) +
theme(
text = element_text(size=16),
axis.title.x = element_blank()
)
#ggsave("~/notebook/deltaGC_notes/data/amp-mg/figures/BD-95range_bac_boxplot.pdf")
print(p)
@ system76-server:~/notebook/deltaGC/data/amplicon/140416_allArc/amp454-L215-N1k
mkdir read_files
for file in `find ~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps -name "*fasta" | perl -pe 's/.+\///'`
do
grinder -reference_file ~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps/$file -abundance_model uniform -forward_reverse ../515Fm-927Rm.fna -rd 215 normal 0 -tr 1000 -copy_bias 0 -length_bias 0 -unidirectional 1 -bn read_files/$file
done
* had to use for loop because some genomes did not produce any amplicons
* 101 of the 119 genomes had amplicons
* all have 1000 reads
Combining read files
@ system76-server:~/notebook/deltaGC/data/amplicon/140416_allArc/amp454-L215-N1k
grinder_descByFileName.pl ./read_files/ | seqIO.pl -i fasta -o fasta > amp454-L215-N1k_reads.fa
* seqIO.pl used to make line wrapping consistent
* grinder_descByFileName.pl corrects for any multi-chromosome issues
* reads for 101 genomes
* 101000 total reads
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140416_allArc/amp454-L215-N1k
ln -s ~/notebook/deltaGC_notes/data/genome_data/prokArc-CompRep.fna ../
deltaGC.pl -a -g ../prokArc-CompRep.fna -r amp454-L215-N1k_reads.fa -range 4000-20000 -mean 15000 -sd f -t 12 > skewnorm_m15k_r4k-20k.txt
deltaGC.pl -a -g ../prokArc-CompRep.fna -r amp454-L215-N1k_reads.fa -range 4000-11500 -mean 10000 -sd f -df 50 50 -t 12 -i > skewnorm_m10k_r4k-11.5k.txt
binning by buoyant density
deltaGCBin.pl -f 10 skewnorm_m15k_r4k-20k.txt > skewnorm_m15k_r4k-20k_bin.txt
deltaGCBin.pl -f 12 skewnorm_m10k_r4k-11.5k.txt > skewnorm_m10k_r4k-11.5k_bin.txt
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140417_allArc/mg454-L215-N10k
mkdir read_files
screen -S grinder_batch -L bash -c \
"find /home/nick/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps/ -name '*fasta' | \
perl -pe 's/.+\///' | xargs -n 1 -I % -P 32 \
grinder -reference_file /home/nick/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps/% \
-total_reads 10000 -bn read_files/% -read_dist 215 -length_bias 0 -unidirectional 1"
* 119 genomes produced shotgun reads
* all have 10000 reads
Combining read files
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140417_allBac/mg454-L215-N10k
just using genomes that amplified with 515F and 927R primers (consistent with 16S amplicon simulations)
grinder_descByFileName.pl ./read_files/ > mg454-L215-N10k_reads.fa
* combining read files
* correcting multiple chromosome problem; description is now the same for each taxon
grep -f <(find ~/notebook/deltaGC_notes/data/amplicon/140416_allArc/amp454-L215-N1k/read_files/ -name "*fa" | perl -pe 's/.+\/|.fasta-read.+//g') mg454-L215-N10k_reads.fa | \ fasta_selectSeqs.pl -f mg454-L215-N10k_reads.fa -l - | seqIO.pl -i fasta -o fasta > mg454-L215-N10k_reads_e.fa
* 1010000 sequences
* 10000 sequences per genome
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140417_allBac/mg454-L215-N10k
deltaGC.pl -g /home/nick/notebook/deltaGC_notes/data/genome_data/prokArc-CompRep.fna -r mg454-L215-N10k_reads_e.fa -range 4000-20000 -mean 15000 -sd f -t 12 -p 1 > skewnorm_m15k_r4k-20k.txt
deltaGC.pl -g /home/nick/notebook/deltaGC_notes/data/genome_data/prokArc-CompRep.fna -r mg454-L215-N10k_reads_e.fa -range 4000-11500 -mean 10000 -sd f -df 50 50 -t 12 -i -p 1 > skewnorm_m10k_r4k-11.5k.txt
binning by buoyant density (for heatmap plotting)
deltaGCBin.pl -f 8 skewnorm_m15k_r4k-20k.txt > skewnorm_m15k_r4k-20k_bin.txt
deltaGCBin.pl -f 8 skewnorm_m10k_r4k-11.5k.txt > skewnorm_m10k_r4k-11.5k_bin.txt
In [17]:
%%R
base.dir = "~/notebook/deltaGC_notes/data/amplicon/140416_allArc/amp454-L215-N1k";
files = c(
'skewnorm_m15k_r4k-20k',
'skewnorm_m10k_r4k-11.5k'
)
tbl.l = list()
for(i in files){
in.file = paste(c(i, "txt"), collapse=".")
in.file = paste(c(base.dir, in.file), collapse="/")
tbl.l[[i]] = fread(in.file, header=T, sep="\t")
tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
}
tbl.bin.l = list()
for(i in files){
in.file = paste(c(i, "bin.txt"), collapse="_")
in.file = paste(c(base.dir, in.file), collapse="/")
tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
}
# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps_GC/prok-arc-genomes_Ngaps_GC.txt",
header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]
## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )
## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]
## merging tables
for(i in names(tbl.bin.l)){
tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}
## merging lists
tbl.amp = do.call(rbind, tbl.l)
tbl.amp$data = rep('amplicon\nfragments', nrow(tbl.amp))
tbl.l = list()
tbl.bin.amp = do.call(rbind, tbl.bin.l)
tbl.bin.amp$data = rep('amplicon\nfragments', nrow(tbl.bin.amp))
tbl.bin.l = list()
print(tbl.bin.amp)
In [18]:
%%R
base.dir = "~/notebook/deltaGC_notes/data/mg-reads/140417_allArc/mg454-L215-N10k";
files = list(
'skewnorm_m15k_r4k-20k',
'skewnorm_m10k_r4k-11.5k'
)
tbl.l = list()
for(i in files){
in.file = paste(c(i, "txt"), collapse=".")
in.file = paste(c(base.dir, in.file), collapse="/")
tbl.l[[i]] = fread(in.file, header=T, sep="\t")
tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
}
tbl.bin.l = list()
for(i in files){
in.file = paste(c(i, "bin.txt"), collapse="_")
in.file = paste(c(base.dir, in.file), collapse="/")
tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
}
# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps_GC/prok-arc-genomes_Ngaps_GC.txt",
header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]
## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )
## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]
## merging tables
for(i in names(tbl.bin.l)){
tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}
## merging lists
tbl.mg = do.call(rbind, tbl.l)
tbl.mg$data = rep('shotgun\nfragments', nrow(tbl.mg))
tbl.l = list()
tbl.bin.mg = do.call(rbind, tbl.bin.l)
tbl.bin.mg$data = rep('shotgun\nfragments', nrow(tbl.bin.mg))
tbl.bin.l = list()
print(tbl.bin.mg)
In [19]:
%%R
## merging amp & mg tables
tbl = rbind(tbl.amp, tbl.mg)
tbl.amp = vector()
tbl.mg = vector()
tbl.bin = rbind(tbl.bin.amp, tbl.bin.mg)
tbl.bin.amp = vector()
tbl.bin.mg = vector()
In [20]:
%%R
# editing $file names
tbl[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]
tbl.bin[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl.bin[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]
In [22]:
%%R
# simulating label incorporation by moving fragment buoyant density
## 100% 15N
tbl[, label_inc_15N := fragment_buoyant_density + 0.016]
## 100% 13C
tbl[, label_inc_13C := fragment_buoyant_density + 0.036]
In [23]:
%%R
# parsing out rows & columns needed for fragment density plot
tbl.p = subset(tbl, file=="Fragment size\ndistribution: 12.5kb")
tbl.p = tbl.p[, c('data','fragment_buoyant_density', 'label_inc_15N', 'label_inc_13C'), with=F]
## editing data column
tbl.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
## melting
tbl.m = melt(tbl.p, id.vars=c('data'))
In [24]:
%%R -w 170 -h 300
## making the density plots for amplicons & shotgun sequences
make_density_plot <- function(tbl, x.axis=TRUE){
p = ggplot(tbl, aes(value, group=variable, fill=variable,
alpha=variable, linetype=variable)) +
geom_density() +
scale_linetype_manual(values=c('solid', 'dashed', 'dashed')) +
scale_fill_manual(values=c('blue', 'red', 'green')) +
scale_alpha_manual(values=c(0.7, 0.15, 0.15)) +
scale_x_reverse(limits=c(1.77,1.66)) +
coord_flip() +
facet_grid(data ~ .) +
labs(y='Kernel density') +
theme(
text = element_text(size=18),
axis.title.y = element_blank(),
axis.title.x = element_text( vjust=1 ),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
strip.text.y = element_text(size=18),
legend.position = 'none'
)
# removing x axis title if needed
if(x.axis==FALSE){ # no x-axis, top plot
p = p + theme( axis.title.x = element_blank() )
p = p + theme( plot.margin = unit(c(1,1,0,0), "lines") ) # top, right, bottom, left
p = p + labs(title="")
p = p + theme( title = element_text(size=16))
} else {
p = p + theme( plot.margin = unit(c(0,1,1,0), "lines") ) # top, right, bottom, left
}
return(p)
}
### amplicon
p.dense.amp.12.5 = make_density_plot( tbl.m[tbl.m$data=='amplicon-fragments', ], x.axis=FALSE)
print(p.dense.amp.12.5)
### shotgun
p.dense.shotgun.12.5 = make_density_plot( tbl.m[tbl.m$data=='shotgun-fragments', ] )
print(p.dense.shotgun.12.5)
In [25]:
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin, file=="Fragment size\ndistribution: 12.5kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome_GC_rank', 'bin_min', 'frag_count', 'file'), with=F]
tbl.bin.p[, file := "Fragment size distribution: 12.5kb"]
## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
In [26]:
%%R -h 250 -w 900
# making heatmaps of buoyant density distributions
make_density_heatmap <- function(tbl, x.axis=TRUE){
## heatmap colors: black only for lowest value
cols = c(
rep("#000000", 1),
rep("darkblue", 99),
rep("#1F1FFF", 100),
rep("#3F3FFF", 100),
rep("#5F5FFF", 100),
rep("#7F7FFF", 100),
rep("#9F9FFF", 100),
rep("#BFBFFF", 100),
rep("#DFDFFF", 100),
rep("#FFFFFF", 100)
)
# plot title
plot.title = unique(tbl$file)
## heatmap plotting
p.heat = ggplot(tbl, aes(genome_GC_rank, bin_min, fill=frag_count)) +
geom_tile() +
scale_alpha_continuous( guide=FALSE ) +
scale_fill_gradientn( colours=cols, name='Count' ) +
scale_x_continuous( expand = c(0,0) ) +
scale_y_reverse( expand = c(0,0), limits=c(1.77,1.66) ) +
labs( x='Genome (ordered by genome G+C content)', y='Fragment buoyant density') +
theme(
text = element_text(size=18),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
legend.justification=c(0,0),
legend.position=c(0,0),
legend.text = element_text(size=14),
plot.margin = unit(c(0,0,0,1), "lines") # top, right, bottom, left
)
# removing x axis title if needed
if(x.axis==FALSE){ # no x-axis; top plot
p.heat = p.heat + theme( axis.title.x = element_blank() )
p.heat = p.heat + theme( plot.margin = unit(c(1,0,0,1), "lines") ) # top, right, bottom, left
p.heat = p.heat + labs( title = plot.title )
p.heat = p.heat + theme( title = element_text( vjust=1, hjust=0.75, size=16) )
} else {
p.heat = p.heat + theme( plot.margin = unit(c(0,0,1,1), "lines") ) # top, right, bottom, left
}
return(p.heat)
}
### amplicon
p.heat.amp.12.5 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
print(p.heat.amp.12.5)
### shotgun
p.heat.shotgun.12.5 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ] )
print(p.heat.shotgun.12.5)
In [237]:
%%R
# parsing out rows & columns needed for fragment density plot
tbl.p = subset(tbl, file=="Fragment size\ndistribution: 6kb")
tbl.p = tbl.p[, c('data','fragment_buoyant_density', 'label_inc_15N', 'label_inc_13C'), with=F]
## editing data column
tbl.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
## melting
tbl.m = melt(tbl.p, id.vars=c('data'))
In [238]:
%%R -w 170 -h 300
# making density plots
### amplicon
p.dense.amp.6 = make_density_plot( tbl.m[tbl.m$data=='amplicon-fragments', ], x.axis=FALSE)
print(p.dense.amp.6)
### shotgun
p.dense.shotgun.6 = make_density_plot( tbl.m[tbl.m$data=='shotgun-fragments', ] )
print(p.dense.shotgun.6)
In [239]:
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin, file=="Fragment size\ndistribution: 6kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome_GC_rank', 'bin_min', 'frag_count', 'file'), with=F]
tbl.bin.p[, file := "Fragment size distribution: 6kb"]
## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
In [240]:
%%R -h 300
# making density heatmaps
### amplicon
p.heat.amp.6 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
print(p.heat.amp.6)
### shotgun
p.heat.shotgun.6 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ] )
print(p.heat.shotgun.6)
In [241]:
%%R -h 10 -w 18 -u in
y.axis.t = expression(paste('Fragment buoyant density (g ', ml^-1, ')'))
y.axis.t = textGrob(y.axis.t, vjust = 1, rot=90, gp=gpar(cex = 1.75))
## plotting
#tiff( "~/notebook/deltaGC_notes/data/amp-mg/figures/dense-htmp_arc_12.5-6kb/dense-htmp_arc_12.5-6kb.tiff",
# width=17, height=10, units='in', res=300 )
grid.arrange(p.heat.amp.12.5, p.dense.amp.12.5, p.heat.amp.6, p.dense.amp.6,
p.heat.shotgun.12.5, p.dense.shotgun.12.5, p.heat.shotgun.6, p.dense.shotgun.6,
nrow=2, ncol=4, widths=c(7,2,7,2), left = y.axis.t)
#dev.off()
xxx=1
In [ ]: