workflow
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140411_allBac/amp454-L215-N1k
Grinder run
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140409_allBac/amp454-L215-N1k
mkdir read_files
for file in `find ~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps/ -name "*fasta" | perl -pe 's/.+\///'`
do
    grinder -reference_file ~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps/$file  \
    -abundance_model uniform -forward_reverse 515Fm-927Rm.fna -rd 215 normal 0 \
    -tr 1000 -copy_bias 0 -length_bias 0 -unidirectional 1 -bn read_files/$file
done
    * had to use for loop because some genomes did not produce any amplicons
    * 1092 of the 1163 genomes had amplicons
    * all have 1000 reads
Combining read files
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140409_allBac/amp454-L215-N1k
cat $(find read_files -name "*fa" | grep -v "Azospirillum_brasilense_Sp245.fasta-reads.fa") | seqIO.pl -i fasta -o fasta > amp454-L215-N1k_reads.fa
    * 1091000 sequences extracted
    * seqIO.pl used to make line wrapping consistent
Grinder run
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140409_allBac/mg454-L215-N1k
mkdir read_files
screen -S grinder_batch -L bash -c \
    "find /home/nick/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps/ -name '*fasta' | \
    perl -pe 's/.+\///' | xargs -n 1 -I % -P 28 \
    grinder -reference_file /home/nick/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps/% \
    -total_reads 1000 -bn read_files/% -read_dist 215 -length_bias 0 -unidirectional 1"
    * 1162 genomes produced shotgun reads
    * all have 1000 reads
Combining read files
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140409_allBac/mg454-L215-N10k
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140411_allBac/mg454-L215-N10k
grinder_descByFileName.pl ../../140409_allBac/mg454-L215-N10k/read_files/ > mg454-L215-N10k_reads.fa
    * combining read files
    * correcting multiple chromosome problem; description is now the same for each taxon
grep -f <(find ~/notebook/deltaGC_notes/data/amplicon/140409_allBac/amp454-L215-N1k/read_files/ -name "*fa" | perl -pe 's/.+\/|\.fasta-read.+//g; s/\*/\\*/') mg454-L215-N10k_reads.fa | fasta_selectSeqs.pl -f mg454-L215-N10k_reads.fa -l - | seqIO.pl -i fasta -o fasta > mg454-L215-N10k_reads_e.fa
    * 10910000 sequences
    * 10000 sequences per genome
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140411_allBac/amp454-L215-N1k
deltaGC.pl -a -g /home/nick/notebook/deltaGC_notes/data/genome_data/prokBac-CompRep.fna -r amp454-L215-N1k_reads.fa -range 4000-20000 -mean 15000 -sd f -t 12 > skewnorm_m15k_r4k-20k.txt
deltaGC.pl -a -g /home/nick/notebook/deltaGC_notes/data/genome_data/prokBac-CompRep.fna -r amp454-L215-N1k_reads.fa -range 4000-11500 -mean 10000 -sd f -df 50 50 -t 12 -i > skewnorm_m10k_r4k-11.5k.txt
binning by buoyant density
deltaGCBin.pl skewnorm_m15k_r4k-20k.txt > skewnorm_m15k_r4k-20k_bin.txt
deltaGCBin.pl skewnorm_m10k_r4k-11.5k.txt > skewnorm_m10k_r4k-11.5k_bin.txt
In [1]:
    
%load_ext rmagic
%load_ext rpy2.ipython
    
    
In [ ]:
    
    
In [ ]:
    
    
In [30]:
    
%%R
require(ggplot2)
require(reshape)
require(data.table)
require(gridExtra)
require(colorspace)
    
    
In [3]:
    
%%R
# loading amplicon data 
base.dir = "~/notebook/deltaGC_notes/data/amplicon/140411_allBac/amp454-L215-N1k";
files = c(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )
tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }
tbl.bin.l = list()
for(i in files){
    in.file = paste(c(i, "bin.txt"), collapse="_")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
    }
# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps_GC/prok-bac-genomes_Ngaps_GC.txt",
                    header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]
## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )
## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]
## merging tables
for(i in names(tbl.bin.l)){
    tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}
## merging lists
tbl.amp = do.call(rbind, tbl.l)
tbl.amp$data = rep('amplicon\nfragments', nrow(tbl.amp))
tbl.l = list()
tbl.bin.amp = do.call(rbind, tbl.bin.l)
tbl.bin.amp$data = rep('amplicon\nfragments', nrow(tbl.bin.amp))
tbl.bin.l = list()
    
    
In [4]:
    
%%R
# loading shotgun read data
base.dir = "~/notebook/deltaGC_notes/data/mg-reads/140411_allBac/mg454-L215-N10k";
files = list(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )
tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }
tbl.bin.l = list()
for(i in files){
    in.file = paste(c(i, "bin.txt"), collapse="_")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
    }
# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_Ngaps_GC/prok-bac-genomes_Ngaps_GC.txt",
                    header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]
## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )
## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]
## merging tables
for(i in names(tbl.bin.l)){
    tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}
## merging lists
tbl.mg = do.call(rbind, tbl.l)
tbl.mg$data = rep('shotgun\nfragments', nrow(tbl.mg))
tbl.l = list()
tbl.bin.mg = do.call(rbind, tbl.bin.l)
tbl.bin.mg$data = rep('shotgun\nfragments', nrow(tbl.bin.mg))
tbl.bin.l = list()
    
    
In [5]:
    
%%R
## merging amp & mg tables
tbl = rbind(tbl.amp, tbl.mg)
tbl.amp = vector()  # clearing memory
tbl.mg = vector()
tbl.bin = rbind(tbl.bin.amp, tbl.bin.mg)
tbl.bin.amp = vector()
tbl.bin.mg = vector()
    
In [6]:
    
%%R
# editing $file names
tbl[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]
tbl.bin[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl.bin[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]
    
    
In [7]:
    
%%R
# simulating label incorporation by moving fragment buoyant density
## 100% 15N
tbl[, label_inc_15N := fragment_buoyant_density + 0.016]
## 100% 13C
tbl[, label_inc_13C := fragment_buoyant_density + 0.036]
    
    
In [8]:
    
%%R
# parsing out rows & columns needed for fragment density plot
tbl.p = subset(tbl, file=="Fragment size\ndistribution: 12.5kb")
tbl.p = tbl.p[, c('data','fragment_buoyant_density', 'label_inc_15N', 'label_inc_13C'), with=F]
## editing data column
tbl.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
## melting
tbl.m = melt(tbl.p, id.vars=c('data'))
    
In [9]:
    
%%R -w 170 -h 300
## making the density plots for amplicons & shotgun sequences
make_density_plot <- function(tbl, x.axis=TRUE){
    p = ggplot(tbl, aes(value, group=variable, fill=variable, 
                                 alpha=variable, linetype=variable)) +   
            geom_density() +
            scale_linetype_manual(values=c('solid', 'dashed', 'dashed')) +
            scale_fill_manual(values=c('blue', 'red', 'green')) +
            scale_alpha_manual(values=c(0.7, 0.15, 0.15)) +
            scale_x_reverse(limits=c(1.77,1.66)) +
            coord_flip() + 
            facet_grid(data ~ .) +
            labs(y='Kernel density') +
            theme( 
                text = element_text(size=22),
                axis.title.y = element_blank(),
                axis.title.x = element_text( vjust=1 ),
                axis.text.x = element_blank(),
                axis.ticks.x = element_blank(),
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank(),
                strip.text.y = element_text(size=22),
                legend.position = 'none'
                )
    # removing x axis title if needed
    if(x.axis==FALSE){   # no x-axis, top plot
        p = p + theme( axis.title.x = element_blank() )
        p = p + theme( plot.margin = unit(c(1,1,0,0), "lines") ) # top, right, bottom, left
    } else {
        p = p + theme( plot.margin = unit(c(0,1,1,0), "lines") ) # top, right, bottom, left
    }
    return(p)
}
### amplicon
p.dense.amp = make_density_plot( tbl.m[tbl.m$data=='amplicon-fragments', ], x.axis=FALSE)
#print(p.dense.amp)
### shotgun
p.dense.shotgun = make_density_plot( tbl.m[tbl.m$data=='shotgun-fragments', ] )
#print(p.dense.shotgun)
xxx=1
    
In [10]:
    
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin, file=="Fragment size\ndistribution: 12.5kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome_GC_rank', 'bin_min', 'frag_count'), with=F]
## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
    
    
In [31]:
    
%%R -h 250 -w 900
# making heatmaps of buoyant density distributions
make_density_heatmap <- function(tbl, x.axis=TRUE){
    ## heatmap colors: black only for lowest value
    cols = c(
    rep("#000000", 1),
    rep("darkblue", 99),
    rep("#1F1FFF", 100),
    rep("#3F3FFF", 100),
    rep("#5F5FFF", 100),
    rep("#7F7FFF", 100),
    rep("#9F9FFF", 100),
    rep("#BFBFFF", 100),
    rep("#DFDFFF", 100),
    rep("#FFFFFF", 100)
    )
    
    ## black and white
    cols = c(
    rep("#FFFFFF", 1),
    rep("#E6E6E6", 99),
    rep("#D9D9D9", 100),
    rep("#CCCCCC", 100),    
    rep("#BEBEBE", 100),    
    rep("#AEAEAE", 100),
    rep("#9D9D9D", 100),    
    rep("#888888", 100),    
    rep("#6F6F6F", 100),
    rep("#4D4D4D", 100)
    )
    # rev(heat_hcl)
    cols=c(
    rep("#FFFFFF", 1),
    rep("#E5D961", 99),
    rep("#E8C842", 100),
    rep("#E9B62D", 100),
    rep("#EAA428", 100),
    rep("#E89132", 100),
    rep("#E57E41", 100),
    rep("#E06B50", 100),
    rep("#DA565E", 100),
    rep("#D33F6A", 100)
    )
    
    cols = c('white', rev(heat_hcl(99)) )
    
    ## heatmap plotting
    p.heat = ggplot(tbl, aes(genome_GC_rank, bin_min, fill=frag_count)) +
        geom_tile() +
        scale_alpha_continuous( guide=FALSE ) +
        scale_fill_gradientn( colours=cols, name='Count' ) +
        scale_x_continuous( expand = c(0,0) ) +
        scale_y_reverse( expand = c(0,0), limits=c(1.77,1.66) ) +
        labs( x='Genome (ordered by genome G+C content)', y='Fragment buoyant density') +
        #facet_grid(data ~ .) +
        theme(
            text = element_text(size=22),
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            axis.title.y = element_blank(),
            legend.justification=c(0,0),
            legend.position=c(0,0),
            legend.text = element_text(size=14),
            plot.margin = unit(c(0,0,0,1), "lines") # top, right, bottom, left
            )
    # removing x axis title if needed
    if(x.axis==FALSE){  # no x-axis; top plot
        p.heat = p.heat + theme( axis.title.x = element_blank() )
        p.heat = p.heat + theme( plot.margin = unit(c(1,0,0,1), "lines") ) # top, right, bottom, left
    } else {
        p.heat = p.heat + theme( plot.margin = unit(c(0,0,1,1), "lines") ) # top, right, bottom, left
    }
    
    return(p.heat)
}
### amplicon
p.heat.amp = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
#print(p.heat.amp)
### shotgun
p.heat.shotgun = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ] )
#print(p.heat.shotgun)
xxx=1
    
In [ ]:
    
%%R -h 8 -w 16 -u in
# combined plots
## combined y axis
y.axis.t = expression(paste('Fragment buoyant density (g ', ml^-1, ')'))
y.axis.t = textGrob(y.axis.t, vjust = 1, rot=90, gp=gpar(cex = 1.9))
## plotting 
tiff( "~/notebook/deltaGC_notes/data/amp-mg/figures/dense-htmp_12.5kb/dense-htmp_12.5kb.tiff", 
        width=17, height=8, units='in', res=300 )
grid.arrange(p.heat.amp, p.dense.amp, p.heat.shotgun, p.dense.shotgun, 
                 nrow=2, ncol=2, widths=c(7,1), left = y.axis.t)
dev.off()
xxx=1
    
In [ ]:
    
    
In [13]:
    
%%R
# parsing out rows & columns needed for fragment density plot
tbl.p = subset(tbl, file=="Fragment size\ndistribution: 6kb")
tbl.p = tbl.p[, c('data','fragment_buoyant_density', 'label_inc_15N', 'label_inc_13C'), with=F]
## editing data column
tbl.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
## melting
tbl.m = melt(tbl.p, id.vars=c('data'))
    
In [14]:
    
%%R -w 170 -h 300
# making density plots
### amplicon
p.dense.amp = make_density_plot( tbl.m[tbl.m$data=='amplicon-fragments', ], x.axis=FALSE)
print(p.dense.amp)
### shotgun
p.dense.shotgun = make_density_plot( tbl.m[tbl.m$data=='shotgun-fragments', ] )
print(p.dense.shotgun)
    
    
    
In [15]:
    
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin, file=="Fragment size\ndistribution: 6kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome_GC_rank', 'bin_min', 'frag_count'), with=F]
## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
    
    
In [16]:
    
%%R -h 4 -w 16 -u in
# making density heatmaps 
### amplicon
p.heat.amp = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
print(p.heat.amp)
### shotgun
p.heat.shotgun = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ] )
print(p.heat.shotgun)
    
    
    
In [17]:
    
%%R -h 8 -w 16 -u in
# combined plots
## combined y axis
y.axis.t = expression(paste('Fragment buoyant density (g ', ml^-1, ')'))
y.axis.t = textGrob(y.axis.t, vjust = 1, rot=90, gp=gpar(cex = 1.75))
## plotting 
#tiff( "~/notebook/deltaGC_notes/data/amp-mg/figures/dense-htmp_6kb/dense-htmp_6kb.tiff", width=17, height=8, units='in', res=300 )
grid.arrange(p.heat.amp, p.dense.amp, p.heat.shotgun, p.dense.shotgun, 
                 nrow=2, ncol=2, widths=c(7,1), left = y.axis.t)
#dev.off()
xxx=1
    
    
In [222]:
    
%%R
# simulating label incorporation by moving fragment buoyant density
tbl[, label_inc_no := fragment_buoyant_density]
tbl[, label_inc_N := fragment_buoyant_density + 0.016]
tbl[, label_inc_C := fragment_buoyant_density + 0.036]
# melting table
require(reshape)
tbl.m = melt(tbl[, c('file', 'data', 'label_inc_no', 'label_inc_N', 'label_inc_C'), with=F], 
                 id.vars=c('file', 'data'))
    
In [223]:
    
%%R
# making histograms for each fragment size
## getting unique file|data categories
u.cat = as.matrix(unique(tbl[, c('file','data'), with=F]))
breaks = seq(1.660, 1.781, 0.001)
hist.l = list(list(list()))
for(i in 1:nrow(u.cat)){
    r = u.cat[i,]
    tmp = subset(tbl, file == r[1] & data == r[2] )
    hist.l[[r[1]]][[r[2]]][['no']] = hist( tmp$label_inc_no, breaks=breaks )
    hist.l[[r[1]]][[r[2]]][['N']] = hist( tmp$label_inc_N, breaks=breaks )
    hist.l[[r[1]]][[r[2]]][['C']] = hist( tmp$label_inc_C, breaks=breaks )
}
#print(hist.l)
xxx=1
    
    
    
    
    
    
    
    
    
    
    
    
    
In [224]:
    
%%R
# calculating intersect of histograms
require(HistogramTools)
for(file in names(hist.l)){
    if(file == ''){next}
    for(data in names(hist.l[[file]])){
        if(data == ''){next}
        hist.no = hist.l[[file]][[data]][['no']]
        hist.N = hist.l[[file]][[data]][['N']]
        hist.C = hist.l[[file]][[data]][['C']]
        
        no.N = sum(IntersectHistograms(hist.no, hist.N)$counts) / sum(hist.no$counts) * 100
        no.C = sum(IntersectHistograms(hist.no, hist.C)$counts) / sum(hist.no$counts) * 100  
        print(c(file, data, no.N, no.C))
    }
}
    
    
In [226]:
    
%%R -w 7 -h 4 -u in
# q5-q95 range
tbl[, q05.q95 := quantile(fragment_buoyant_density, 0.95) - quantile(fragment_buoyant_density,0.05), by='file,data,genome']
# dereplicating by file,data,genome,q05.q95
tbl.s = tbl[, sum(1), by='file,data,genome,q05.q95']
# plotting 
p = ggplot( tbl.s, aes(data, q05.q95) ) +
        geom_boxplot() +
        labs(x='fragment simulation', y='Buoyant density range\n(95% of all fragments)') +
        facet_grid(. ~ file) +        
        theme( 
            text = element_text(size=16),
            axis.title.x = element_blank()
            )
#ggsave("~/notebook/deltaGC_notes/data/amp-mg/figures/BD-95range_bac_boxplot.pdf")
print(p)
    
    
@ system76-server:~/notebook/deltaGC/data/amplicon/140416_allArc/amp454-L215-N1k
mkdir read_files
for file in `find ~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps -name "*fasta" | perl -pe 's/.+\///'`
do
    grinder -reference_file ~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps/$file  -abundance_model uniform -forward_reverse ../515Fm-927Rm.fna -rd 215 normal 0 -tr 1000 -copy_bias 0 -length_bias 0 -unidirectional 1 -bn read_files/$file
done
    * had to use for loop because some genomes did not produce any amplicons
    * 101 of the 119 genomes had amplicons
     * all have 1000 reads
Combining read files
@ system76-server:~/notebook/deltaGC/data/amplicon/140416_allArc/amp454-L215-N1k
grinder_descByFileName.pl ./read_files/ | seqIO.pl -i fasta -o fasta > amp454-L215-N1k_reads.fa
    * seqIO.pl used to make line wrapping consistent
    * grinder_descByFileName.pl corrects for any multi-chromosome issues
    * reads for 101 genomes
    * 101000 total reads
@ system76-server:~/notebook/deltaGC_notes/data/amplicon/140416_allArc/amp454-L215-N1k
ln -s ~/notebook/deltaGC_notes/data/genome_data/prokArc-CompRep.fna ../
deltaGC.pl -a -g ../prokArc-CompRep.fna -r amp454-L215-N1k_reads.fa -range 4000-20000 -mean 15000 -sd f -t 12 > skewnorm_m15k_r4k-20k.txt
deltaGC.pl -a -g ../prokArc-CompRep.fna -r amp454-L215-N1k_reads.fa -range 4000-11500 -mean 10000 -sd f -df 50 50 -t 12 -i  > skewnorm_m10k_r4k-11.5k.txt
binning by buoyant density
deltaGCBin.pl -f 10 skewnorm_m15k_r4k-20k.txt > skewnorm_m15k_r4k-20k_bin.txt
deltaGCBin.pl -f 12 skewnorm_m10k_r4k-11.5k.txt > skewnorm_m10k_r4k-11.5k_bin.txt
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140417_allArc/mg454-L215-N10k
mkdir read_files
screen -S grinder_batch -L bash -c \
    "find /home/nick/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps/ -name '*fasta' | \
    perl -pe 's/.+\///' | xargs -n 1 -I % -P 32 \
    grinder -reference_file /home/nick/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps/% \
    -total_reads 10000 -bn read_files/% -read_dist 215 -length_bias 0 -unidirectional 1"
    * 119 genomes produced shotgun reads
    * all have 10000 reads
Combining read files
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140417_allBac/mg454-L215-N10k
just using genomes that amplified with 515F and 927R primers (consistent with 16S amplicon simulations)
grinder_descByFileName.pl ./read_files/ > mg454-L215-N10k_reads.fa
  * combining read files
  * correcting multiple chromosome problem; description is now the same for each taxon
grep -f <(find ~/notebook/deltaGC_notes/data/amplicon/140416_allArc/amp454-L215-N1k/read_files/ -name "*fa" | perl -pe 's/.+\/|.fasta-read.+//g') mg454-L215-N10k_reads.fa | \ fasta_selectSeqs.pl -f mg454-L215-N10k_reads.fa -l - | seqIO.pl -i fasta -o fasta > mg454-L215-N10k_reads_e.fa
  * 1010000 sequences
  * 10000 sequences per genome
@ system76-server:~/notebook/deltaGC_notes/data/mg-reads/140417_allBac/mg454-L215-N10k
deltaGC.pl -g /home/nick/notebook/deltaGC_notes/data/genome_data/prokArc-CompRep.fna -r mg454-L215-N10k_reads_e.fa -range 4000-20000 -mean 15000 -sd f -t 12 -p 1 > skewnorm_m15k_r4k-20k.txt
deltaGC.pl -g /home/nick/notebook/deltaGC_notes/data/genome_data/prokArc-CompRep.fna -r mg454-L215-N10k_reads_e.fa -range 4000-11500 -mean 10000 -sd f -df 50 50 -t 12 -i -p 1 > skewnorm_m10k_r4k-11.5k.txt
binning by buoyant density (for heatmap plotting)
deltaGCBin.pl -f 8 skewnorm_m15k_r4k-20k.txt > skewnorm_m15k_r4k-20k_bin.txt
deltaGCBin.pl -f 8 skewnorm_m10k_r4k-11.5k.txt > skewnorm_m10k_r4k-11.5k_bin.txt
In [17]:
    
%%R
base.dir = "~/notebook/deltaGC_notes/data/amplicon/140416_allArc/amp454-L215-N1k";
files = c(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )
tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }
tbl.bin.l = list()
for(i in files){
    in.file = paste(c(i, "bin.txt"), collapse="_")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
    }
# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps_GC/prok-arc-genomes_Ngaps_GC.txt",
                    header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]
## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )
## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]
## merging tables
for(i in names(tbl.bin.l)){
    tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}
## merging lists
tbl.amp = do.call(rbind, tbl.l)
tbl.amp$data = rep('amplicon\nfragments', nrow(tbl.amp))
tbl.l = list()
tbl.bin.amp = do.call(rbind, tbl.bin.l)
tbl.bin.amp$data = rep('amplicon\nfragments', nrow(tbl.bin.amp))
tbl.bin.l = list()
print(tbl.bin.amp)
    
    
In [18]:
    
%%R
base.dir = "~/notebook/deltaGC_notes/data/mg-reads/140417_allArc/mg454-L215-N10k";
files = list(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )
tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }
tbl.bin.l = list()
for(i in files){
    in.file = paste(c(i, "bin.txt"), collapse="_")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
    }
# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps_GC/prok-arc-genomes_Ngaps_GC.txt",
                    header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]
## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )
## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]
## merging tables
for(i in names(tbl.bin.l)){
    tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}
## merging lists
tbl.mg = do.call(rbind, tbl.l)
tbl.mg$data = rep('shotgun\nfragments', nrow(tbl.mg))
tbl.l = list()
tbl.bin.mg = do.call(rbind, tbl.bin.l)
tbl.bin.mg$data = rep('shotgun\nfragments', nrow(tbl.bin.mg))
tbl.bin.l = list()
print(tbl.bin.mg)
    
    
In [19]:
    
%%R
## merging amp & mg tables
tbl = rbind(tbl.amp, tbl.mg)
tbl.amp = vector()
tbl.mg = vector()
tbl.bin = rbind(tbl.bin.amp, tbl.bin.mg)
tbl.bin.amp = vector()
tbl.bin.mg = vector()
    
In [20]:
    
%%R
# editing $file names
tbl[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]
tbl.bin[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl.bin[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]
    
    
In [22]:
    
%%R
# simulating label incorporation by moving fragment buoyant density
## 100% 15N
tbl[, label_inc_15N := fragment_buoyant_density + 0.016]
## 100% 13C
tbl[, label_inc_13C := fragment_buoyant_density + 0.036]
    
    
In [23]:
    
%%R
# parsing out rows & columns needed for fragment density plot
tbl.p = subset(tbl, file=="Fragment size\ndistribution: 12.5kb")
tbl.p = tbl.p[, c('data','fragment_buoyant_density', 'label_inc_15N', 'label_inc_13C'), with=F]
## editing data column
tbl.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
## melting
tbl.m = melt(tbl.p, id.vars=c('data'))
    
In [24]:
    
%%R -w 170 -h 300
## making the density plots for amplicons & shotgun sequences
make_density_plot <- function(tbl, x.axis=TRUE){
    p = ggplot(tbl, aes(value, group=variable, fill=variable, 
                                 alpha=variable, linetype=variable)) +   
            geom_density() +
            scale_linetype_manual(values=c('solid', 'dashed', 'dashed')) +
            scale_fill_manual(values=c('blue', 'red', 'green')) +
            scale_alpha_manual(values=c(0.7, 0.15, 0.15)) +
            scale_x_reverse(limits=c(1.77,1.66)) +
            coord_flip() + 
            facet_grid(data ~ .) +
            labs(y='Kernel density') +
            theme( 
                text = element_text(size=18),
                axis.title.y = element_blank(),
                axis.title.x = element_text( vjust=1 ),
                axis.text.x = element_blank(),
                axis.ticks.x = element_blank(),
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank(),
                strip.text.y = element_text(size=18),
                legend.position = 'none'
                )
    # removing x axis title if needed
    if(x.axis==FALSE){   # no x-axis, top plot
        p = p + theme( axis.title.x = element_blank() )
        p = p + theme( plot.margin = unit(c(1,1,0,0), "lines") ) # top, right, bottom, left
        p = p + labs(title="")
        p = p + theme( title = element_text(size=16))
    } else {
        p = p + theme( plot.margin = unit(c(0,1,1,0), "lines") ) # top, right, bottom, left
    }
    return(p)
}
### amplicon
p.dense.amp.12.5 = make_density_plot( tbl.m[tbl.m$data=='amplicon-fragments', ], x.axis=FALSE)
print(p.dense.amp.12.5)
### shotgun
p.dense.shotgun.12.5 = make_density_plot( tbl.m[tbl.m$data=='shotgun-fragments', ] )
print(p.dense.shotgun.12.5)
    
    
    
In [25]:
    
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin, file=="Fragment size\ndistribution: 12.5kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome_GC_rank', 'bin_min', 'frag_count', 'file'), with=F]
tbl.bin.p[, file := "Fragment size distribution: 12.5kb"]
## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
    
    
In [26]:
    
%%R -h 250 -w 900
# making heatmaps of buoyant density distributions
make_density_heatmap <- function(tbl, x.axis=TRUE){
    ## heatmap colors: black only for lowest value
    cols = c(
    rep("#000000", 1),
    rep("darkblue", 99),
    rep("#1F1FFF", 100),
    rep("#3F3FFF", 100),
    rep("#5F5FFF", 100),
    rep("#7F7FFF", 100),
    rep("#9F9FFF", 100),
    rep("#BFBFFF", 100),
    rep("#DFDFFF", 100),
    rep("#FFFFFF", 100)
    )
    # plot title
    plot.title = unique(tbl$file)
    
    ## heatmap plotting
    p.heat = ggplot(tbl, aes(genome_GC_rank, bin_min, fill=frag_count)) +
        geom_tile() +
        scale_alpha_continuous( guide=FALSE ) +
        scale_fill_gradientn( colours=cols, name='Count' ) +
        scale_x_continuous( expand = c(0,0) ) +
        scale_y_reverse( expand = c(0,0), limits=c(1.77,1.66) ) +
        labs( x='Genome (ordered by genome G+C content)', y='Fragment buoyant density') +
        theme(
            text = element_text(size=18),
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            axis.title.y = element_blank(),
            legend.justification=c(0,0),
            legend.position=c(0,0),
            legend.text = element_text(size=14),
            plot.margin = unit(c(0,0,0,1), "lines") # top, right, bottom, left
            )
    # removing x axis title if needed
    if(x.axis==FALSE){  # no x-axis; top plot
        p.heat = p.heat + theme( axis.title.x = element_blank() )
        p.heat = p.heat + theme( plot.margin = unit(c(1,0,0,1), "lines") ) # top, right, bottom, left
        p.heat = p.heat + labs( title = plot.title )
        p.heat = p.heat + theme( title = element_text( vjust=1, hjust=0.75, size=16) )
    } else {
        p.heat = p.heat + theme( plot.margin = unit(c(0,0,1,1), "lines") ) # top, right, bottom, left
    }
    
    return(p.heat)
}
### amplicon
p.heat.amp.12.5 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
print(p.heat.amp.12.5)
### shotgun
p.heat.shotgun.12.5 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ] )
print(p.heat.shotgun.12.5)
    
    
    
In [237]:
    
%%R
# parsing out rows & columns needed for fragment density plot
tbl.p = subset(tbl, file=="Fragment size\ndistribution: 6kb")
tbl.p = tbl.p[, c('data','fragment_buoyant_density', 'label_inc_15N', 'label_inc_13C'), with=F]
## editing data column
tbl.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
## melting
tbl.m = melt(tbl.p, id.vars=c('data'))
    
In [238]:
    
%%R -w 170 -h 300
# making density plots
### amplicon
p.dense.amp.6 = make_density_plot( tbl.m[tbl.m$data=='amplicon-fragments', ], x.axis=FALSE)
print(p.dense.amp.6)
### shotgun
p.dense.shotgun.6 = make_density_plot( tbl.m[tbl.m$data=='shotgun-fragments', ] )
print(p.dense.shotgun.6)
    
    
    
In [239]:
    
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin, file=="Fragment size\ndistribution: 6kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome_GC_rank', 'bin_min', 'frag_count', 'file'), with=F]
tbl.bin.p[, file := "Fragment size distribution: 6kb"]
## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']
    
    
In [240]:
    
%%R -h 300
# making density heatmaps 
### amplicon
p.heat.amp.6 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
print(p.heat.amp.6)
### shotgun
p.heat.shotgun.6 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ] )
print(p.heat.shotgun.6)
    
    
    
In [241]:
    
%%R -h 10 -w 18 -u in
y.axis.t = expression(paste('Fragment buoyant density (g ', ml^-1, ')'))
y.axis.t = textGrob(y.axis.t, vjust = 1, rot=90, gp=gpar(cex = 1.75))
## plotting 
#tiff( "~/notebook/deltaGC_notes/data/amp-mg/figures/dense-htmp_arc_12.5-6kb/dense-htmp_arc_12.5-6kb.tiff", 
#    width=17, height=10, units='in', res=300 )
grid.arrange(p.heat.amp.12.5, p.dense.amp.12.5,           p.heat.amp.6, p.dense.amp.6,
             p.heat.shotgun.12.5, p.dense.shotgun.12.5,   p.heat.shotgun.6, p.dense.shotgun.6, 
             nrow=2, ncol=4, widths=c(7,2,7,2), left = y.axis.t)
#dev.off()
xxx=1
    
    
In [ ]: