Description:

Misc notebook with double checks on workflows from the other notebooks.


In [55]:
%load_ext rmagic


The rmagic extension is already loaded. To reload it, use:
  %reload_ext rmagic

In [56]:
%%R
require(ggplot2)
require(data.table)
require(gridExtra)

Checking to make sure Archeaea fragment BD heatmaps are ordered correctly


In [57]:
%%R
base.dir = "~/notebook/deltaGC_notes/data/amplicon/140416_allArc/amp454-L215-N1k";
files = c(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )

tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }

tbl.bin.l = list()
for(i in files){
    in.file = paste(c(i, "bin.txt"), collapse="_")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
    }



# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps_GC/prok-arc-genomes_Ngaps_GC.txt",
                    header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]

## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )

## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]

## merging tables
for(i in names(tbl.bin.l)){
    tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}

## merging lists
tbl.amp = do.call(rbind, tbl.l)
tbl.amp$data = rep('amplicon\nfragments', nrow(tbl.amp))
#tbl.l = list()

tbl.bin.amp = do.call(rbind, tbl.bin.l)
tbl.bin.amp$data = rep('amplicon\nfragments', nrow(tbl.bin.amp))
#tbl.bin.l = list()

#print(tbl.l)
print(head(tbl.genGC[order(tbl.genGC$genome_GC)], n=20))


                                     genome genome_GC genome_GC_rank
 1:      Methanosphaera_stadtmanae_DSM_3091 0.2718069              1
 2:                 Methanococcus_voltae_A3 0.2812582              2
 3:     Methanothermococcus_okinawensis_IH1 0.2881838              3
 4:       Caldisphaera_lagunensis_DSM_15908 0.2954165              4
 5:         Methanococcus_aeolicus_Nankai-3 0.2954504              5
 6:   Methanobrevibacter_smithii_ATCC_35061 0.3052548              6
 7:              Methanococcus_vannielii_SB 0.3081473              7
 8:  Methanocaldococcus_jannaschii_DSM_2661 0.3091147              8
 9:         Methanocaldococcus_vulcanius_M7 0.3097321              9
10:        Methanothermus_fervidus_DSM_2088 0.3111835             10
11:         Methanocaldococcus_fervens_AG86 0.3168188             11
12:              Methanotorris_igneus_Kol_5 0.3177040             12
13:       Methanobrevibacter_ruminantium_M1 0.3210589             13
14:               Sulfolobus_tokodaii_str_7 0.3224922             14
15:            Methanococcus_maripaludis_C5 0.3246160             15
16:          Methanocaldococcus_infernus_ME 0.3306737             16
17:                 Acidianus_hospitalis_W1 0.3358832             17
18:           Nitrosopumilus_maritimus_SCM1 0.3360738             18
19: Candidatus_Nitrosopumilus_koreensis_AR1 0.3362280             19
20:          Sulfolobus_islandicus_L_S_2_15 0.3453542             20

In [59]:
%%R
# editing $file names
tbl.bin.amp[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl.bin.amp[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]


                                genome scaffold          read  read_GC
    1:         Acidianus_hospitalis_W1 CP002535 CP002535__654 63.72093
    2:         Acidianus_hospitalis_W1 CP002535 CP002535__753 63.72093
    3:         Acidianus_hospitalis_W1       NA            NA       NA
    4:         Acidianus_hospitalis_W1       NA            NA       NA
    5:         Acidianus_hospitalis_W1       NA            NA       NA
   ---                                                                
22418: Vulcanisaeta_moutnovskia_768-28       NA            NA       NA
22419: Vulcanisaeta_moutnovskia_768-28       NA            NA       NA
22420: Vulcanisaeta_moutnovskia_768-28       NA            NA       NA
22421: Vulcanisaeta_moutnovskia_768-28       NA            NA       NA
22422: Vulcanisaeta_moutnovskia_768-28 CP002529 CP002529__554 69.30233
       read_buoyant_density read_start read_length fragment_GC
    1:             1.722447    2593074         215    41.93737
    2:             1.722447    2593074         215    56.49376
    3:                   NA         NA          NA          NA
    4:                   NA         NA          NA          NA
    5:                   NA         NA          NA          NA
   ---                                                        
22418:                   NA         NA          NA          NA
22419:                   NA         NA          NA          NA
22420:                   NA         NA          NA          NA
22421:                   NA         NA          NA          NA
22422:             1.727916    2975367         215    56.03850
       fragment_buoyant_density fragment_start fragment_length bin_min bin_max
    1:                 1.701099        2581381           13699   1.701   1.702
    2:                 1.715364        2591081           12589   1.715   1.716
    3:                       NA             NA              NA   1.725   1.726
    4:                       NA             NA              NA   1.759   1.760
    5:                       NA             NA              NA   1.768   1.769
   ---                                                                        
22418:                       NA             NA              NA   1.708   1.709
22419:                       NA             NA              NA   1.766   1.767
22420:                       NA             NA              NA   1.672   1.673
22421:                       NA             NA              NA   1.748   1.749
22422:                 1.714918        2974508            6649   1.714   1.715
       read_count frag_count   median median_rank
    1:          0         44 1.711382          43
    2:          0         67 1.711382          43
    3:          0          0 1.711382          43
    4:          0          0 1.711382          43
    5:          0          0 1.711382          43
   ---                                           
22418:          0          0 1.718509          79
22419:          0          0 1.718509          79
22420:          0          0 1.718509          79
22421:          0          0 1.718509          79
22422:          0         82 1.718509          79
                                      file genome_GC genome_GC_rank
    1: Fragment size\ndistribution: 12.5kb 0.3358832             17
    2: Fragment size\ndistribution: 12.5kb 0.3358832             17
    3: Fragment size\ndistribution: 12.5kb 0.3358832             17
    4: Fragment size\ndistribution: 12.5kb 0.3358832             17
    5: Fragment size\ndistribution: 12.5kb 0.3358832             17
   ---                                                             
22418:    Fragment size\ndistribution: 6kb 0.4169213             37
22419:    Fragment size\ndistribution: 6kb 0.4169213             37
22420:    Fragment size\ndistribution: 6kb 0.4169213             37
22421:    Fragment size\ndistribution: 6kb 0.4169213             37
22422:    Fragment size\ndistribution: 6kb 0.4169213             37
                      data
    1: amplicon\nfragments
    2: amplicon\nfragments
    3: amplicon\nfragments
    4: amplicon\nfragments
    5: amplicon\nfragments
   ---                    
22418: amplicon\nfragments
22419: amplicon\nfragments
22420: amplicon\nfragments
22421: amplicon\nfragments
22422: amplicon\nfragments

In [73]:
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin.amp, file=="Fragment size\ndistribution: 12.5kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome', 'genome_GC', 'bin_min', 'frag_count', 'file'), with=F]
tbl.bin.p[, file := "Fragment size distribution: 12.5kb"]

## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']

# combining genome & genome_GC
tbl.bin.p[, genome_GC_name := paste(round(genome_GC*100, digits=1), genome, sep='-')]
print(tbl.bin.p)


                     data                          genome genome_GC bin_min
    1: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832   1.701
    2: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832   1.715
    3: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832   1.725
    4: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832   1.759
    5: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832   1.768
   ---                                                                     
11207: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213   1.708
11208: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213   1.766
11209: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213   1.672
11210: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213   1.748
11211: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213   1.714
       frag_count                               file
    1:         44 Fragment size distribution: 12.5kb
    2:         67 Fragment size distribution: 12.5kb
    3:          0 Fragment size distribution: 12.5kb
    4:          0 Fragment size distribution: 12.5kb
    5:          0 Fragment size distribution: 12.5kb
   ---                                              
11207:         18 Fragment size distribution: 12.5kb
11208:          0 Fragment size distribution: 12.5kb
11209:          0 Fragment size distribution: 12.5kb
11210:          0 Fragment size distribution: 12.5kb
11211:         50 Fragment size distribution: 12.5kb
                             genome_GC_name
    1:         33.6-Acidianus_hospitalis_W1
    2:         33.6-Acidianus_hospitalis_W1
    3:         33.6-Acidianus_hospitalis_W1
    4:         33.6-Acidianus_hospitalis_W1
    5:         33.6-Acidianus_hospitalis_W1
   ---                                     
11207: 41.7-Vulcanisaeta_moutnovskia_768-28
11208: 41.7-Vulcanisaeta_moutnovskia_768-28
11209: 41.7-Vulcanisaeta_moutnovskia_768-28
11210: 41.7-Vulcanisaeta_moutnovskia_768-28
11211: 41.7-Vulcanisaeta_moutnovskia_768-28

In [82]:
%%R -h 700 -w 1100
# making heatmaps of buoyant density distributions
make_density_heatmap <- function(tbl, x.axis=TRUE){

    ## heatmap colors: black only for lowest value
    cols = c(
    rep("#000000", 1),
    rep("darkblue", 99),
    rep("#1F1FFF", 100),
    rep("#3F3FFF", 100),
    rep("#5F5FFF", 100),
    rep("#7F7FFF", 100),
    rep("#9F9FFF", 100),
    rep("#BFBFFF", 100),
    rep("#DFDFFF", 100),
    rep("#FFFFFF", 100)
    )

    # plot title
    plot.title = unique(tbl$file)
    
    ## heatmap plotting
    p.heat = ggplot(tbl, aes(genome_GC_name, bin_min, fill=frag_count)) +
        geom_tile() +
        scale_alpha_continuous( guide=FALSE ) +
        scale_fill_gradientn( colours=cols, name='Count' ) +
        #scale_x_continuous( expand = c(0,0) ) +
        scale_y_reverse( expand = c(0,0), limits=c(1.77,1.66) ) +
        labs( x='Genome (ordered by genome G+C content)', y='Fragment buoyant density') +
        theme(
            text = element_text(size=18),
            axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
            #axis.ticks.x = element_blank(),
            axis.title.y = element_blank(),
            legend.justification=c(0,0),
            legend.position=c(0,0),
            legend.text = element_text(size=14),
            plot.margin = unit(c(0,0,0,1), "lines") # top, right, bottom, left
            )
    # removing x axis title if needed
    if(x.axis==FALSE){  # no x-axis; top plot
        p.heat = p.heat + theme( axis.title.x = element_blank() )
        p.heat = p.heat + theme( plot.margin = unit(c(1,0,0,1), "lines") ) # top, right, bottom, left
        p.heat = p.heat + labs( title = plot.title )
        p.heat = p.heat + theme( title = element_text( vjust=1, hjust=0.75, size=16) )
    } else {
        p.heat = p.heat + theme( plot.margin = unit(c(0,0,1,1), "lines") ) # top, right, bottom, left
    }
    
    return(p.heat)
}


### amplicon
p.heat.amp.12.5 = make_density_heatmap( tbl.bin.p , x.axis=FALSE )
print(p.heat.amp.12.5)


plotting genome BD as points on heatmap


In [45]:
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin.amp, file=="Fragment size\ndistribution: 12.5kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome', 'genome_GC', 'genome_GC_rank', 'bin_min', 'frag_count', 'file'), with=F]
tbl.bin.p[, file := "Fragment size distribution: 12.5kb"]

## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']

# BD of each genome
tbl.bin.p[, genome_BD := genome_GC * 0.098 + 1.66]


                     data                          genome genome_GC
    1: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832
    2: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832
    3: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832
    4: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832
    5: amplicon-fragments         Acidianus_hospitalis_W1 0.3358832
   ---                                                             
11207: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
11208: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
11209: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
11210: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
11211: amplicon-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
       genome_GC_rank bin_min frag_count                               file
    1:             17   1.701         44 Fragment size distribution: 12.5kb
    2:             17   1.715         67 Fragment size distribution: 12.5kb
    3:             17   1.725          0 Fragment size distribution: 12.5kb
    4:             17   1.759          0 Fragment size distribution: 12.5kb
    5:             17   1.768          0 Fragment size distribution: 12.5kb
   ---                                                                     
11207:             37   1.708         18 Fragment size distribution: 12.5kb
11208:             37   1.766          0 Fragment size distribution: 12.5kb
11209:             37   1.672          0 Fragment size distribution: 12.5kb
11210:             37   1.748          0 Fragment size distribution: 12.5kb
11211:             37   1.714         50 Fragment size distribution: 12.5kb
       genome_BD
    1:  1.692917
    2:  1.692917
    3:  1.692917
    4:  1.692917
    5:  1.692917
   ---          
11207:  1.700858
11208:  1.700858
11209:  1.700858
11210:  1.700858
11211:  1.700858

In [46]:
%%R -h 700 -w 1100
# making heatmaps of buoyant density distributions
make_density_heatmap <- function(tbl, x.axis=TRUE){

    ## heatmap colors: black only for lowest value
    cols = c(
    rep("#000000", 1),
    rep("darkblue", 99),
    rep("#1F1FFF", 100),
    rep("#3F3FFF", 100),
    rep("#5F5FFF", 100),
    rep("#7F7FFF", 100),
    rep("#9F9FFF", 100),
    rep("#BFBFFF", 100),
    rep("#DFDFFF", 100),
    rep("#FFFFFF", 100)
    )

    # plot title
    plot.title = unique(tbl$file)
    
    ## heatmap plotting
    p.heat = ggplot(tbl, aes(genome_GC_rank, bin_min, fill=frag_count)) +
        geom_tile() +
        geom_point(aes(genome_GC_rank, genome_BD, color='red')) +
        scale_alpha_continuous( guide=FALSE ) +
        scale_fill_gradientn( colours=cols, name='Count' ) +
        #scale_x_continuous( expand = c(0,0) ) +
        scale_y_reverse( expand = c(0,0), limits=c(1.77,1.66) ) +
        labs( x='Genome (ordered by genome G+C content)', y='Fragment buoyant density') +
        theme(
            text = element_text(size=18),
            axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
            #axis.ticks.x = element_blank(),
            axis.title.y = element_blank(),
            legend.justification=c(0,0),
            legend.position=c(0,0),
            legend.text = element_text(size=14),
            plot.margin = unit(c(0,0,0,1), "lines") # top, right, bottom, left
            )        
    
    # removing x axis title if needed
    if(x.axis==FALSE){  # no x-axis; top plot
        p.heat = p.heat + theme( axis.title.x = element_blank() )
        p.heat = p.heat + theme( plot.margin = unit(c(1,0,0,1), "lines") ) # top, right, bottom, left
        p.heat = p.heat + labs( title = plot.title )
        p.heat = p.heat + theme( title = element_text( vjust=1, hjust=0.75, size=16) )
    } else {
        p.heat = p.heat + theme( plot.margin = unit(c(0,0,1,1), "lines") ) # top, right, bottom, left
    }
    
    return(p.heat)
}


### amplicon
p.heat.amp.12.5 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='amplicon-fragments', ], x.axis=FALSE )
print(p.heat.amp.12.5)



In [46]:


In [47]:
%%R
base.dir = "~/notebook/deltaGC_notes/data/mg-reads/140417_allArc/mg454-L215-N10k";
files = list(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )

tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }

tbl.bin.l = list()
for(i in files){
    in.file = paste(c(i, "bin.txt"), collapse="_")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.bin.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.bin.l[[i]]$file = rep(i, nrow(tbl.bin.l[[i]]))
    }


# loading in genome GC table
tbl.genGC = fread("~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_Ngaps_GC/prok-arc-genomes_Ngaps_GC.txt",
                    header=F)
## editing table
colnames(tbl.genGC) = c('genome', 'genome_GC')
tbl.genGC[,genome:=gsub(".+\\/|\\.fasta", "", genome)]

## selecting just genomes in other tables
tbl.genGC = subset(tbl.genGC, tbl.genGC$genome %in% unique(tbl.bin.l[[i]]$genome) )

## adding rank
tbl.genGC[,genome_GC_rank:=rank(genome_GC, ties.method='first')]

## merging tables
for(i in names(tbl.bin.l)){
    tbl.bin.l[[i]] = merge(tbl.bin.l[[i]], tbl.genGC, by='genome')
}

## merging lists
tbl.mg = do.call(rbind, tbl.l)
tbl.mg$data = rep('shotgun\nfragments', nrow(tbl.mg))
tbl.l = list()

tbl.bin.mg = do.call(rbind, tbl.bin.l)
tbl.bin.mg$data = rep('shotgun\nfragments', nrow(tbl.bin.mg))
tbl.bin.l = list()

print(tbl.bin.mg)


Read 61.4% of 1010000 rows
Read 92.1% of 1010000 rows
Read 1010000 rows and 11 (of 11) columns from 0.139 GB file in 00:00:04
                                genome scaffold           read  read_GC
    1:         Acidianus_hospitalis_W1 CP002535 CP002535__4287 42.79070
    2:         Acidianus_hospitalis_W1 CP002535 CP002535__6816 60.93023
    3:         Acidianus_hospitalis_W1 CP002535 CP002535__2171 66.51163
    4:         Acidianus_hospitalis_W1       NA             NA       NA
    5:         Acidianus_hospitalis_W1       NA             NA       NA
   ---                                                                 
22418: Vulcanisaeta_moutnovskia_768-28 CP002529 CP002529__3236 49.76744
22419: Vulcanisaeta_moutnovskia_768-28       NA             NA       NA
22420: Vulcanisaeta_moutnovskia_768-28       NA             NA       NA
22421: Vulcanisaeta_moutnovskia_768-28       NA             NA       NA
22422: Vulcanisaeta_moutnovskia_768-28 CP002529 CP002529__2873 55.34884
       read_buoyant_density read_start read_length fragment_GC
    1:             1.701935    1337380         215    33.36678
    2:             1.719712    2594753         215    57.13936
    3:             1.725181    2594258         215    62.77217
    4:                   NA         NA          NA          NA
    5:                   NA         NA          NA          NA
   ---                                                        
22418:             1.708772    1493457         215    43.28138
22419:                   NA         NA          NA          NA
22420:                   NA         NA          NA          NA
22421:                   NA         NA          NA          NA
22422:             1.714242    4337477         215    38.91552
       fragment_buoyant_density fragment_start fragment_length bin_min bin_max
    1:                 1.692699        1337332           10963   1.701   1.702
    2:                 1.715997        2591963           12242   1.715   1.716
    3:                 1.721517        2592048            8910   1.725   1.726
    4:                       NA             NA              NA   1.759   1.760
    5:                       NA             NA              NA   1.768   1.769
   ---                                                                        
22418:                 1.702416        1488477            6095   1.708   1.709
22419:                       NA             NA              NA   1.766   1.767
22420:                       NA             NA              NA   1.672   1.673
22421:                       NA             NA              NA   1.748   1.749
22422:                 1.698137        4336467            6049   1.714   1.715
       read_count frag_count   median median_rank                    file
    1:        304          2 1.693479          19   skewnorm_m15k_r4k-20k
    2:          2          1 1.693479          19   skewnorm_m15k_r4k-20k
    3:          3          0 1.693479          19   skewnorm_m15k_r4k-20k
    4:          0          0 1.693479          19   skewnorm_m15k_r4k-20k
    5:          0          0 1.693479          19   skewnorm_m15k_r4k-20k
   ---                                                                   
22418:        350         78 1.701386          37 skewnorm_m10k_r4k-11.5k
22419:          0          0 1.701386          37 skewnorm_m10k_r4k-11.5k
22420:          0          0 1.701386          37 skewnorm_m10k_r4k-11.5k
22421:          0          0 1.701386          37 skewnorm_m10k_r4k-11.5k
22422:         62          3 1.701386          37 skewnorm_m10k_r4k-11.5k
       genome_GC genome_GC_rank               data
    1: 0.3358832             17 shotgun\nfragments
    2: 0.3358832             17 shotgun\nfragments
    3: 0.3358832             17 shotgun\nfragments
    4: 0.3358832             17 shotgun\nfragments
    5: 0.3358832             17 shotgun\nfragments
   ---                                            
22418: 0.4169213             37 shotgun\nfragments
22419: 0.4169213             37 shotgun\nfragments
22420: 0.4169213             37 shotgun\nfragments
22421: 0.4169213             37 shotgun\nfragments
22422: 0.4169213             37 shotgun\nfragments

In [48]:
%%R
# editing $file names
tbl.bin.mg[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl.bin.mg[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]


                                genome scaffold           read  read_GC
    1:         Acidianus_hospitalis_W1 CP002535 CP002535__4287 42.79070
    2:         Acidianus_hospitalis_W1 CP002535 CP002535__6816 60.93023
    3:         Acidianus_hospitalis_W1 CP002535 CP002535__2171 66.51163
    4:         Acidianus_hospitalis_W1       NA             NA       NA
    5:         Acidianus_hospitalis_W1       NA             NA       NA
   ---                                                                 
22418: Vulcanisaeta_moutnovskia_768-28 CP002529 CP002529__3236 49.76744
22419: Vulcanisaeta_moutnovskia_768-28       NA             NA       NA
22420: Vulcanisaeta_moutnovskia_768-28       NA             NA       NA
22421: Vulcanisaeta_moutnovskia_768-28       NA             NA       NA
22422: Vulcanisaeta_moutnovskia_768-28 CP002529 CP002529__2873 55.34884
       read_buoyant_density read_start read_length fragment_GC
    1:             1.701935    1337380         215    33.36678
    2:             1.719712    2594753         215    57.13936
    3:             1.725181    2594258         215    62.77217
    4:                   NA         NA          NA          NA
    5:                   NA         NA          NA          NA
   ---                                                        
22418:             1.708772    1493457         215    43.28138
22419:                   NA         NA          NA          NA
22420:                   NA         NA          NA          NA
22421:                   NA         NA          NA          NA
22422:             1.714242    4337477         215    38.91552
       fragment_buoyant_density fragment_start fragment_length bin_min bin_max
    1:                 1.692699        1337332           10963   1.701   1.702
    2:                 1.715997        2591963           12242   1.715   1.716
    3:                 1.721517        2592048            8910   1.725   1.726
    4:                       NA             NA              NA   1.759   1.760
    5:                       NA             NA              NA   1.768   1.769
   ---                                                                        
22418:                 1.702416        1488477            6095   1.708   1.709
22419:                       NA             NA              NA   1.766   1.767
22420:                       NA             NA              NA   1.672   1.673
22421:                       NA             NA              NA   1.748   1.749
22422:                 1.698137        4336467            6049   1.714   1.715
       read_count frag_count   median median_rank
    1:        304          2 1.693479          19
    2:          2          1 1.693479          19
    3:          3          0 1.693479          19
    4:          0          0 1.693479          19
    5:          0          0 1.693479          19
   ---                                           
22418:        350         78 1.701386          37
22419:          0          0 1.701386          37
22420:          0          0 1.701386          37
22421:          0          0 1.701386          37
22422:         62          3 1.701386          37
                                      file genome_GC genome_GC_rank
    1: Fragment size\ndistribution: 12.5kb 0.3358832             17
    2: Fragment size\ndistribution: 12.5kb 0.3358832             17
    3: Fragment size\ndistribution: 12.5kb 0.3358832             17
    4: Fragment size\ndistribution: 12.5kb 0.3358832             17
    5: Fragment size\ndistribution: 12.5kb 0.3358832             17
   ---                                                             
22418:    Fragment size\ndistribution: 6kb 0.4169213             37
22419:    Fragment size\ndistribution: 6kb 0.4169213             37
22420:    Fragment size\ndistribution: 6kb 0.4169213             37
22421:    Fragment size\ndistribution: 6kb 0.4169213             37
22422:    Fragment size\ndistribution: 6kb 0.4169213             37
                     data
    1: shotgun\nfragments
    2: shotgun\nfragments
    3: shotgun\nfragments
    4: shotgun\nfragments
    5: shotgun\nfragments
   ---                   
22418: shotgun\nfragments
22419: shotgun\nfragments
22420: shotgun\nfragments
22421: shotgun\nfragments
22422: shotgun\nfragments

In [53]:
%%R
# parsing out columns & rows needed for heatmap plotting
tbl.bin.p = subset(tbl.bin.mg, file=="Fragment size\ndistribution: 12.5kb")
tbl.bin.p = tbl.bin.p[, c('data', 'genome', 'genome_GC', 'genome_GC_rank', 'bin_min', 'frag_count', 'file'), with=F]
tbl.bin.p[, file := "Fragment size distribution: 12.5kb"]

## editing data column
tbl.bin.p[data == 'amplicon\nfragments', data:='amplicon-fragments']
tbl.bin.p[data == 'shotgun\nfragments', data:='shotgun-fragments']

# BD of each genome
tbl.bin.p[, genome_BD := genome_GC * 0.098 + 1.66]


                    data                          genome genome_GC
    1: shotgun-fragments         Acidianus_hospitalis_W1 0.3358832
    2: shotgun-fragments         Acidianus_hospitalis_W1 0.3358832
    3: shotgun-fragments         Acidianus_hospitalis_W1 0.3358832
    4: shotgun-fragments         Acidianus_hospitalis_W1 0.3358832
    5: shotgun-fragments         Acidianus_hospitalis_W1 0.3358832
   ---                                                            
11207: shotgun-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
11208: shotgun-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
11209: shotgun-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
11210: shotgun-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
11211: shotgun-fragments Vulcanisaeta_moutnovskia_768-28 0.4169213
       genome_GC_rank bin_min frag_count                               file
    1:             17   1.701          2 Fragment size distribution: 12.5kb
    2:             17   1.715          1 Fragment size distribution: 12.5kb
    3:             17   1.725          0 Fragment size distribution: 12.5kb
    4:             17   1.759          0 Fragment size distribution: 12.5kb
    5:             17   1.768          0 Fragment size distribution: 12.5kb
   ---                                                                     
11207:             37   1.708         73 Fragment size distribution: 12.5kb
11208:             37   1.766          0 Fragment size distribution: 12.5kb
11209:             37   1.672          0 Fragment size distribution: 12.5kb
11210:             37   1.748          0 Fragment size distribution: 12.5kb
11211:             37   1.714          2 Fragment size distribution: 12.5kb
       genome_BD
    1:  1.692917
    2:  1.692917
    3:  1.692917
    4:  1.692917
    5:  1.692917
   ---          
11207:  1.700858
11208:  1.700858
11209:  1.700858
11210:  1.700858
11211:  1.700858

In [54]:
%%R -h 700 -w 1100
# making heatmaps of buoyant density distributions
make_density_heatmap <- function(tbl, x.axis=TRUE){

    ## heatmap colors: black only for lowest value
    cols = c(
    rep("#000000", 1),
    rep("darkblue", 99),
    rep("#1F1FFF", 100),
    rep("#3F3FFF", 100),
    rep("#5F5FFF", 100),
    rep("#7F7FFF", 100),
    rep("#9F9FFF", 100),
    rep("#BFBFFF", 100),
    rep("#DFDFFF", 100),
    rep("#FFFFFF", 100)
    )

    # plot title
    plot.title = unique(tbl$file)
    
    ## heatmap plotting
    p.heat = ggplot(tbl, aes(genome_GC_rank, bin_min, fill=frag_count)) +
        geom_tile() +
        geom_point(aes(genome_GC_rank, genome_BD, color='red')) +
        scale_alpha_continuous( guide=FALSE ) +
        scale_fill_gradientn( colours=cols, name='Count' ) +
        #scale_x_continuous( expand = c(0,0) ) +
        scale_y_reverse( expand = c(0,0), limits=c(1.77,1.66) ) +
        labs( x='Genome (ordered by genome G+C content)', y='Fragment buoyant density') +
        theme(
            text = element_text(size=18),
            axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
            #axis.ticks.x = element_blank(),
            axis.title.y = element_blank(),
            legend.justification=c(0,0),
            legend.position=c(0,0),
            legend.text = element_text(size=14),
            plot.margin = unit(c(0,0,0,1), "lines") # top, right, bottom, left
            )        
    
    # removing x axis title if needed
    if(x.axis==FALSE){  # no x-axis; top plot
        p.heat = p.heat + theme( axis.title.x = element_blank() )
        p.heat = p.heat + theme( plot.margin = unit(c(1,0,0,1), "lines") ) # top, right, bottom, left
        p.heat = p.heat + labs( title = plot.title )
        p.heat = p.heat + theme( title = element_text( vjust=1, hjust=0.75, size=16) )
    } else {
        p.heat = p.heat + theme( plot.margin = unit(c(0,0,1,1), "lines") ) # top, right, bottom, left
    }
    
    return(p.heat)
}


### amplicon
p.heat.mg.12.5 = make_density_heatmap( tbl.bin.p[tbl.bin.p$data=='shotgun-fragments', ], x.axis=FALSE )
print(p.heat.mg.12.5)


Notes:

  • The ordering appears to be correct. The amplicon fragments just vary considerably from their genome G+C content.
  • The variability in amplicon-fragment distributions for balanced G+C genomes appears to be caused by thermophilic crenarchaea having a higher rrn G+C than methanogens, although both groups have approx. the same genome G+C.

In [ ]: