Description:

  • Making boxplots show BD range encompassing 95% of reads
    • Plotting for bacteria & archaea
    • both fragment size distributions
    • amplicon- & shotgun-fragments

In [1]:
%load_ext rmagic
%load_ext rpy2.ipython


/opt/anaconda/lib/python2.7/site-packages/pytz/__init__.py:29: UserWarning: Module argparse was already imported from /opt/anaconda/lib/python2.7/argparse.pyc, but /opt/anaconda/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream

In [2]:
%%R
require(ggplot2)
require(reshape)
require(data.table)
require(gridExtra)


Loading required package: ggplot2
Loading required package: reshape
Loading required package: data.table
data.table 1.9.2  For help type: help("data.table")
Loading required package: gridExtra
Loading required package: grid

Loading bacteria data

Loading amplicon data


In [3]:
%%R
# loading amplicon data 
base.dir = "~/notebook/deltaGC_notes/data/amplicon/140411_allBac/amp454-L215-N1k";
files = c(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )

tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }


## merging lists
tbl.amp = do.call(rbind, tbl.l)
tbl.amp$data = rep('amplicon\nfragments', nrow(tbl.amp))
tbl.l = list()


Read 52.2% of 1091000 rows
Read 78.8% of 1091000 rows
Read 1091000 rows and 11 (of 11) columns from 0.151 GB file in 00:00:04

Read 96.2% of 1091000 rows
Read 1091000 rows and 11 (of 11) columns from 0.150 GB file in 00:00:03

Loading shotgun data


In [4]:
%%R
# loading shotgun read data
base.dir = "~/notebook/deltaGC_notes/data/mg-reads/140411_allBac/mg454-L215-N10k";
files = list(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )

tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }


## merging lists
tbl.mg = do.call(rbind, tbl.l)
tbl.mg$data = rep('shotgun\nfragments', nrow(tbl.mg))
tbl.l = list()


Read 0.0% of 10910000 rows
Read 6.0% of 10910000 rows
Read 9.1% of 10910000 rows
Read 11.5% of 10910000 rows
Read 14.2% of 10910000 rows
Read 16.8% of 10910000 rows
Read 19.2% of 10910000 rows
Read 21.1% of 10910000 rows
Read 23.1% of 10910000 rows
Read 24.7% of 10910000 rows
Read 27.0% of 10910000 rows
Read 29.6% of 10910000 rows
Read 32.3% of 10910000 rows
Read 35.0% of 10910000 rows
Read 37.9% of 10910000 rows
Read 41.0% of 10910000 rows
Read 44.2% of 10910000 rows
Read 47.6% of 10910000 rows
Read 51.1% of 10910000 rows
Read 54.9% of 10910000 rows
Read 58.8% of 10910000 rows
Read 62.9% of 10910000 rows
Read 67.2% of 10910000 rows
Read 71.7% of 10910000 rows
Read 76.4% of 10910000 rows
Read 80.1% of 10910000 rows
Read 81.5% of 10910000 rows
Read 86.7% of 10910000 rows
Read 92.2% of 10910000 rows
Read 98.0% of 10910000 rows
Read 10910000 rows and 11 (of 11) columns from 1.523 GB file in 00:00:51

Read 0.0% of 10910000 rows
Read 6.9% of 10910000 rows
Read 13.7% of 10910000 rows
Read 20.7% of 10910000 rows
Read 27.7% of 10910000 rows
Read 34.6% of 10910000 rows
Read 41.4% of 10910000 rows
Read 48.2% of 10910000 rows
Read 54.9% of 10910000 rows
Read 61.6% of 10910000 rows
Read 68.2% of 10910000 rows
Read 74.7% of 10910000 rows
Read 81.6% of 10910000 rows
Read 88.6% of 10910000 rows
Read 95.9% of 10910000 rows
Read 10910000 rows and 11 (of 11) columns from 1.512 GB file in 00:00:20

In [5]:
%%R
## merging amp & mg tables
tbl.bac = rbind(tbl.amp, tbl.mg)
tbl.amp = vector()  # clearing memory
tbl.mg = vector()

In [6]:
%%R
# editing $file names
tbl.bac[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl.bac[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]


                                 genome scaffold           read  read_GC
       1:  Agrobacterium_fabrum_str_C58 AE007869  AE007869__499 52.55814
       2:  Agrobacterium_fabrum_str_C58 AE007869  AE007869__498 52.55814
       3:  Agrobacterium_fabrum_str_C58 AE007869  AE007869__495 52.55814
       4:  Agrobacterium_fabrum_str_C58 AE007869  AE007869__497 52.55814
       5:  Agrobacterium_fabrum_str_C58 AE007869  AE007869__496 52.55814
      ---                                                               
24001996: Salmonella_bongori_NCTC_12419 FR877557 FR877557__2790 57.20930
24001997: Salmonella_bongori_NCTC_12419 FR877557 FR877557__2797 29.76744
24001998: Salmonella_bongori_NCTC_12419 FR877557 FR877557__2796 44.18605
24001999: Salmonella_bongori_NCTC_12419 FR877557 FR877557__2795 50.23256
24002000: Salmonella_bongori_NCTC_12419 FR877557 FR877557__2794 53.95349
          read_buoyant_density read_start read_length fragment_GC
       1:             1.711507     114437         215    53.02019
       2:             1.711507     114437         215    55.41002
       3:             1.711507    5035543         215    52.92066
       4:             1.711507    5035543         215    53.45668
       5:             1.711507     114437         215    56.14006
      ---                                                        
24001996:             1.716065    1863392         215    53.96161
24001997:             1.689172    8744011         215    44.52086
24001998:             1.703302    8668917         215    46.68961
24001999:             1.709228    6849897         215    54.76029
24002000:             1.712874    3975166         215    45.70962
          fragment_buoyant_density fragment_start fragment_length
       1:                 1.711960         114273           12284
       2:                 1.714302         107900           14048
       3:                 1.711862        5024735           14911
       4:                 1.712388        5030946           10053
       5:                 1.715017         107683           12109
      ---                                                        
24001996:                 1.712882        1861101            5730
24001997:                 1.703630        8742874            6543
24001998:                 1.705756        8667460            6102
24001999:                 1.713665        6849320            5882
24002000:                 1.704795        3972279            6130
                                         file                data
       1: Fragment size\ndistribution: 12.5kb amplicon\nfragments
       2: Fragment size\ndistribution: 12.5kb amplicon\nfragments
       3: Fragment size\ndistribution: 12.5kb amplicon\nfragments
       4: Fragment size\ndistribution: 12.5kb amplicon\nfragments
       5: Fragment size\ndistribution: 12.5kb amplicon\nfragments
      ---                                                        
24001996:    Fragment size\ndistribution: 6kb  shotgun\nfragments
24001997:    Fragment size\ndistribution: 6kb  shotgun\nfragments
24001998:    Fragment size\ndistribution: 6kb  shotgun\nfragments
24001999:    Fragment size\ndistribution: 6kb  shotgun\nfragments
24002000:    Fragment size\ndistribution: 6kb  shotgun\nfragments

Loading archaea data

Loading amplicon data


In [7]:
%%R
base.dir = "~/notebook/deltaGC_notes/data/amplicon/140416_allArc/amp454-L215-N1k";
files = c(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )

tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }

## merging lists
tbl.amp = do.call(rbind, tbl.l)
tbl.amp$data = rep('amplicon\nfragments', nrow(tbl.amp))
tbl.l = list()

Loading shotgun read data


In [8]:
%%R
base.dir = "~/notebook/deltaGC_notes/data/mg-reads/140417_allArc/mg454-L215-N10k";
files = list(
    'skewnorm_m15k_r4k-20k',
    'skewnorm_m10k_r4k-11.5k'
    )

tbl.l = list()
for(i in files){
    in.file = paste(c(i, "txt"), collapse=".")
    in.file = paste(c(base.dir, in.file), collapse="/")
    tbl.l[[i]] = fread(in.file, header=T, sep="\t")
    tbl.l[[i]]$file = rep(i, nrow(tbl.l[[i]]))
    }


## merging lists
tbl.mg = do.call(rbind, tbl.l)
tbl.mg$data = rep('shotgun\nfragments', nrow(tbl.mg))
tbl.l = list()

In [9]:
%%R
## merging amp & mg tables
tbl.arc = rbind(tbl.amp, tbl.mg)
tbl.amp = vector()
tbl.mg = vector()

In [10]:
%%R
# editing $file names
tbl.arc[, file:=gsub("skewnorm_m15k_r4k-20k", "Fragment size\ndistribution: 12.5kb", file)]
tbl.arc[, file:=gsub("skewnorm_m10k_r4k-11.5k", "Fragment size\ndistribution: 6kb", file)]


                                    genome scaffold           read  read_GC
      1: Haloarcula_marismortui_ATCC_43049 AY596297   AY596297__33 54.41860
      2: Haloarcula_marismortui_ATCC_43049 AY596297  AY596297__791 63.72093
      3: Haloarcula_marismortui_ATCC_43049 AY596297   AY596297__37 63.72093
      4: Haloarcula_marismortui_ATCC_43049 AY596297   AY596297__36 63.72093
      5: Haloarcula_marismortui_ATCC_43049 AY596297  AY596297__797 54.41860
     ---                                                                   
2221996:         Haloquadratum_walsbyi_C23 FR746099 FR746099__8237 44.65116
2221997:         Haloquadratum_walsbyi_C23 FR746099 FR746099__8230 49.76744
2221998:         Haloquadratum_walsbyi_C23 FR746099 FR746099__8231 43.25581
2221999:         Haloquadratum_walsbyi_C23 FR746099 FR746099__8232 42.32558
2222000:         Haloquadratum_walsbyi_C23 FR746099 FR746099__8233 40.46512
         read_buoyant_density read_start read_length fragment_GC
      1:             1.713330    2215176         215    57.38426
      2:             1.722447    5265667         215    57.17372
      3:             1.722447    5265667         215    57.25134
      4:             1.722447    5265667         215    56.66473
      5:             1.713330    2215176         215    55.90927
     ---                                                        
2221996:             1.703758    1283771         215    45.85610
2221997:             1.708772    1506868         215    52.50473
2221998:             1.702391    3992795         215    45.10173
2221999:             1.701479     323240         215    47.09707
2222000:             1.699656    5308016         215    46.21100
         fragment_buoyant_density fragment_start fragment_length
      1:                 1.716237        2203273           12960
      2:                 1.716030        5261552           11110
      3:                 1.716106        5261559           10798
      4:                 1.715531        5264670           13774
      5:                 1.714791        2211480           12565
     ---                                                        
2221996:                 1.704939        1277976            6588
2221997:                 1.711455        1506629            5809
2221998:                 1.704200        3987706            5308
2221999:                 1.706155         318123            6459
2222000:                 1.705287        5305459            5384
                                        file                data
      1: Fragment size\ndistribution: 12.5kb amplicon\nfragments
      2: Fragment size\ndistribution: 12.5kb amplicon\nfragments
      3: Fragment size\ndistribution: 12.5kb amplicon\nfragments
      4: Fragment size\ndistribution: 12.5kb amplicon\nfragments
      5: Fragment size\ndistribution: 12.5kb amplicon\nfragments
     ---                                                        
2221996:    Fragment size\ndistribution: 6kb  shotgun\nfragments
2221997:    Fragment size\ndistribution: 6kb  shotgun\nfragments
2221998:    Fragment size\ndistribution: 6kb  shotgun\nfragments
2221999:    Fragment size\ndistribution: 6kb  shotgun\nfragments
2222000:    Fragment size\ndistribution: 6kb  shotgun\nfragments

Merging bacterial & archaeal tables


In [11]:
%%R
# merging tables
tbl.bac[, domain:= 'Bacteria']
tbl.arc[, domain:= 'Archaea']
tbl = rbind(tbl.bac, tbl.arc)
tbl$domain = factor(tbl$domain, levels=c('Bacteria', 'Archaea'))

Calculating confidence intervals


In [12]:
%%R
# t-based CI
t.ci.range = function(x){
# 0.95 CI
# x = vector    
    x = as.vector(x)
    m = mean(x)
    s = sd(x)
    n = length(x)
    
    error <- qt(0.975,df=n-1)*s/sqrt(n)
    ci.range = error * 2
    return(ci.range)
}
tbl[, t.ci.range := t.ci.range(fragment_buoyant_density), by='file,data,genome']

tbl[,frag_variance := var(fragment_buoyant_density), by='file,data,genome']
tbl[,frag_stdev := sd(fragment_buoyant_density), by='file,data,genome']


print(summary(unique(tbl$t.ci.range)))
print(summary(unique(tbl$frag_variance)))
print(summary(unique(tbl$frag_stdev)))


     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
3.039e-05 1.017e-04 1.400e-04 1.998e-04 2.623e-04 9.366e-04 
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
5.997e-08 3.556e-06 6.471e-06 7.990e-06 1.029e-05 7.358e-05 
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0002449 0.0018860 0.0025440 0.0026000 0.0032080 0.0085780 

In [16]:
%%R -w 7 -h 6 -u in
# plotting range of values
tbl[, range := (max(fragment_buoyant_density) - min(fragment_buoyant_density)), by='file,data,genome']

# dereplicating by file,data,genome,range
tbl.s = tbl[, sum(1), by='domain,file,data,genome,range']

# plotting 
y.axis.t = expression(atop('Intra-genomic fragment buoyant', paste('density range (g ', ml^-1, ')')))
p = ggplot( tbl.s, aes(data, range) ) +
        geom_boxplot() +
        labs(x='fragment simulation', y=y.axis.t) +
        facet_grid(domain ~ file) +        
        theme( 
            text = element_text(size=18),
            axis.title.x = element_blank(),
            axis.text.x = element_text(color='black'),
            axis.text.y = element_text(color='black')            
            )
ggsave("/home/nick/notebook/deltaGC_notes/data/amp-mg/figures/BD_min-max-range_bac-arc_boxplot.tiff",
          width=7, height=6, units='in', dpi=300)
print(p)



In [50]:
%%R
# min-max ranges
tbl.min = tbl.s[, min(range), by='domain,file,data']
print(tbl.min)
tbl.max = tbl.s[, max(range), by='domain,file,data']
print(tbl.max)


     domain                                file                data          V1
1: Bacteria Fragment size\ndistribution: 12.5kb amplicon\nfragments 0.001927566
2: Bacteria    Fragment size\ndistribution: 6kb amplicon\nfragments 0.001481565
3: Bacteria Fragment size\ndistribution: 12.5kb  shotgun\nfragments 0.007203698
4: Bacteria    Fragment size\ndistribution: 6kb  shotgun\nfragments 0.009448307
5:  Archaea Fragment size\ndistribution: 12.5kb amplicon\nfragments 0.003496030
6:  Archaea    Fragment size\ndistribution: 6kb amplicon\nfragments 0.001884219
7:  Archaea Fragment size\ndistribution: 12.5kb  shotgun\nfragments 0.016577551
8:  Archaea    Fragment size\ndistribution: 6kb  shotgun\nfragments 0.020041700
     domain                                file                data         V1
1: Bacteria Fragment size\ndistribution: 12.5kb amplicon\nfragments 0.02843551
2: Bacteria    Fragment size\ndistribution: 6kb amplicon\nfragments 0.02263619
3: Bacteria Fragment size\ndistribution: 12.5kb  shotgun\nfragments 0.04146382
4: Bacteria    Fragment size\ndistribution: 6kb  shotgun\nfragments 0.04488895
5:  Archaea Fragment size\ndistribution: 12.5kb amplicon\nfragments 0.02719587
6:  Archaea    Fragment size\ndistribution: 6kb amplicon\nfragments 0.02304143
7:  Archaea Fragment size\ndistribution: 12.5kb  shotgun\nfragments 0.04106626
8:  Archaea    Fragment size\ndistribution: 6kb  shotgun\nfragments 0.04376268

t-test: BD variance > for shorter fragment sizes?


In [90]:
%%R
tmp = t.test(range ~ file, data=subset(tbl.s, data=='amplicon\nfragments' & domain=='Bacteria'), alternative='greater')

print(tmp)
print(names(tmp))


	Welch Two Sample t-test

data:  range by file
t = 10.7926, df = 2123.143, p-value < 2.2e-16
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 0.001599368         Inf
sample estimates:
mean in group Fragment size\ndistribution: 12.5kb 
                                      0.010175775 
   mean in group Fragment size\ndistribution: 6kb 
                                      0.008288676 

[1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
[6] "null.value"  "alternative" "method"      "data.name"  

In [93]:
%%R
require(effsize)

# t-test on just amplicon fragments
print('#---amplicon-bacteria---#')
print(
    t.test(range ~ file, data=subset(tbl.s, data=='amplicon\nfragments' & domain=='Bacteria'), alternative='greater')
    )
print(
    cohen.d(range ~ file, data=subset(tbl.s, data=='amplicon\nfragments' & domain=='Bacteria') )
    )

print('#---amplicon-archaea---#')
print(
    t.test(range ~ file, data=subset(tbl.s, data=='amplicon\nfragments' & domain=='Archaea'), alternative='greater')
    )
print(
    cohen.d(range ~ file, data=subset(tbl.s, data=='amplicon\nfragments' & domain=='Archaea') )
    )



# t-test on just shotgun fragments
print('#---shotgun-bacteria---#')
print(
    t.test(range ~ file, data=subset(tbl.s, data=='shotgun\nfragments' & domain=='Bacteria'), alternative='less')
    )
print(
    cohen.d(range ~ file, data=subset(tbl.s, data=='shotgun\nfragments' & domain=='Bacteria') )
    )


print('#---shotgun-archaea---#')
print(
    t.test(range ~ file, data=subset(tbl.s, data=='shotgun\nfragments' & domain=='Archaea'), alternative='less')
    )
print(
    cohen.d(range ~ file, data=subset(tbl.s, data=='shotgun\nfragments' & domain=='Archaea') )
    )


[1] "#---amplicon-bacteria---#"

	Welch Two Sample t-test

data:  range by file
t = 10.7926, df = 2123.143, p-value < 2.2e-16
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 0.001599368         Inf
sample estimates:
mean in group Fragment size\ndistribution: 12.5kb 
                                      0.010175775 
   mean in group Fragment size\ndistribution: 6kb 
                                      0.008288676 


Cohen's d

d estimate: 0.4620912 (small)
95 percent confidence interval:
      inf       sup 
0.3769742 0.5472081 
[1] "#---amplicon-archaea---#"

	Welch Two Sample t-test

data:  range by file
t = 2.7073, df = 196.389, p-value = 0.00369
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 0.000969188         Inf
sample estimates:
mean in group Fragment size\ndistribution: 12.5kb 
                                      0.012214231 
   mean in group Fragment size\ndistribution: 6kb 
                                      0.009726363 


Cohen's d

d estimate: 0.3809748 (small)
95 percent confidence interval:
      inf       sup 
0.0995631 0.6623864 
[1] "#---shotgun-bacteria---#"

	Welch Two Sample t-test

data:  range by file
t = -16.5409, df = 2179.723, p-value < 2.2e-16
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
         -Inf -0.003521847
sample estimates:
mean in group Fragment size\ndistribution: 12.5kb 
                                       0.02332746 
   mean in group Fragment size\ndistribution: 6kb 
                                       0.02723838 


Cohen's d

d estimate: -0.7082104 (medium)
95 percent confidence interval:
       inf        sup 
-0.7948082 -0.6216125 
[1] "#---shotgun-archaea---#"

	Welch Two Sample t-test

data:  range by file
t = -6.2815, df = 199.928, p-value = 1.027e-09
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
         -Inf -0.003163461
sample estimates:
mean in group Fragment size\ndistribution: 12.5kb 
                                       0.02758458 
   mean in group Fragment size\ndistribution: 6kb 
                                       0.03187737 


Cohen's d

d estimate: -0.8839299 (large)
95 percent confidence interval:
       inf        sup 
-1.1762289 -0.5916308 

In [ ]: