Goal

  • Analyze results from atomIncorp_taxaIncorp simulation
    • BD shift estimations

Var


In [1]:
workDir = '/ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/data/bac_genome1147/atomIncorp_taxaIncorp/'

Init


In [2]:
library(dplyr)
library(tidyr)
library(ggplot2)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


In [3]:
# classifying true positives, neg, ...
clsfy = function(guess,known){
    if(is.na(guess) | is.na(known)){
        return(NA)
        }
    if(guess == TRUE){
        if(guess == known){
            return('True positive')
            } else {
            return('False positive')
            }
        } else
    if(guess == FALSE){
        if(guess == known){
            return('True negative')
            } else {
            return('False negative')
            }
        } else {
        stop('Error: true or false needed')
        }
    }

Load

True BD shift


In [4]:
BDshift_files = list.files(path=workDir, pattern='BD-shift_stats.txt', full.names=TRUE, recursive=TRUE)
BDshift_files %>% length %>% print


[1] 300

In [5]:
df_shift = list()
for(F in BDshift_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    FFl = length(FF)
    tmp$percIncorp = FF[FFl-3]
    tmp$percTaxa = FF[FFl-2]
    tmp$rep = FF[FFl-1]
    tmp$file = FF[FFl]
    df_shift[[F]] = tmp
}

df_shift = do.call(rbind, df_shift)
rownames(df_shift) = 1:nrow(df_shift)

df_shift = df_shift %>%
    filter(library %in% c(2,4,6)) %>%
    group_by(taxon, percIncorp, percTaxa, rep) %>%
    summarize(median = median(median)) %>%
    ungroup() %>%
    rename('median_true_BD_shift' = median) 

# status
df_shift %>% nrow %>% print
df_shift %>% head(n=3)


[1] 330600
taxonpercIncorppercTaxarepmedian_true_BD_shift
1Acaryochloris_marina_MBIC110170 1 1 0
2Acaryochloris_marina_MBIC110170 1 10 0
3Acaryochloris_marina_MBIC110170 1 2 0

MW-HR-SIP incorp calls

  • filtering BD shift estimates to just incorporators identified by MW-HR-SIP

In [6]:
incorp_files = list.files(path=workDir, pattern='OTU_abs1e9_PCR_sub_filt_MW_DESeq2_incorp.txt', full.names=TRUE, recursive=TRUE)
incorp_files %>% length %>% print


[1] 300

In [7]:
df_incorp = list()
for(F in incorp_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    FFl = length(FF)
    tmp$percIncorp = FF[FFl-3]
    tmp$percTaxa = FF[FFl-2]
    tmp$rep = FF[FFl-1]
    tmp$file = FF[FFl]
    df_incorp[[F]] = tmp
}

df_incorp = do.call(rbind, df_incorp)
rownames(df_incorp) = 1:nrow(df_incorp)
df_incorp %>% head(n=3) %>% print


     baseMean log2FoldChange      lfcSE        stat    pvalue      padj
1  0.88410595    0.111588469 0.24466482  0.45608710 0.6483273 0.9991016
2  0.01655482    0.002347091 0.09279511  0.02529326 0.9798210 0.9991016
3 74.32476439   -0.062207250 0.26568859 -0.23413595 0.8148794 0.9991016
          p occur_all occur_heavy heavy_BD_min heavy_BD_max
1 0.7142076         0           0          1.7         1.73
2 0.9961940         0           0          1.7         1.73
3 0.8800201         0           0          1.7         1.73
                              taxon incorp percIncorp percTaxa rep
1    Acetobacterium_woodii_DSM_1030  FALSE          0        1   1
2 Acetohalobium_arabaticum_DSM_5501  FALSE          0        1   1
3      Acholeplasma_laidlawii_PG-8A  FALSE          0        1   1
                                          file
1 OTU_abs1e9_PCR_sub_filt_MW_DESeq2_incorp.txt
2 OTU_abs1e9_PCR_sub_filt_MW_DESeq2_incorp.txt
3 OTU_abs1e9_PCR_sub_filt_MW_DESeq2_incorp.txt

In [8]:
# just incorporators
df_incorp = df_incorp %>%
    filter(incorp == TRUE) %>%
    dplyr::distinct(taxon, incorp, percIncorp, percTaxa, rep) %>%
    rename('HWHRSIP_incorp' = incorp)

df_incorp %>% nrow %>% print
df_incorp %>% head(n=3) %>% print


[1] 29879
                               taxon HWHRSIP_incorp percIncorp percTaxa rep
1 Bifidobacterium_pseudolongum_PV8-2           TRUE          0        1   6
2     Alkaliphilus_oremlandii_OhILAs           TRUE          0        1   7
3    Paracoccus_denitrificans_PD1222           TRUE          0        1   7

q-SIP


In [ ]:
atomX_files = list.files(path=workDir, pattern='*_qSIP_atom.txt', full.names=TRUE, recursive=TRUE)
atomX_files %>% length %>% print

In [10]:
df_atomX = list()
for(F in atomX_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    FFl = length(FF)
    tmp$percIncorp = FF[FFl-3]
    tmp$percTaxa = FF[FFl-2]
    tmp$rep = FF[FFl-1]
    tmp$file = FF[FFl]
    df_atomX[[F]] = tmp
}

df_atomX = do.call(rbind, df_atomX)
rownames(df_atomX) = 1:nrow(df_atomX)
df_atomX %>% head(n=3) %>% print


                                 taxon  control treatment       BD_diff
1       Acaryochloris_marina_MBIC11017 1.711015        NA            NA
2 Acetobacter_pasteurianus_IFO_3283-03 1.712271  1.711311 -0.0009600536
3       Acetobacterium_woodii_DSM_1030 1.705148  1.703813 -0.0013348264
  control_GC control_MW treatment_max_MW treatment_MW atom_fraction_excess
1  0.7778801   308.0768         317.6634           NA                   NA
2  0.7929281   308.0843         317.6634     307.9116          -0.01783259
3  0.7076277   308.0420         317.6636     307.8008          -0.02478392
  atom_CI_low atom_CI_high percIncorp percTaxa rep
1          NA           NA          0        1   1
2 -0.05099068   0.01526571          0        1   1
3 -0.08805054   0.03707605          0        1   1
                              file
1 OTU_abs1e9_PCR_sub_qSIP_atom.txt
2 OTU_abs1e9_PCR_sub_qSIP_atom.txt
3 OTU_abs1e9_PCR_sub_qSIP_atom.txt

Joining estimate with true values


In [53]:
# table join
df_atomX %>% nrow %>% print

df_atomX_j = left_join(df_atomX, df_shift, c('taxon' = 'taxon',
                                       'percIncorp'='percIncorp',
                                       'percTaxa'='percTaxa',
                                       'rep'='rep')) %>%
   filter(!is.na(BD_diff)) %>%
   mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
          true_atom_fraction_excess = median_true_BD_shift / 0.036,
          atom_fraction_excess = ifelse(is.na(atom_CI_low), 0, atom_fraction_excess))

df_atomX_j %>% nrow %>% print
df_atomX_j %>% head(n=3) %>% print


[1] 344100
Warning message in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y):
“joining factors with different levels, coercing to character vector”
[1] 326401
                                 taxon  control treatment       BD_diff
1 Acetobacter_pasteurianus_IFO_3283-03 1.712271  1.711311 -0.0009600536
2       Acetobacterium_woodii_DSM_1030 1.705148  1.703813 -0.0013348264
3    Acetohalobium_arabaticum_DSM_5501 1.691176  1.694380  0.0032041590
  control_GC control_MW treatment_max_MW treatment_MW atom_fraction_excess
1  0.7929281   308.0843         317.6634     307.9116          -0.01783259
2  0.7076277   308.0420         317.6636     307.8008          -0.02478392
3  0.5403036   307.9590         317.6641     308.5425           0.05945183
  atom_CI_low atom_CI_high percIncorp percTaxa rep
1 -0.05099068   0.01526571          0        1   1
2 -0.08805054   0.03707605          0        1   1
3 -0.07397948   0.19925343          0        1   1
                              file median_true_BD_shift true_incorporator
1 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
2 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
3 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
  true_atom_fraction_excess
1                         0
2                         0
3                         0

Joining with MW-HR-SIP


In [54]:
df_atomX_j = left_join(df_atomX_j, df_incorp, c('taxon' = 'taxon',
                                    'percIncorp'='percIncorp',
                                    'percTaxa'='percTaxa',
                                    'rep'='rep')) %>%
    mutate(HWHRSIP_incorp = ifelse(is.na(HWHRSIP_incorp), FALSE, TRUE))

df_atomX_j %>% nrow %>% print
df_atomX_j %>% head(n=3) %>% print


Warning message in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y):
“joining factor and character vector, coercing into character vector”
[1] 326401
                                 taxon  control treatment       BD_diff
1 Acetobacter_pasteurianus_IFO_3283-03 1.712271  1.711311 -0.0009600536
2       Acetobacterium_woodii_DSM_1030 1.705148  1.703813 -0.0013348264
3    Acetohalobium_arabaticum_DSM_5501 1.691176  1.694380  0.0032041590
  control_GC control_MW treatment_max_MW treatment_MW atom_fraction_excess
1  0.7929281   308.0843         317.6634     307.9116          -0.01783259
2  0.7076277   308.0420         317.6636     307.8008          -0.02478392
3  0.5403036   307.9590         317.6641     308.5425           0.05945183
  atom_CI_low atom_CI_high percIncorp percTaxa rep
1 -0.05099068   0.01526571          0        1   1
2 -0.08805054   0.03707605          0        1   1
3 -0.07397948   0.19925343          0        1   1
                              file median_true_BD_shift true_incorporator
1 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
2 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
3 OTU_abs1e9_PCR_sub_qSIP_atom.txt                    0             FALSE
  true_atom_fraction_excess HWHRSIP_incorp
1                         0          FALSE
2                         0          FALSE
3                         0          FALSE

delta BD


In [18]:
dBD_files = list.files(path=workDir, pattern='*_dBD.txt', full.names=TRUE, recursive=TRUE)
dBD_files %>% length %>% print


[1] 300

In [19]:
df_dBD = list()
for(F in dBD_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    FFl = length(FF)
    tmp$percIncorp = FF[FFl-3]
    tmp$percTaxa = FF[FFl-2]
    tmp$rep = FF[FFl-1]
    tmp$file = FF[FFl]
    df_dBD[[F]] = tmp
}

df_dBD = do.call(rbind, df_dBD)
rownames(df_dBD) = 1:nrow(df_dBD)
df_dBD %>% head(n=3) %>% print


                                 taxon mean_CM_control mean_CM_treatment
1       Acaryochloris_marina_MBIC11017        1.720172          1.724404
2 Acetobacter_pasteurianus_IFO_3283-03        1.714859          1.719583
3       Acetobacterium_woodii_DSM_1030        1.710761          1.708133
  stdev_CM_control stdev_CM_treatment     delta_BD percIncorp percTaxa rep
1      0.033328784        0.017888561  0.004231134          0        1   1
2      0.004887423        0.006516244  0.004723806          0        1   1
3      0.006880538        0.002494913 -0.002627739          0        1   1
                        file
1 OTU_abs1e9_PCR_sub_dBD.txt
2 OTU_abs1e9_PCR_sub_dBD.txt
3 OTU_abs1e9_PCR_sub_dBD.txt

Joining estimate with truth


In [51]:
df_dBD %>% dim %>% print

df_dBD_j = inner_join(df_dBD, df_shift, c('taxon' = 'taxon',
                                       'percIncorp'='percIncorp',
                                       'percTaxa'='percTaxa',
                                       'rep'='rep')) %>%
    filter(!is.na(delta_BD)) %>%
    mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
           true_atom_fraction_excess = median_true_BD_shift / 0.036,
           atom_fraction_excess = delta_BD / 0.036)

df_dBD_j %>% dim %>% print
df_dBD_j %>% head(n=3)


[1] 344100     10
Warning message in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y):
“joining factors with different levels, coercing to character vector”
[1] 330600     14
taxonmean_CM_controlmean_CM_treatmentstdev_CM_controlstdev_CM_treatmentdelta_BDpercIncorppercTaxarepfilemedian_true_BD_shifttrue_incorporatortrue_atom_fraction_excessatom_fraction_excess
1Acaryochloris_marina_MBIC110171.72017237501 1.72440350877 0.0333287835795 0.0178885610258 0.00423113375962 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 0.117531493322778
2Acetobacter_pasteurianus_IFO_3283-031.71485933284 1.7195831393 0.00488742265951 0.00651624383562 0.00472380645756 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 0.131216846043333
3Acetobacterium_woodii_DSM_10301.71076084855 1.7081331098 0.00688053751389 0.00249491325664 -0.00262773875329 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 -0.0729927431469444

Joining with MW-HR-SIP


In [52]:
df_dBD_j %>% nrow %>% print

df_dBD_j = left_join(df_dBD_j, df_incorp, c('taxon' = 'taxon',
                                    'percIncorp'='percIncorp',
                                    'percTaxa'='percTaxa',
                                    'rep'='rep')) %>%
    mutate(HWHRSIP_incorp = ifelse(is.na(HWHRSIP_incorp), FALSE, TRUE)) 

df_dBD_j %>% nrow %>% print
df_dBD_j %>% head(n=3)


[1] 330600
Warning message in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y):
“joining factor and character vector, coercing into character vector”
[1] 330600
taxonmean_CM_controlmean_CM_treatmentstdev_CM_controlstdev_CM_treatmentdelta_BDpercIncorppercTaxarepfilemedian_true_BD_shifttrue_incorporatortrue_atom_fraction_excessatom_fraction_excessHWHRSIP_incorp
1Acaryochloris_marina_MBIC110171.72017237501 1.72440350877 0.0333287835795 0.0178885610258 0.00423113375962 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 0.117531493322778 FALSE
2Acetobacter_pasteurianus_IFO_3283-031.71485933284 1.7195831393 0.00488742265951 0.00651624383562 0.00472380645756 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 0.131216846043333 FALSE
3Acetobacterium_woodii_DSM_10301.71076084855 1.7081331098 0.00688053751389 0.00249491325664 -0.00262773875329 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 -0.0729927431469444 FALSE

Truth versus estimate

Calc

Just true incorporators


In [191]:
# q-SIP
df_atomX_j_true = df_atomX_j %>%
    filter(true_incorporator == TRUE) %>%         # just incorporators identified by q-SIP
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
    group_by(percIncorp, percTaxa, true_incorporator) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
           method = 'q-SIP')

df_atomX_j_true %>% head(n=3)


percIncorppercTaxatrue_incorporatormean_delta_excesssd_delta_excessmethod
1100 1 TRUE -36.256516480023115.0515360702884 q-SIP
2100 10 TRUE -36.29564633290914.406663788936 q-SIP
3100 25 TRUE -36.070518500407615.1162519051182 q-SIP

In [192]:
# delta-BD
df_dBD_j_true = df_dBD_j %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
    filter(true_incorporator == TRUE) %>%
    group_by(percIncorp, percTaxa, true_incorporator) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
           method = 'delta BD')

df_dBD_j_true %>% head(n=3)


percIncorppercTaxatrue_incorporatormean_delta_excesssd_delta_excessmethod
1100 1 TRUE -17.052569747662226.5039438882644 delta BD
2100 10 TRUE -16.727341688148820.3378142181526 delta BD
3100 25 TRUE -15.975128433874220.4481996969328 delta BD

Plotting results


In [193]:
# plotting
tmp = rbind(df_atomX_j_true, df_dBD_j_true)

options(repr.plot.width=8, repr.plot.height=2.5)
p_BDshift_true = ggplot(tmp, aes(percIncorp, mean_delta_excess, 
                      color=percTaxa, group=percTaxa,
                      ymin=mean_delta_excess-sd_delta_excess,
                     ymax=mean_delta_excess+sd_delta_excess)) +
    geom_line() +
    geom_linerange(alpha=0.4, size=1) +    
    geom_point() +
    scale_color_discrete('% incorp-\norators') +
    facet_grid(. ~ method) +
    labs(x='13C atom % excess (truth)', 
         y='13C atom % excess\n(truth - estimate)') +
    theme_bw() 

plot(p_BDshift_true)


All incorporators


In [194]:
df_atomX_j_all = df_atomX_j %>%
    filter(atom_CI_low > 0) %>%     
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
    group_by(percIncorp, percTaxa) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
           method = 'q-SIP')

df_atomX_j %>% head(n=3)


taxoncontroltreatmentBD_diffcontrol_GCcontrol_MWtreatment_max_MWtreatment_MWatom_fraction_excessatom_CI_lowatom_CI_highpercIncorppercTaxarepfilemedian_true_BD_shifttrue_incorporatortrue_atom_fraction_excessHWHRSIP_incorp
1Acetobacter_pasteurianus_IFO_3283-031.71227125665 1.71131120304 -0.000960053607779 0.792928132671 308.084292354 317.663400733 307.911552529 -0.0178325868962 -0.0509906755861 0.0152657111868 0 1 1 OTU_abs1e9_PCR_sub_qSIP_atom.txt 0 FALSE 0 FALSE
2Acetobacterium_woodii_DSM_1030 1.70514816218 1.70381333577 -0.00133482640163 0.707627741435 308.04198336 317.66363345 307.800841516 -0.0247839189809 -0.088050542178 0.037076045015 0 1 1 OTU_abs1e9_PCR_sub_qSIP_atom.txt0 FALSE 0 FALSE
3Acetohalobium_arabaticum_DSM_55011.69117559631 1.69437975532 0.00320415900593 0.540303646624 307.958990609 317.664089944 308.542460223 0.059451829155 -0.0739794813718 0.19925343063 0 1 1 OTU_abs1e9_PCR_sub_qSIP_atom.txt 0 FALSE 0 FALSE

In [195]:
# delta-BD
df_dBD_j_all = df_dBD_j %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100) %>%
    group_by(percIncorp, percTaxa) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
           method = 'delta BD')

df_dBD_j %>% head(n=3)


taxonmean_CM_controlmean_CM_treatmentstdev_CM_controlstdev_CM_treatmentdelta_BDpercIncorppercTaxarepfilemedian_true_BD_shifttrue_incorporatortrue_atom_fraction_excessatom_fraction_excessHWHRSIP_incorp
1Acaryochloris_marina_MBIC110171.72017237501 1.72440350877 0.0333287835795 0.0178885610258 0.00423113375962 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 0.117531493322778 FALSE
2Acetobacter_pasteurianus_IFO_3283-031.71485933284 1.7195831393 0.00488742265951 0.00651624383562 0.00472380645756 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 0.131216846043333 FALSE
3Acetobacterium_woodii_DSM_10301.71076084855 1.7081331098 0.00688053751389 0.00249491325664 -0.00262773875329 0 1 1 OTU_abs1e9_PCR_sub_dBD.txt 0 FALSE 0 -0.0729927431469444 FALSE

In [196]:
# plotting
tmp = rbind(df_atomX_j_all, df_dBD_j_all)

options(repr.plot.width=8, repr.plot.height=2.5)
p_BDshift_all = ggplot(tmp, aes(percIncorp, mean_delta_excess, 
                      color=percTaxa, group=percTaxa,
                      ymin=mean_delta_excess-sd_delta_excess,
                     ymax=mean_delta_excess+sd_delta_excess)) +
    geom_line() +
    geom_linerange(alpha=0.4, size=1) +    
    geom_point() +
    scale_color_discrete('% incorp-\norators') +
    facet_grid(. ~ method) +
    labs(x='13C atom % excess (truth)', 
         y='13C atom % excess\n(truth - estimate)') +
    theme_bw() 

plot(p_BDshift_all)


All incorporators: facetted


In [197]:
# q-SIP
tmp_all = df_atomX_j %>%
    filter(atom_CI_low > 0) %>%     
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
           true_incorporator = 'All incorp.') %>%
    group_by(percIncorp, percTaxa, true_incorporator) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
           method = 'q-SIP')

tmp_split = df_atomX_j %>%
    filter(atom_CI_low > 0) %>%     
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
           true_incorporator = ifelse(true_incorporator == TRUE, 'True incorp.', 'False incorp.')) %>%
    group_by(percIncorp, percTaxa, true_incorporator) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric),
           method = 'q-SIP')

df_atomX_j_s = rbind(tmp_all, tmp_split) %>%
    mutate(method = 'q-SIP')

df_atomX_j_s %>% head(n=3)


percIncorppercTaxatrue_incorporatormean_delta_excesssd_delta_excessmethod
10 1 All incorp. 5.011595141220626.1409204535886 q-SIP
20 10 All incorp. 4.687931745650216.28721210275491q-SIP
30 25 All incorp. 4.731457937697255.85088275884438q-SIP

In [198]:
# delta BD
tmp_all = df_dBD_j %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
           true_incorporator = ifelse(true_incorporator == TRUE, 'True incorp.', 'False incorp.')) %>%
    group_by(percIncorp, percTaxa, true_incorporator) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric))

tmp_split = df_dBD_j %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
           true_incorporator = 'All incorp.') %>%
    group_by(percIncorp, percTaxa, true_incorporator) %>%
    summarize(mean_delta_excess = mean(delta_excess),
              sd_delta_excess = sd(delta_excess)) %>%
    ungroup() %>%
    mutate(percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric))


df_dBD_j_s = rbind(tmp_all, tmp_split) %>%
    mutate(method = 'delta BD')
df_dBD_j_s %>% head(n=3)


percIncorppercTaxatrue_incorporatormean_delta_excesssd_delta_excessmethod
10 1 False incorp. -0.24935288352125819.9606030109202 delta BD
20 10 False incorp. 0.21880466743539719.6756894001259 delta BD
30 25 False incorp. 0.18523158307173720.1652708651104 delta BD

Combined plot


In [199]:
levs = c('All incorp.', 'True incorp.', 'False incorp.')
tmp = rbind(df_atomX_j_s, df_dBD_j_s) %>%
    mutate(true_incorporator = factor(true_incorporator, levels=levs))

# plotting
options(repr.plot.width=8, repr.plot.height=3)
p_BDshift_facet = ggplot(tmp, aes(percIncorp, mean_delta_excess, 
                      color=percTaxa, group=percTaxa,
                      ymin=mean_delta_excess-sd_delta_excess,
                     ymax=mean_delta_excess+sd_delta_excess)) +
    geom_line() +
    geom_linerange(alpha=0.4, size=1) +    
    geom_point() +
    facet_grid(method ~ true_incorporator) +
    scale_color_discrete('% incorp-\norators') +
    labs(x='13C atom % excess (truth)', 
         y='13C atom % excess\n(truth - estimate)') +
    theme_bw() 

plot(p_BDshift_facet)


Density plots

Just true incorporators


In [211]:
# q-SIP
tmp_atomX = df_atomX_j %>%
    filter(percTaxa == '10',
           true_incorporator == TRUE,
           atom_fraction_excess > 0) %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
           percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric), 
           atom_perc_excess = atom_fraction_excess * 100,
           true_atom_perc_excess = true_atom_fraction_excess * 100) %>%
    dplyr::select(atom_perc_excess, true_atom_perc_excess, percIncorp, percTaxa) %>%
    mutate(method = 'q-SIP')

tmp_atomX %>% head(n=3)


atom_perc_excesstrue_atom_perc_excesspercIncorppercTaxamethod
173.1840328507100 100 10 q-SIP
271.1606289706100 100 10 q-SIP
366.5302376265100 100 10 q-SIP

In [212]:
# delta BD
tmp_dBD = df_dBD_j %>%
    filter(percTaxa == '10',
           true_incorporator == TRUE) %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
           percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric), 
           atom_perc_excess = atom_fraction_excess * 100,
           true_atom_perc_excess = true_atom_fraction_excess * 100) %>%
    dplyr::select(atom_perc_excess, true_atom_perc_excess, percIncorp, percTaxa) %>%
    mutate(method = 'delta BD')

tmp_dBD %>% head(n=3)


atom_perc_excesstrue_atom_perc_excesspercIncorppercTaxamethod
1113.106943472222100 100 10 delta BD
2126.197355921667100 100 10 delta BD
374.0483719483333100 100 10 delta BD

In [213]:
# for vline
df_j = rbind(tmp_atomX, tmp_dBD)

tmp = df_j %>%
    distinct(percIncorp) %>%
    mutate(true_atom_perc_excess = percIncorp %>% as.character %>% as.numeric)

options(repr.plot.width=10, repr.plot.height=3)
p_dens_true = ggplot(df_j, aes(atom_perc_excess)) +
    geom_density(alpha=0.5, fill='grey70') +
    geom_vline(data=tmp, aes(xintercept=true_atom_perc_excess), linetype='dashed', alpha=0.4) +
    scale_x_continuous(limits=c(-20, 120)) +
    labs(x='atom % excess 13C (estimate)', y='Probability density') +
    facet_grid(method ~ percIncorp) +
    theme_bw() 

plot(p_dens_true)


Warning message:
“Removed 152 rows containing non-finite values (stat_density).”

All incorporators


In [214]:
# delta BD
tmp_atomX = df_atomX_j %>%
    filter(percTaxa == '10',
           atom_CI_low > 0) %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
           percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric), 
           atom_perc_excess = atom_fraction_excess * 100,
           true_atom_perc_excess = true_atom_fraction_excess * 100) %>%
    dplyr::select(atom_perc_excess, true_atom_perc_excess, percIncorp, percTaxa, true_incorporator) %>%
    mutate(method = 'q-SIP')

tmp_atomX %>% head(n=3)


atom_perc_excesstrue_atom_perc_excesspercIncorppercTaxatrue_incorporatormethod
16.823550109960 0 10 FALSE q-SIP
25.767546912090 0 10 FALSE q-SIP
39.68882481680 0 10 FALSE q-SIP

In [215]:
# delta BD
tmp_dBD = df_dBD_j %>%
    filter(percTaxa == '10') %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100,
           percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric), 
           atom_perc_excess = atom_fraction_excess * 100,
           true_atom_perc_excess = true_atom_fraction_excess * 100) %>%
    dplyr::select(atom_perc_excess, true_atom_perc_excess, percIncorp, percTaxa, true_incorporator) %>%
    mutate(method = 'delta BD')

tmp_dBD %>% head(n=3)


atom_perc_excesstrue_atom_perc_excesspercIncorppercTaxatrue_incorporatormethod
18.674823553944450 0 10 FALSE delta BD
25.507196064055560 0 10 FALSE delta BD
312.13335179441670 0 10 FALSE delta BD

In [216]:
# for vline
df_j = rbind(tmp_atomX, tmp_dBD)

tmp = df_j %>%
    distinct(percIncorp) %>%
    mutate(true_atom_perc_excess = percIncorp %>% as.character %>% as.numeric)

options(repr.plot.width=10, repr.plot.height=3)
p_dens_all = ggplot(df_j, aes(atom_perc_excess)) +
    geom_density(alpha=0.5, fill='grey70') +
    geom_vline(data=tmp, aes(xintercept=true_atom_perc_excess), linetype='dashed', alpha=0.4) +
    scale_x_continuous(limits=c(-20, 120)) +
    scale_fill_discrete('True incorp-\norator?') +
    labs(x='atom % excess 13C (estimate)', y='Probability density') +
    facet_grid(method ~ percIncorp) +
    theme_bw() 

plot(p_dens_all)


Warning message:
“Removed 11427 rows containing non-finite values (stat_density).”

All incorporators: faceted


In [217]:
# for vline
df_j = rbind(tmp_atomX, tmp_dBD)

tmp = df_j %>%
    distinct(percIncorp) %>%
    mutate(true_atom_perc_excess = percIncorp %>% as.character %>% as.numeric)

options(repr.plot.width=10, repr.plot.height=3)
p_dens_all_facet = ggplot(df_j, aes(atom_perc_excess)) +
    geom_density(alpha=0.5, aes(fill=true_incorporator)) +
    geom_vline(data=tmp, aes(xintercept=true_atom_perc_excess), linetype='dashed', alpha=0.4) +
    scale_x_continuous(limits=c(-20, 120)) +
    scale_fill_discrete('True incorp-\norator?') +
    labs(x='atom % excess 13C (estimate)', y='Probability density') +
    facet_grid(method ~ percIncorp) +
    theme_bw() 

plot(p_dens_all_facet)


Warning message:
“Removed 11427 rows containing non-finite values (stat_density).”

Truth-v-Est + Density

Just True incorporators


In [218]:
options(repr.plot.width=10, repr.plot.height=6)
p_TE_dens_true = cowplot::ggdraw() +
    cowplot::draw_plot(p_BDshift_true, 0, 0.6, 0.95, 0.4) +
    cowplot::draw_plot(p_dens_true, 0, 0, 0.9, 0.6) +
    cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0), c(1, 0.6), size=12)

plot(p_TE_dens_true)


Warning message:
“Removed 152 rows containing non-finite values (stat_density).”

All incorporators


In [219]:
options(repr.plot.width=10, repr.plot.height=6)
p_TE_dens_all = cowplot::ggdraw() +
    cowplot::draw_plot(p_BDshift_all, 0, 0.6, 0.95, 0.4) +
    cowplot::draw_plot(p_dens_all, 0, 0, 0.9, 0.6) +
    cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0), c(1, 0.6), size=12)

plot(p_TE_dens_all)


Warning message:
“Removed 11427 rows containing non-finite values (stat_density).”

All incorporators: faceted


In [220]:
options(repr.plot.width=10, repr.plot.height=7)
p_TE_dens_all_facet = cowplot::ggdraw() +
    cowplot::draw_plot(p_BDshift_facet, 0, 0.5, 0.95, 0.5) +
    cowplot::draw_plot(p_dens_all_facet, 0, 0, 1, 0.50) +
    cowplot::draw_plot_label(c('A)', 'B)'), c(0, 0), c(1, 0.5), size=12)

plot(p_TE_dens_all_facet)


Warning message:
“Removed 11427 rows containing non-finite values (stat_density).”

In [ ]:


In [ ]:


In [ ]:

END

MW-HR-SIP filter


In [175]:
df.jj = rbind(df.j.dis.qSIP %>% mutate(method='qSIP'),
              df.j.dis.dBD %>% mutate(method='Delta BD')) %>%
    mutate(method = gsub('qSIP', 'q-SIP', method))

options(repr.plot.width=8, repr.plot.height=4)
p.comb = ggplot(df.jj, aes(percIncorp, mean_delta_excess, 
                      color=percTaxa, group=percTaxa,
                      ymin=mean_delta_excess-sd_delta_excess,
                     ymax=mean_delta_excess+sd_delta_excess)) +
    geom_line() +
    geom_linerange(alpha=0.5, size=1) +
    geom_point() +
    scale_color_discrete('% incorp-\norators') +
    labs(x='atom % excess 13C (truth)', 
         y='atom % excess 13C\n(truth - estimate)') +
    facet_grid(incorp_called ~ method) +
    theme_bw() 
p.comb


Warning message:
“Removed 1 rows containing missing values (geom_linerange).”

In [172]:
outF = file.path(workDir, 'atomIncorp_taxaIncorp_BDshift-MWHRSIP.pdf')
ggsave(outF, p.comb, width=8, height=4)
cat('File written:', outF, '\n')


Warning message:
“Removed 1 rows containing missing values (geom_linerange).”
File written: /ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/data/bac_genome1147/atomIncorp_taxaIncorp//atomIncorp_taxaIncorp_BDshift-MWHRSIP.pdf 

In [173]:
outF = file.path(workDir, 'atomIncorp_taxaIncorp_BDshift-MWHRSIP.jpeg')
ggsave(outF, p.comb, width=8, height=4)
cat('File written:', outF, '\n')


Warning message:
“Removed 1 rows containing missing values (geom_linerange).”
File written: /ebio/abt3_projects/methanogen_host_evo/SIPSim_pt2/data/bac_genome1147/atomIncorp_taxaIncorp//atomIncorp_taxaIncorp_BDshift-MWHRSIP.jpeg 

Why does q-SIP under-estimate true incorp?

  • for true incorporators, q-SIP seems to show ~30% under-estimation in atom % excess 13C
    • is this due to low abundance taxa 'heavy' tail having high levels of noise?

True BD shift


In [ ]:
BDshift_files = list.files(path=workDir, pattern='BD-shift_stats.txt', full.names=TRUE, recursive=TRUE)
BDshift_files %>% length %>% print

In [ ]:
df_shift = list()
for(F in BDshift_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    FFl = length(FF)
    tmp$percIncorp = FF[FFl-3]
    tmp$percTaxa = FF[FFl-2]
    tmp$rep = FF[FFl-1]
    tmp$file = FF[FFl]
    df_shift[[F]] = tmp
}

df_shift = do.call(rbind, df_shift)
rownames(df_shift) = 1:nrow(df_shift)

df_shift = df_shift %>%
    filter(library %in% c(2,4,6)) %>%
    group_by(taxon, percIncorp, percTaxa, rep) %>%
    summarize(median = median(median)) %>%
    ungroup() %>%
    rename('median_true_BD_shift' = median) 

# status
df_shift %>% nrow %>% print
df_shift %>% head(n=3)

q-SIP


In [ ]:
atomX_files = list.files(path=workDir, pattern='*_qSIP_atom.txt', full.names=TRUE, recursive=TRUE)
atomX_files %>% length %>% print

In [ ]:
df_atomX = list()
for(F in atomX_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    FFl = length(FF)
    tmp$percIncorp = FF[FFl-3]
    tmp$percTaxa = FF[FFl-2]
    tmp$rep = FF[FFl-1]
    tmp$file = FF[FFl]
    df_atomX[[F]] = tmp
}

df_atomX = do.call(rbind, df_atomX)
rownames(df_atomX) = 1:nrow(df_atomX)
df_atomX %>% head(n=3) %>% print

Original taxon abundances


In [ ]:
comm_files = list.files(path=workDir, pattern='comm.txt', full.names=TRUE, recursive=TRUE)
comm_files %>% length %>% print

In [ ]:
df_comm = list()
for(F in comm_files){
    tmp = read.delim(F, sep='\t') 
    FF = strsplit(F, '/') %>% unlist
    FFl = length(FF)
    tmp$percIncorp = FF[FFl-3]
    tmp$percTaxa = FF[FFl-2]
    tmp$rep = FF[FFl-1]
    tmp$file = FF[FFl]
    df_comm[[F]] = tmp
}

df_comm = do.call(rbind, df_comm)
rownames(df_comm) = 1:nrow(df_comm)


# mean abund for libraries
df_comm = df_comm %>%
    rename('taxon' = taxon_name) %>%
    mutate(rel_abund_perc = ifelse(is.na(rel_abund_perc), 0, rel_abund_perc)) %>%
    group_by(taxon, percIncorp, percTaxa, rep) %>%
    summarize(any_zero = any(rel_abund_perc == 0),
              mean_rel_abund_perc = mean(rel_abund_perc),
              mean_rank_abund = mean(rank)) %>%
    ungroup()

# status
df_comm %>% nrow %>% print
df_comm %>% head(n=3)

In [ ]:
df_comm$any_zero %>% table

Joining estimate with true values


In [ ]:
# table join
join_vars = c('taxon' = 'taxon',
              'percIncorp'='percIncorp',
              'percTaxa'='percTaxa',
              'rep'='rep')

df_atomX %>% nrow %>% print

# joining q-SIP + true-BD
df.j = left_join(df_atomX, df_shift, join_vars) %>%
    filter(atom_CI_low > 0) %>%
    filter(!is.na(BD_diff)) %>%
    mutate(true_incorporator = ifelse(median_true_BD_shift > 0.002, TRUE, FALSE),
          true_atom_fraction_excess = median_true_BD_shift / 0.036,
          atom_fraction_excess = ifelse(is.na(atom_CI_low), 0, atom_fraction_excess)) %>%
    #filter(true_incorporator == TRUE) %>%
    mutate(delta_excess = atom_fraction_excess * 100 - true_atom_fraction_excess * 100)
df.j %>% nrow %>% print

# joining with abundances
df.j = df.j %>%
    inner_join(df_comm, join_vars) %>%
    mutate(percIncorp = percIncorp %>% as.character,
           percTaxa = percTaxa %>% as.character,
           percIncorp = percIncorp %>% reorder(percIncorp %>% as.numeric),
           percTaxa = percTaxa %>% reorder(percTaxa %>% as.numeric))

# status
df.j %>% nrow %>% print
df.j %>% head(n=3) %>% print

Plotting BD taxa_abund ~ BD_shift_accuracy


In [ ]:
options(repr.plot.width=8, repr.plot.height=6)
p.qSIP_abund = ggplot(df.j, aes(mean_rel_abund_perc, delta_excess, 
                                color=true_incorporator, group=true_incorporator)) +
    #geom_point(shape='O', alpha=0.15, size=0.5) +
    geom_smooth(size=0.3) +
    scale_x_log10() +
    scale_color_discrete('True incorp-\norator') +
    labs(x='% abundance', 
         y='atom % excess 13C\n(truth - estimate)') +
    facet_grid(percIncorp ~ percTaxa) +
    theme_bw() 

p.qSIP_abund

In [ ]:
outF = file.path(workDir, 'atomIncorp_taxaIncorp_qSIP-abund.pdf')
ggsave(outF, p.qSIP_abund, width=9, height=6)
cat('File written:', outF, '\n')

In [ ]:
outF = file.path(workDir, 'atomIncorp_taxaIncorp_qSIP-abund.jpeg')
ggsave(outF, p.qSIP_abund, width=9, height=6)
cat('File written:', outF, '\n')

In [ ]: