Imports


In [21]:
# xcms
require(xcms)

# download from MASSive
require(RCurl)
MY_URL <- "ftp://massive.ucsd.edu/peak/"
MY_USERNAME_PWD <- "MSV000084403:a"
MY_MZML_DIR <- '~/my_mzml_directory/'


# tidyverse
require(tidyverse)
require(readxl)
require(janitor)
require(stringr)
require(broom)

# better graphs
require(ggfortify)
require(viridis)
require(cowplot)

File collection and annotation


In [25]:
url <- "ftp://massive.ucsd.edu/peak/"
userpwd <- "MSV000084403:a"
destination <- '~/my_mzml_directory/'

filenames = getURL(MY_URL, 
                   ftp.use.epsv = FALSE, 
                   dirlistonly = TRUE,
                   userpwd = MY_USERNAME_PWD) %>%
            str_split('\n') %>%
            unlist()

filenames


for (fname in filenames){
    cmd <- paste0("curl -u ",
                  MY_USERNAME_PWD, 
                  ' ',
                  MY_URL, fname,
                  " -o ",
                  MY_MZML_DIR, fname)
    print(cmd)
    system(cmd)
}


  1. 'samp01_rep01.mzML'
  2. 'samp01_rep02.mzML'
  3. 'samp02_rep01.mzML'
  4. 'samp02_rep02.mzML'
  5. 'samp03_rep01.mzML'
  6. 'samp03_rep02.mzML'
  7. 'samp04_rep01.mzML'
  8. 'samp04_rep02.mzML'
  9. 'samp05_rep01.mzML'
  10. 'samp05_rep02.mzML'
  11. 'samp06_rep01.mzML'
  12. 'samp06_rep02.mzML'
  13. 'samp07_rep01.mzML'
  14. 'samp07_rep02.mzML'
  15. 'samp08_rep01.mzML'
  16. 'samp08_rep02.mzML'
  17. 'samp09_rep01.mzML'
  18. 'samp09_rep02.mzML'
  19. 'samp10_rep01.mzML'
  20. 'samp10_rep02.mzML'
  21. ''
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp01_rep01.mzML -o ~/my_mzml_directory/samp01_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp01_rep02.mzML -o ~/my_mzml_directory/samp01_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp02_rep01.mzML -o ~/my_mzml_directory/samp02_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp02_rep02.mzML -o ~/my_mzml_directory/samp02_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp03_rep01.mzML -o ~/my_mzml_directory/samp03_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp03_rep02.mzML -o ~/my_mzml_directory/samp03_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp04_rep01.mzML -o ~/my_mzml_directory/samp04_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp04_rep02.mzML -o ~/my_mzml_directory/samp04_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp05_rep01.mzML -o ~/my_mzml_directory/samp05_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp05_rep02.mzML -o ~/my_mzml_directory/samp05_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp06_rep01.mzML -o ~/my_mzml_directory/samp06_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp06_rep02.mzML -o ~/my_mzml_directory/samp06_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp07_rep01.mzML -o ~/my_mzml_directory/samp07_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp07_rep02.mzML -o ~/my_mzml_directory/samp07_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp08_rep01.mzML -o ~/my_mzml_directory/samp08_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp08_rep02.mzML -o ~/my_mzml_directory/samp08_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp09_rep01.mzML -o ~/my_mzml_directory/samp09_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp09_rep02.mzML -o ~/my_mzml_directory/samp09_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp10_rep01.mzML -o ~/my_mzml_directory/samp10_rep01.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/samp10_rep02.mzML -o ~/my_mzml_directory/samp10_rep02.mzML"
[1] "curl -u MSV000084403:a ftp://massive.ucsd.edu/peak/ -o ~/my_mzml_directory/"

In [26]:
## get full paths and wells for files we have
my_files <- list.files(path = MY_MZML_DIR,
                   pattern = '*.mzML',
                   full.names = T)


## annotate these files
phenodat <- tibble(fname = my_files) %>%
    mutate(sample = str_extract(fname, '(?<=samp)[0-9]{1,2}') %>% as.numeric,
           replicate = str_extract(fname, '(?<=rep)[0-9]{2}')) %>%
    mutate(group = ifelse(sample <= 5, 'CH3OH', 'CD3OH')) 

phenodat


A tibble: 20 × 4
fnamesamplereplicategroup
<chr><dbl><chr><chr>
/Users/curt/my_mzml_directory//samp01_rep01.mzML 101CH3OH
/Users/curt/my_mzml_directory//samp01_rep02.mzML 102CH3OH
/Users/curt/my_mzml_directory//samp02_rep01.mzML 201CH3OH
/Users/curt/my_mzml_directory//samp02_rep02.mzML 202CH3OH
/Users/curt/my_mzml_directory//samp03_rep01.mzML 301CH3OH
/Users/curt/my_mzml_directory//samp03_rep02.mzML 302CH3OH
/Users/curt/my_mzml_directory//samp04_rep01.mzML 401CH3OH
/Users/curt/my_mzml_directory//samp04_rep02.mzML 402CH3OH
/Users/curt/my_mzml_directory//samp05_rep01.mzML 501CH3OH
/Users/curt/my_mzml_directory//samp05_rep02.mzML 502CH3OH
/Users/curt/my_mzml_directory//samp06_rep01.mzML 601CD3OH
/Users/curt/my_mzml_directory//samp06_rep02.mzML 602CD3OH
/Users/curt/my_mzml_directory//samp07_rep01.mzML 701CD3OH
/Users/curt/my_mzml_directory//samp07_rep02.mzML 702CD3OH
/Users/curt/my_mzml_directory//samp08_rep01.mzML 801CD3OH
/Users/curt/my_mzml_directory//samp08_rep02.mzML 802CD3OH
/Users/curt/my_mzml_directory//samp09_rep01.mzML 901CD3OH
/Users/curt/my_mzml_directory//samp09_rep02.mzML 902CD3OH
/Users/curt/my_mzml_directory//samp10_rep01.mzML1001CD3OH
/Users/curt/my_mzml_directory//samp10_rep02.mzML1002CD3OH

XCMS


In [27]:
my_fnames <- phenodat %>% pull(fname)

## link code to files
raw_data <- readMSData(files = my_fnames,
           pdata = new("NAnnotatedDataFrame", phenodat),
           mode = "onDisk") 

## feature detection
cwp <- CentWaveParam(peakwidth = c(7.5, 20),
                     noise = 1200,
                     ppm=10)
xdata <- findChromPeaks(raw_data, param = cwp)

## retention time correction
xdata <- adjustRtime(xdata, param = ObiwarpParam(binSize = 0.3))

options(repr.plot.width=4, repr.plot.height=3)
plotAdjustedRtime(xdata)

## Correspondence: group peaks across samples.
pdp <- PeakDensityParam(sampleGroups = xdata@phenoData@data$group,
                        minFraction = 0.6,
                        bw = 0.5,
                        binSize = 0.1)

xdata <- groupChromPeaks(xdata, param = pdp)

## fill missing data
xdata <- fillChromPeaks(xdata)

## save
save(xdata, file='saved_data/xcms_r_object_pos.rdat')


R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
Sample number 10 used as center sample.
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
Applying retention time adjustment to the identified chromatographic peaks ... OK
Processing 18585 mz slices ... OK
Defining peak areas for filling-in .... OK
Start integrating peak areas from original files
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call

In [28]:
load('saved_data/xcms_r_object_pos.rdat')

xdata


MSn experiment data ("XCMSnExp")
Object size in memory: 6.17 Mb
- - - Spectra data - - -
 MS level(s): 1 
 Number of spectra: 19074 
 MSn retention times: 0:3 - 7:60 minutes
- - - Processing information - - -
Data loaded [Tue Oct  1 17:15:11 2019] 
 MSnbase version: 2.10.1 
- - - Meta data  - - -
phenoData
  rowNames: 1 2 ... 3 (20 total)
  varLabels: fname sample replicate group
  varMetadata: labelDescription
Loaded from:
  [1] samp01_rep01.mzML...  [20] samp10_rep02.mzML
  Use 'fileNames(.)' to see all files.
protocolData: none
featureData
  featureNames: F01.S001 F01.S002 ... F20.S954 (19074 total)
  fvarLabels: fileIdx spIdx ... spectrum (33 total)
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
- - - xcms preprocessing - - -
Chromatographic peak detection:
 method: centWave 
 189388 peaks identified in 20 samples.
 On average 9469 chromatographic peaks per sample.
Alignment/retention time adjustment:
 method: obiwarp 
Correspondence:
 method: chromatographic peak density 
 7010 features identified.
 Median mz range of features: 0.0022708
 Median rt range of features: 1.5007
 31582 filled peaks (on average 1579.1 per sample).

PCA on asinh-transformed data


In [29]:
pseudoLog10 <- function(x) { asinh(x/2)/log(10) }

In [30]:
# get feature intensity matrix

ft_ints <- 
    xdata %>% 
    featureValues(value="into") %>% 
    replace_na(0) %>% 
    pseudoLog10 %>%
    as.data.frame %>%
    rownames_to_column() %>%
    gather(sample, value, -rowname) %>%
    group_by(rowname) %>%
    filter(max(value) > 3.5) %>%
    spread(sample, value) %>%
    column_to_rownames('rowname') %>%
    as.matrix

In [68]:
my_pca <- 
    ft_ints %>% 
    t %>%
    as_tibble %>%
    nest() %>% 
    mutate(pca = map(data, 
                     ~ prcomp(.x, center = TRUE, scale = FALSE)
                     ),
         pca_aug = map2(pca, data, ~augment(.x, data = .y)))


Warning message:
“`...` must not be empty for ungrouped data frames.
Did you want `data = everything()`?”

In [81]:
var_exp <- my_pca %>%
    unnest(pca_aug) %>%
    summarize_at(.vars = vars(contains("PC", ignore.case=FALSE)), .funs = list(var)) %>%
    gather(key = pc, value = variance) %>%
    mutate(var_exp = variance/sum(variance),
           cum_var_exp = cumsum(var_exp),
           pc = str_replace(pc, ".fitted", "")
           )

var_exp


A tibble: 20 × 4
pcvariancevar_expcum_var_exp
<chr><dbl><dbl><dbl>
PC1 6.865384e+022.902629e-010.2902629
PC2 5.817680e+022.459668e-010.5362298
PC3 1.954117e+028.261850e-020.6188483
PC4 1.369366e+025.789568e-020.6767439
PC5 1.207657e+025.105876e-020.7278027
PC6 1.134499e+024.796571e-020.7757684
PC7 7.509254e+013.174852e-020.8075169
PC8 6.870928e+012.904973e-020.8365667
PC9 5.522817e+012.335003e-020.8599167
PC105.308640e+012.244450e-020.8823612
PC115.055055e+012.137237e-020.9037336
PC123.955917e+011.672530e-020.9204589
PC133.784201e+011.599930e-020.9364582
PC143.199304e+011.352640e-020.9499846
PC152.797590e+011.182798e-020.9618125
PC162.714885e+011.147831e-020.9732909
PC172.287683e+019.672139e-030.9829630
PC182.125362e+018.985858e-030.9919489
PC191.904281e+018.051148e-031.0000000
PC201.836739e-287.765583e-321.0000000

In [82]:
options(repr.plot.width=6, repr.plot.height=4)

my_pca %>% 
    pull(pca) %>% 
    '[['(1) %>% 
    '[['('x') %>%   
    as_tibble %>%
    mutate(sample_class = xdata@phenoData@data$group,
           sample_name = xdata@phenoData@data$sample) %>%
    group_by(sample_class, sample_name) %>%
    select(`PC1`, `PC2`, `PC3`) %>%
    ggplot(aes(x=PC1,
               y=PC2, 
               color = sample_class,
               label = sample_name)
          ) +
    scale_x_continuous(name = 'PC1\n(29% of variance)') +
    scale_y_continuous(name = 'PC2\n(25% of variance)') +
    geom_point(size=5) +
    geom_text(color='black') +
    ggtitle('positive mode')


Adding missing grouping variables: `sample_class`, `sample_name`

Model each feature's intensity as a function of group ID

(I.e., do t-tests)


In [83]:
my_names <- ft_ints %>% row.names
modeled <- ft_ints %>% 
    as_tibble() %>%
    mutate(name = my_names) %>%
    gather(fname, value, -name) %>%
    left_join(phenodat %>% mutate(fname = basename(fname)), by='fname') %>%
    select(name, value, group, sample) %>%
    mutate(sample = factor(sample)) %>%
    group_by(name) %>%
    do(tidy(lm(formula = value ~ 1 + I(group),
                          data=.)
             )
      ) 

group_effects <-
    modeled %>%
    filter(!str_detect(term, 'Intercept'))

intercepts <-
    modeled %>%
    filter(str_detect(term, 'Intercept'))

all_params_wide <-
    group_effects %>% 
    left_join(intercepts %>% mutate(intercept = estimate) %>% select(name, intercept),
              by='name') %>%
    left_join(featureDefinitions(xdata) %>% as.data.frame %>% rownames_to_column(),
              by = c('name' = 'rowname')) %>%
    mutate(fold_change = estimate) %>% select(-estimate) %>%
    select(name, fold_change, std.error, p.value, intercept, mzmed, rtmed, npeaks, CH3OH, CD3OH)

all_params_wide %>% arrange(p.value) %>% head


A grouped_df: 6 × 10
namefold_changestd.errorp.valueinterceptmzmedrtmednpeaksCH3OHCD3OH
<chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
FT0889-5.2959520.021840634.368873e-33 5.295952e+00217.1709282.7166610010
FT0723-4.4526800.033025081.682893e-28 4.452680e+00202.1514277.7152910010
FT2709-3.8520520.033313082.662814e-27 3.852052e+00388.2926307.72101 80 8
FT2389-4.7601260.041781953.477338e-27 4.760126e+00362.1928 63.18211 90 9
FT6256 4.5267890.044285222.442135e-26-7.944109e-16822.4898336.22572 66 0
FT2200-4.3895160.047102071.286057e-25 4.389516e+00347.3003313.2224910010

In [84]:
P.VALUE.CUTOFF <- 1e-5
LOGP.CUTOFF <- -log10(P.VALUE.CUTOFF)
FOLD_CHANGE.CUTOFF <- 1.5

In [85]:
options(repr.plot.width=6, repr.plot.height=6)
p1 <- 
    all_params_wide %>%
    filter(rtmed > 100) %>%
    mutate(lbl = ifelse(-log10(p.value) > LOGP.CUTOFF & abs(fold_change) > FOLD_CHANGE.CUTOFF,
                        mzmed %>% round(4) %>% as.character, NA)) %>%
    ggplot(aes(x=intercept + fold_change, y=intercept, 
               shape = -log10(p.value) > LOGP.CUTOFF, 
               alpha = -log10(p.value),
               color = -log10(p.value) > LOGP.CUTOFF & abs(fold_change) > FOLD_CHANGE.CUTOFF,
               label = lbl
               )
          ) +
        geom_point() +
        geom_text(hjust=-0.1, size=3.5, check_overlap = T) +
        geom_abline(slope=1) +
        scale_color_manual(values = c('black', 'red')) +
        theme(legend.position = 'none') +
        scale_x_continuous(name = 'log10(intensity),\nCH3OH-extracted sample') +
        scale_y_continuous(name = 'log10(intensity),\nCD3OH-extracted sample')

p1

ggsave('graphs/pos_mode_scatter.pdf', height=6, width=6)
ggsave('graphs/pos_mode_scatter.png', height=6, width=6)


Warning message:
“Removed 5854 rows containing missing values (geom_text).”Warning message:
“Removed 5854 rows containing missing values (geom_text).”Warning message:
“Removed 5854 rows containing missing values (geom_text).”

Find pairs of significantly altered features that:

have a mass difference of $\require{mhchem}\ce{D3} - \ce{H3}$

have the same retention time


In [86]:
3.01883 * (1-350/1e6)
3.01883 * (1+350/1e6)


3.0177734095
3.0198865905

In [87]:
three_d_neutrons <- 3.01883
MASS.CUTOFF.PPM <- 100
RT.CUTOFF <- 5
MASS.CUTOFF.HIGH <- three_d_neutrons * (1 + MASS.CUTOFF.PPM/1e6)
MASS.CUTOFF.LOW <- three_d_neutrons * (1 - MASS.CUTOFF.PPM/1e6)

In [88]:
hits <- all_params_wide %>% 
    filter(p.value < P.VALUE.CUTOFF, abs(fold_change) > FOLD_CHANGE.CUTOFF) %>%
    mutate(foo=1) %>%
    left_join(., ., by=c('foo')) %>%
    filter(abs(rtmed.x - rtmed.y) < RT.CUTOFF,   
            mzmed.y > mzmed.x,   # avoid double-counting
           (mzmed.y - mzmed.x) > MASS.CUTOFF.LOW, 
           (mzmed.y - mzmed.x) < MASS.CUTOFF.HIGH ) %>%
    ungroup %>% 
    mutate(row = row_number()) %>%
    select(row, name.x, name.y) %>%
    gather(set, feature, -row) %>%
    mutate(set = ifelse(set == 'name.x', 'CH3OH', 'CD3OH') %>% as.factor) %>%
    mutate(set = fct_relevel(set, 'CH3OH', after=0)) %>%
    spread(set, feature)
    
hit_plot <-
    hits %>% 
    left_join(all_params_wide %>% 
                  mutate(x.h = intercept + fold_change) %>%
                  select(x.h, y.h = intercept, p.value, mzmed, rtmed), 
              by=c('CH3OH'='name')) %>%
    left_join(all_params_wide %>%
              mutate(x.d = intercept + fold_change) %>%
                  select(x.d, y.d = intercept, p.value, mzmed, rtmed), 
              by=c('CD3OH'='name')) %>%
    select(row, x.h, y.h, x.d, y.d) %>%
    gather(var, x, x.h, x.d, -row) %>%
    gather(var.y, y, y.h, y.d, -row, -var) %>%
    group_by(row) %>%
    filter(var == 'x.h' & var.y == 'y.h' | var == 'x.d' & var.y == 'y.d') %>%
    mutate(lbl = '', p.value = 0.5)


    
p1 + 
    geom_line(data=hit_plot, aes(x=x, y=y, group=row), color='red', alpha=0.5)

ggsave('graphs/pos_mode_scatter_paired.pdf', height=6, width=6)
ggsave('graphs/pos_mode_scatter_paired.png', height=6, width=6)


Adding missing grouping variables: `name`
Adding missing grouping variables: `name`
Warning message:
“Removed 5854 rows containing missing values (geom_text).”Warning message:
“Removed 5854 rows containing missing values (geom_text).”Warning message:
“Removed 5854 rows containing missing values (geom_text).”

In [103]:
hits %>% 
    left_join(all_params_wide %>% 
                  mutate(x.h = intercept + fold_change) %>%
                  select(x.h, y.h = intercept, p.value, mzmed, rtmed), 
              by=c('CH3OH'='name')) %>%
    left_join(all_params_wide %>%
              mutate(x.d = intercept + fold_change) %>%
                  select(x.d, y.d = intercept, p.value, mzmed, rtmed), 
              by=c('CD3OH'='name')) %>%
    select(CH3OH, CD3OH, CH3OH.intensity=x.h, CD3OH.intensity=y.d, 
           p.value.CH3OH = p.value.x,
           p.value.CD3OH = p.value.y,
           mzmed.CH3OH = mzmed.x,
           mzmed.CD3OH = mzmed.y,
           rtmed.CH3OH = rtmed.x,
           rtmed.CD3OH = rtmed.y
           ) %>%
    arrange(rtmed.CH3OH) ->
    hit_table

options(repr.plot.width=8, repr.plot.height=4)

hit_table %>%
    ggplot(aes(x=rtmed.CH3OH/60, y=mzmed.CH3OH, ymin=0, ymax = mzmed.CH3OH,
               size=CH3OH.intensity, label = mzmed.CH3OH %>% round(4))) +
        geom_point(color='forestgreen', alpha=0.3) +
        geom_errorbar(width=0, size=0.5) +
        geom_text(size = 4, hjust=-0.1, check_overlap = T) +
        geom_point(aes(x=rtmed.CD3OH/60, y=-mzmed.CD3OH, size=CD3OH.intensity), color='firebrick', alpha=0.5) +
        geom_errorbar(aes(x=rtmed.CD3OH/60,  ymin=0, ymax=-mzmed.CD3OH, y=-mzmed.CD3OH), width=0, size=0.5) +
        geom_text(size = 4, hjust=-0.1, check_overlap = T, aes(label = mzmed.CD3OH %>% round(4), y=-mzmed.CD3OH)) +
        theme_cowplot() +
        theme(legend.position = 'bottom') +
        geom_abline(intercept = 0, slope=0) +
        xlab('retention time, min') +
        ylab('m/z, Da\n(negative: CD3OH; positive:CH3OH)')


Adding missing grouping variables: `name`
Adding missing grouping variables: `name`
Warning message:
“Ignoring unknown aesthetics: y”

In [104]:
hit_table %>%
    mutate(avg_rt = ((rtmed.CH3OH + rtmed.CD3OH) / 2 )) %>%
    mutate(approx.rt = avg_rt %>% round(0)) %>%
    group_by(approx.rt)


A grouped_df: 8 × 12
CH3OHCD3OHCH3OH.intensityCD3OH.intensityp.value.CH3OHp.value.CD3OHmzmed.CH3OHmzmed.CD3OHrtmed.CH3OHrtmed.CD3OHavg_rtapprox.rt
<chr><chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
FT2122FT21714.8779624.7105421.794188e-066.017511e-13342.1398345.1585 62.93043 62.68344 62.80693 63
FT1347FT13785.1103985.1519234.575414e-125.750295e-07267.1210270.1398228.20755227.70800227.95778228
FT2742FT27774.2112153.9801762.548459e-104.507042e-06391.1206394.1396230.20862230.20878230.20870230
FT1289FT13224.2771264.3365081.699914e-079.216228e-10261.0737264.0924236.20906235.70885235.95895236
FT5269FT52994.4851244.1535726.903719e-081.278762e-06661.2524664.2715245.71101246.21031245.96066246
FT0844FT08796.3293636.2464991.095318e-192.303823e-12213.1487216.1675283.21681282.71701282.96691283
FT5164FT51944.3522934.1362832.976586e-106.040734e-06644.2427647.2615303.22017303.22095303.22056303
FT6235FT62644.5629344.6109851.284181e-112.050999e-11820.4756823.4942327.72446327.72479327.72462328

Manual annotation of the hits is continued in the accompanying CH3OH_vs_CD3OH_xcms_hit_annotation.xlsx file.


In [ ]: