In [21]:
# xcms
require(xcms)
# download from MASSive
require(RCurl)
MY_URL <- "ftp://massive.ucsd.edu/peak/"
MY_USERNAME_PWD <- "MSV000084403:a"
MY_MZML_DIR <- '~/my_mzml_directory/'
# tidyverse
require(tidyverse)
require(readxl)
require(janitor)
require(stringr)
require(broom)
# better graphs
require(ggfortify)
require(viridis)
require(cowplot)
In [25]:
url <- "ftp://massive.ucsd.edu/peak/"
userpwd <- "MSV000084403:a"
destination <- '~/my_mzml_directory/'
filenames = getURL(MY_URL,
ftp.use.epsv = FALSE,
dirlistonly = TRUE,
userpwd = MY_USERNAME_PWD) %>%
str_split('\n') %>%
unlist()
filenames
for (fname in filenames){
cmd <- paste0("curl -u ",
MY_USERNAME_PWD,
' ',
MY_URL, fname,
" -o ",
MY_MZML_DIR, fname)
print(cmd)
system(cmd)
}
In [26]:
## get full paths and wells for files we have
my_files <- list.files(path = MY_MZML_DIR,
pattern = '*.mzML',
full.names = T)
## annotate these files
phenodat <- tibble(fname = my_files) %>%
mutate(sample = str_extract(fname, '(?<=samp)[0-9]{1,2}') %>% as.numeric,
replicate = str_extract(fname, '(?<=rep)[0-9]{2}')) %>%
mutate(group = ifelse(sample <= 5, 'CH3OH', 'CD3OH'))
phenodat
In [27]:
my_fnames <- phenodat %>% pull(fname)
## link code to files
raw_data <- readMSData(files = my_fnames,
pdata = new("NAnnotatedDataFrame", phenodat),
mode = "onDisk")
## feature detection
cwp <- CentWaveParam(peakwidth = c(7.5, 20),
noise = 1200,
ppm=10)
xdata <- findChromPeaks(raw_data, param = cwp)
## retention time correction
xdata <- adjustRtime(xdata, param = ObiwarpParam(binSize = 0.3))
options(repr.plot.width=4, repr.plot.height=3)
plotAdjustedRtime(xdata)
## Correspondence: group peaks across samples.
pdp <- PeakDensityParam(sampleGroups = xdata@phenoData@data$group,
minFraction = 0.6,
bw = 0.5,
binSize = 0.1)
xdata <- groupChromPeaks(xdata, param = pdp)
## fill missing data
xdata <- fillChromPeaks(xdata)
## save
save(xdata, file='saved_data/xcms_r_object_pos.rdat')
In [28]:
load('saved_data/xcms_r_object_pos.rdat')
xdata
In [29]:
pseudoLog10 <- function(x) { asinh(x/2)/log(10) }
In [30]:
# get feature intensity matrix
ft_ints <-
xdata %>%
featureValues(value="into") %>%
replace_na(0) %>%
pseudoLog10 %>%
as.data.frame %>%
rownames_to_column() %>%
gather(sample, value, -rowname) %>%
group_by(rowname) %>%
filter(max(value) > 3.5) %>%
spread(sample, value) %>%
column_to_rownames('rowname') %>%
as.matrix
In [68]:
my_pca <-
ft_ints %>%
t %>%
as_tibble %>%
nest() %>%
mutate(pca = map(data,
~ prcomp(.x, center = TRUE, scale = FALSE)
),
pca_aug = map2(pca, data, ~augment(.x, data = .y)))
In [81]:
var_exp <- my_pca %>%
unnest(pca_aug) %>%
summarize_at(.vars = vars(contains("PC", ignore.case=FALSE)), .funs = list(var)) %>%
gather(key = pc, value = variance) %>%
mutate(var_exp = variance/sum(variance),
cum_var_exp = cumsum(var_exp),
pc = str_replace(pc, ".fitted", "")
)
var_exp
In [82]:
options(repr.plot.width=6, repr.plot.height=4)
my_pca %>%
pull(pca) %>%
'[['(1) %>%
'[['('x') %>%
as_tibble %>%
mutate(sample_class = xdata@phenoData@data$group,
sample_name = xdata@phenoData@data$sample) %>%
group_by(sample_class, sample_name) %>%
select(`PC1`, `PC2`, `PC3`) %>%
ggplot(aes(x=PC1,
y=PC2,
color = sample_class,
label = sample_name)
) +
scale_x_continuous(name = 'PC1\n(29% of variance)') +
scale_y_continuous(name = 'PC2\n(25% of variance)') +
geom_point(size=5) +
geom_text(color='black') +
ggtitle('positive mode')
In [83]:
my_names <- ft_ints %>% row.names
modeled <- ft_ints %>%
as_tibble() %>%
mutate(name = my_names) %>%
gather(fname, value, -name) %>%
left_join(phenodat %>% mutate(fname = basename(fname)), by='fname') %>%
select(name, value, group, sample) %>%
mutate(sample = factor(sample)) %>%
group_by(name) %>%
do(tidy(lm(formula = value ~ 1 + I(group),
data=.)
)
)
group_effects <-
modeled %>%
filter(!str_detect(term, 'Intercept'))
intercepts <-
modeled %>%
filter(str_detect(term, 'Intercept'))
all_params_wide <-
group_effects %>%
left_join(intercepts %>% mutate(intercept = estimate) %>% select(name, intercept),
by='name') %>%
left_join(featureDefinitions(xdata) %>% as.data.frame %>% rownames_to_column(),
by = c('name' = 'rowname')) %>%
mutate(fold_change = estimate) %>% select(-estimate) %>%
select(name, fold_change, std.error, p.value, intercept, mzmed, rtmed, npeaks, CH3OH, CD3OH)
all_params_wide %>% arrange(p.value) %>% head
In [84]:
P.VALUE.CUTOFF <- 1e-5
LOGP.CUTOFF <- -log10(P.VALUE.CUTOFF)
FOLD_CHANGE.CUTOFF <- 1.5
In [85]:
options(repr.plot.width=6, repr.plot.height=6)
p1 <-
all_params_wide %>%
filter(rtmed > 100) %>%
mutate(lbl = ifelse(-log10(p.value) > LOGP.CUTOFF & abs(fold_change) > FOLD_CHANGE.CUTOFF,
mzmed %>% round(4) %>% as.character, NA)) %>%
ggplot(aes(x=intercept + fold_change, y=intercept,
shape = -log10(p.value) > LOGP.CUTOFF,
alpha = -log10(p.value),
color = -log10(p.value) > LOGP.CUTOFF & abs(fold_change) > FOLD_CHANGE.CUTOFF,
label = lbl
)
) +
geom_point() +
geom_text(hjust=-0.1, size=3.5, check_overlap = T) +
geom_abline(slope=1) +
scale_color_manual(values = c('black', 'red')) +
theme(legend.position = 'none') +
scale_x_continuous(name = 'log10(intensity),\nCH3OH-extracted sample') +
scale_y_continuous(name = 'log10(intensity),\nCD3OH-extracted sample')
p1
ggsave('graphs/pos_mode_scatter.pdf', height=6, width=6)
ggsave('graphs/pos_mode_scatter.png', height=6, width=6)
In [86]:
3.01883 * (1-350/1e6)
3.01883 * (1+350/1e6)
In [87]:
three_d_neutrons <- 3.01883
MASS.CUTOFF.PPM <- 100
RT.CUTOFF <- 5
MASS.CUTOFF.HIGH <- three_d_neutrons * (1 + MASS.CUTOFF.PPM/1e6)
MASS.CUTOFF.LOW <- three_d_neutrons * (1 - MASS.CUTOFF.PPM/1e6)
In [88]:
hits <- all_params_wide %>%
filter(p.value < P.VALUE.CUTOFF, abs(fold_change) > FOLD_CHANGE.CUTOFF) %>%
mutate(foo=1) %>%
left_join(., ., by=c('foo')) %>%
filter(abs(rtmed.x - rtmed.y) < RT.CUTOFF,
mzmed.y > mzmed.x, # avoid double-counting
(mzmed.y - mzmed.x) > MASS.CUTOFF.LOW,
(mzmed.y - mzmed.x) < MASS.CUTOFF.HIGH ) %>%
ungroup %>%
mutate(row = row_number()) %>%
select(row, name.x, name.y) %>%
gather(set, feature, -row) %>%
mutate(set = ifelse(set == 'name.x', 'CH3OH', 'CD3OH') %>% as.factor) %>%
mutate(set = fct_relevel(set, 'CH3OH', after=0)) %>%
spread(set, feature)
hit_plot <-
hits %>%
left_join(all_params_wide %>%
mutate(x.h = intercept + fold_change) %>%
select(x.h, y.h = intercept, p.value, mzmed, rtmed),
by=c('CH3OH'='name')) %>%
left_join(all_params_wide %>%
mutate(x.d = intercept + fold_change) %>%
select(x.d, y.d = intercept, p.value, mzmed, rtmed),
by=c('CD3OH'='name')) %>%
select(row, x.h, y.h, x.d, y.d) %>%
gather(var, x, x.h, x.d, -row) %>%
gather(var.y, y, y.h, y.d, -row, -var) %>%
group_by(row) %>%
filter(var == 'x.h' & var.y == 'y.h' | var == 'x.d' & var.y == 'y.d') %>%
mutate(lbl = '', p.value = 0.5)
p1 +
geom_line(data=hit_plot, aes(x=x, y=y, group=row), color='red', alpha=0.5)
ggsave('graphs/pos_mode_scatter_paired.pdf', height=6, width=6)
ggsave('graphs/pos_mode_scatter_paired.png', height=6, width=6)
In [103]:
hits %>%
left_join(all_params_wide %>%
mutate(x.h = intercept + fold_change) %>%
select(x.h, y.h = intercept, p.value, mzmed, rtmed),
by=c('CH3OH'='name')) %>%
left_join(all_params_wide %>%
mutate(x.d = intercept + fold_change) %>%
select(x.d, y.d = intercept, p.value, mzmed, rtmed),
by=c('CD3OH'='name')) %>%
select(CH3OH, CD3OH, CH3OH.intensity=x.h, CD3OH.intensity=y.d,
p.value.CH3OH = p.value.x,
p.value.CD3OH = p.value.y,
mzmed.CH3OH = mzmed.x,
mzmed.CD3OH = mzmed.y,
rtmed.CH3OH = rtmed.x,
rtmed.CD3OH = rtmed.y
) %>%
arrange(rtmed.CH3OH) ->
hit_table
options(repr.plot.width=8, repr.plot.height=4)
hit_table %>%
ggplot(aes(x=rtmed.CH3OH/60, y=mzmed.CH3OH, ymin=0, ymax = mzmed.CH3OH,
size=CH3OH.intensity, label = mzmed.CH3OH %>% round(4))) +
geom_point(color='forestgreen', alpha=0.3) +
geom_errorbar(width=0, size=0.5) +
geom_text(size = 4, hjust=-0.1, check_overlap = T) +
geom_point(aes(x=rtmed.CD3OH/60, y=-mzmed.CD3OH, size=CD3OH.intensity), color='firebrick', alpha=0.5) +
geom_errorbar(aes(x=rtmed.CD3OH/60, ymin=0, ymax=-mzmed.CD3OH, y=-mzmed.CD3OH), width=0, size=0.5) +
geom_text(size = 4, hjust=-0.1, check_overlap = T, aes(label = mzmed.CD3OH %>% round(4), y=-mzmed.CD3OH)) +
theme_cowplot() +
theme(legend.position = 'bottom') +
geom_abline(intercept = 0, slope=0) +
xlab('retention time, min') +
ylab('m/z, Da\n(negative: CD3OH; positive:CH3OH)')
In [104]:
hit_table %>%
mutate(avg_rt = ((rtmed.CH3OH + rtmed.CD3OH) / 2 )) %>%
mutate(approx.rt = avg_rt %>% round(0)) %>%
group_by(approx.rt)
Manual annotation of the hits is continued in the accompanying CH3OH_vs_CD3OH_xcms_hit_annotation.xlsx file.
In [ ]: