This notebook replicates the analysis workflow created by Marc Rurik. The workflow wrapped in KNIME nodes can be found here, and is illustrated in the picture below. The workflow is developed to analyze HPLC-Orbitrap measurements split into separate or alternating m/z scan ranges. The input expects to be OpenMS consensusXML files, generated by the TextExporter in OpenMS. The workflow further combines the scan ranges, filter them ultilizing blank samples and QC measurements, and then performs statistical analysis to retrieve reliable features.
The aim of this notebook is to give an user friendly and interactive environment where you can get on-the-fly information about the progress of your data. After each step of the pipeline, we provide you with live feedback, consisting of informative plots.
Illustration by: Marc Rurik
In [ ]:
sessionInfo()
In [ ]:
low = read.csv("../results/alternate_pos_low_mr.csv",
header = FALSE, skip=7,
stringsAsFactors = FALSE,
fill=TRUE,
col.names= paste0("V",seq_len(227)),
sep="")
high = read.csv("../results/alternate_pos_high_mr.csv",
header = FALSE, skip=7,
stringsAsFactors = FALSE,
fill=TRUE,
col.names= paste0("V",seq_len(227)),
sep="")
In [ ]:
print("Head and tail of low mass range")
head(low)
tail(low)
print("Head and tail of high mass range")
head(high)
tail(high)
In [ ]:
source('Functions.R')
In [ ]:
low_parsed <- Parse(low)
high_parsed <- Parse(high)
In [ ]:
low_parsed[low_parsed=="NaN"] <- NA
high_parsed[high_parsed=="NaN"] <- NA
cat("Size of low mass range matrix:", dim(low_parsed))
head(low_parsed)
cat("Size of high mass range matrix: ", dim(high_parsed))
head(high_parsed)
In [ ]:
low_renamed <- renameColumns(low_parsed)
high_renamed <- renameColumns(high_parsed)
if (ncol(low_renamed)!=ncol(high_renamed)) {
if (ncol(low_renamed>ncol(high_renamed))) {
warning("Samples are missing in the high mass range")
} else {
warning("Samples are missing in the low mass range")
}
}
print(names(low_renamed))
In [ ]:
data <- rbind(low_renamed, high_renamed[,names(low_renamed)])
names(data)<-gsub("_",".",names(data))
cat("Number of features: ", nrow(data))
In [ ]:
blankFilterPassed = 20 #Percentage, if the median blank intesity is more than X% of the sample intensity --> contaminant
data.bf <- blankFilter(data,blankFilterPassed)
In [ ]:
# set parameters
ignoreColsPattern = c("TCA", "Blank")
method = "mean"
outlier = c(0.68, 1/0.68)
verbose = TRUE
data.bf.norm <- consensusMapNormalization(data.bf, ignoreColsPattern, method, outlier, verbose)
In [ ]:
TIC<-colSums(apply(data.bf[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)], 2, as.numeric), na.rm=T)
TICnorm<-colSums(data.bf.norm$df[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)], na.rm=T)
# Boxplots before and after normalization
n <- ncol(data.matrix(data.bf[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)])) + 7
cc <- rep("black", length(names(data.bf[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)])))
cc[grep("Pool", names(data.bf[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)]))] <- topo.colors(5)[1]
cc[grep("C", names(data.bf[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)]))] <- topo.colors(5)[2]
cc[grep("L", names(data.bf[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)]))] <- topo.colors(5)[3]
cc[grep("H", names(data.bf[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)]))] <- topo.colors(5)[4]
layout(matrix(c(1, 1, 2, 3, 3, 4), nrow = 2, byrow = TRUE))
boxplot(log2(data.matrix(data.bf[,-which(names(data.bf) %in% data.bf.norm$ignoredCols)])),
main="Before normalization",
col=cc,las=2, cex.axis=0.9, ylab="log2(intensity)")
boxplot(TIC,lwd = 2, frame=F, main="TIC distribution")
stripchart(TIC, vertical = TRUE,
method = "jitter", add = TRUE, pch = 20, col = 'blue')
boxplot(log2(data.matrix(data.bf.norm$df[,-which(names(data.bf.norm$df) %in% data.bf.norm$ignoredCols)])),
main="After normalization",
col=cc, las=2, cex.axis=0.9, ylab="log2(intensity)")
boxplot(TICnorm,lwd = 2, frame=F, main="TIC distribution")
stripchart(TICnorm, vertical = TRUE,
method = "jitter", add = TRUE, pch = 20, col = 'blue')
par(mfrow=c(1,1))
In [ ]:
# set parameters
poolFilterCount = 5
biolReplFilterCount = 2
numReplicates = 3
maxRSD = 25
data.bf.norm.filt <- QAFilter(data.bf.norm$df, poolFilterCount, biolReplFilterCount, numReplicates, maxRSD)
cat(nrow(data.bf.norm.filt), "number of features left")
In [ ]:
data.bf.norm.filt.imputed <- missingValues(data.bf.norm.filt)
In [ ]:
low_mass <- sum(as.numeric(data.bf.norm.filt.imputed$mz)<200)
mid_mass <- sum(as.numeric(data.bf.norm.filt.imputed$mz)>200 & as.numeric(data.bf.norm.filt.imputed$mz)<400)
high_mass <- sum(as.numeric(data.bf.norm.filt.imputed$mz)>400 & as.numeric(data.bf.norm.filt.imputed$mz)<1000)
data <- data.frame(x=c("50-200", "200-400", "400-1000"),
y=c(low_mass,mid_mass,high_mass))
ylimit <- c(0, 1.2*max(data$y))
par(oma=c(2,5,3,5))
xx <- barplot(data$y,
names.arg=data$x,
ylim=ylimit,
ylab="No. of features",
xlab="m/z range",
col=topo.colors(6))
text(x=xx,
y=data$y,
label=data$y,
pos = 3)
title("Number of features detected in selected mass segment", outer = TRUE)
In [ ]:
stat_features <- statisticalAnalysis(data.bf.norm.filt.imputed)
In [ ]:
source('multiplot.R')
FoldChange <- data.frame(stat_features[,grep("fc", names(stat_features))])
pvalues <- data.frame(stat_features[,grep("pval", names(stat_features))])
header <- gsub("_pval", "",names(pvalues))
par(mfrow=c(2,length(FoldChange)/2), oma= c(0,0,11,0), cex.main= 2)
for (i in 1:length(FoldChange)) {
plot(log2(FoldChange[,i]),
-log10(pvalues[,i]),
pch=20,
#xlim=c(-max(log2(FoldChange[,i])),max(log2(FoldChange[,i]))),
xlim=c(-4,4),
ylim=c(0,6),
main = header[i],
cex.main= 1.5,
xlab = "log2(FoldChange)",
ylab = "-log10(pvalue)")
fc <- which(abs(log2(FoldChange[,i]))>1)
p <- which(-log10(pvalues[,i])>-log10(0.01))
fc_p <- intersect(fc,p)
abline(h=-log10(0.01), col="red", lty=2)
abline(v=1, col="orange", lty=2)
abline(v=-1, col="orange", lty=2)
points(log2(FoldChange[fc,i]), # orange = high fold change
-log10(pvalues[fc,i]),
col="orange", pch=20)
points(log2(FoldChange[p,i]), # red = low p-value
-log10(pvalues[p,i]),
col="red", pch=20)
points(log2(FoldChange[fc_p,i]), # green = high fold change and low p_value
-log10(pvalues[fc_p,i]),
col="darkgreen", pch=20)
}
title("Volcano plots", outer=TRUE)
In [ ]:
write.csv(stat_features,"../results/stat_result.csv", row.names = F)