Riyue Bao, Ph.D. Center for Research Informatics, The University of Chicago. November 13, 2016
The workshop materials are accessible on Github licensed via LGPLv3.
The test datasets used in this workshop are from Fog. et al. 2015. Loss of PRDM11 promotes MYC-driven lymphomagenesis. Blood, 125(8):1272-81 http://www.bloodjournal.org/content/125/8/1272.long?sso-checked=true
Commonly used tool / package for detection of signficant DEGs include DESeq2
, limma/limmavoom
, edgeR
, etc.
For this workshop, We will demo how to use DESeq2
to detect DEGs between WT and KO groups.
In [1]:
rm(list=ls())
In [2]:
ptm <- proc.time()
In [3]:
##-- List packages required in this analysis
cpan.pkg.list <- c('ggplot2', 'scales', 'ape', 'RColorBrewer',
'reshape','VennDiagram')
bioc.pkg.list <- c('ctc', 'limma', 'edgeR', 'DESeq2', 'vsn',
'genefilter', 'pheatmap',
'clusterProfiler', 'pathview',
'AnnotationHub')
##-- Set up CPAN repo (required if running IRkernel in Jupyter)
# cpan.repos <- 'http://cran.us.r-project.org'
##-- Install CPAN packages
# install.packages('ggplot2', repos=cpan.repos)
# lapply(cpan.pkg.list, suppressMessages(install.packages),
# repos=cpan.repos, character.only = TRUE)
##-- Set up Bioconductor repo
# source("http://bioconductor.org/biocLite.R")
##-- Install Bioc packages
# biocLite('DESeq2')
# lapply(bioc.pkg.list, suppressMessages(biocLite),
# character.only = TRUE)
##-- Load libraries
for(pkg in c(cpan.pkg.list, bioc.pkg.list)) {
print(pkg)
suppressMessages(require(pkg, character.only = TRUE))
}
##-- replace Hs with other (e.g. Mm) if you work on other species
##-- annotation database used by clusterProfiler
# suppressMessages(library(org.Hs.eg.db))
In [4]:
##-- Parameters
cancer <- 'DLBC'
fdr <- 0.05
fc <- 1.5
gene.type <- 'coding'
caller <- 'deseq2'
group1 <- 'KO'
group2 <- 'WT'
colors <- c('#CC0000', '#00CC00')
##-- Set up working directory
work.dir <- '.'
setwd(work.dir)
##-- Input/Output directories
in.dir <- 'ipynb_data/input'
out.dir <- 'ipynb_data/output'
##-- Input/Output files
expr.file <- paste0(cancer, '.raw_counts.tsv')
sample.file <- paste0(cancer, '.sample_group.tsv')
geneinfo.file <- 'gencode.v24.primary_assembly.annotation.gtf.geneinfo'
In [5]:
print(paste0('Cancer = ', cancer))
print(paste0('gene type = ', gene.type))
print(paste0('DEG fdr cutoff = ', fdr))
print(paste0('DEG fc cutoff = ', fc))
print(paste0('Expression file = ', expr.file))
print(paste0('Sample group file = ', sample.file))
print(paste0('Gene info file = ', geneinfo.file))
In [6]:
##-- Read data files
data.expr <- read.delim(paste0(in.dir, '/', expr.file),
header = TRUE, stringsAsFactors=FALSE)
data.sample <- read.delim(paste0(in.dir, '/', sample.file),
header = TRUE, stringsAsFactors=FALSE)
data.geneinfo <- read.delim(paste0(in.dir, '/', geneinfo.file),
header = TRUE, stringsAsFactors = FALSE)
In [7]:
##-- Expression matrix: raw read counts
print(paste0('Expression matrix = ',
nrow(data.expr), ' genes, ',
ncol(data.expr), ' fields'))
data.expr[1:3,]
##-- Sample table: experimental design & groups
print(paste0('Sample table = ',
nrow(data.sample), ' samples, ',
length(table(data.sample$Group)), ' groups'))
data.sample
##-- Gene annotation
print(paste0('Gene annotation = ',
nrow(data.geneinfo), ' genes, ',
ncol(data.geneinfo), ' fields'))
head(data.geneinfo)
In [8]:
data.expr.proc <- data.expr
data.sample.proc <- data.sample
data.geneinfo.proc <- data.geneinfo
##-- Set up row names of each data frame
row.names(data.expr.proc) <- data.expr.proc[,1]
row.names(data.sample.proc) <- data.sample.proc[,1]
row.names(data.geneinfo.proc) <- data.geneinfo.proc[,1]
##-- Expression matrix: remove extra columns
# colnames(data.expr.proc)
data.expr.proc <- data.expr.proc[,c(1,7:12)]
colnames(data.expr.proc)[1] <- 'Gene'
##-- Expression matrix: add gene symbol to Ensembl geneid
##-- from annotation
print(sum(data.expr.proc$Gene %in% data.geneinfo.proc$gene_id)
== nrow(data.expr.proc))
data.expr.proc <- merge(data.geneinfo.proc[,c('gene_id',
'gene_name')],
data.expr.proc, by = 'row.names')
##-- Expression matrix: concat gene symbol & Ensembl id
##-- as unique key for each gene row
data.expr.proc$Gene <- paste0(data.expr.proc$gene_name,
'!', data.expr.proc$gene_id)
data.expr.proc <- data.expr.proc[,-c(1:3)]
row.names(data.expr.proc) <- data.expr.proc[,1]
data.expr.proc <- data.expr.proc[,-1]
##-- Expression matrix: peek into preprocessed data
data.expr.proc[1:3,]
##-- Gene annotation: show gene types
data.frame(table(data.geneinfo.proc$gene_type))
##-- Gene annotation: retrieve Ensembl id only
data.geneinfo.proc.gene <- data.geneinfo.proc[,c(1,2,4)]
data.geneinfo.proc.gene <- unique(data.geneinfo.proc.gene)
print(paste0('Total genes = ', nrow(data.geneinfo.proc.gene)))
##-- Gene annotation: concat gene symbol & Ensembl id
##-- as unique key for each gene row
data.geneinfo.proc.gene <- data.frame(Key =
paste0(data.geneinfo.proc.gene$gene_name,'!',
data.geneinfo.proc.gene$gene_id),
data.geneinfo.proc.gene)
row.names(data.geneinfo.proc.gene) <- data.geneinfo.proc.gene[,1]
data.geneinfo.proc <- data.geneinfo.proc[,-1]
##-- Gene annotation: keep two types of genes,
##-- coding and lincRNA, for further analysis
gene.coding <- data.geneinfo.proc.gene[
data.geneinfo.proc.gene$gene_type == 'protein_coding',]
gene.lincrna <- data.geneinfo.proc[
data.geneinfo.proc.gene$gene_type == 'lincRNA',]
##-- Gene annotation: show how many genes are in each list
print(paste0('Coding genes = ', nrow(gene.coding)))
print(paste0('lincRNA genes = ', nrow(gene.lincrna)))
In [9]:
data.sample.proc.sub <- data.sample.proc
##-- Expression matrix: select gene rows
##-- based on specified gene type (coding or lincrna)
gene.list <- ''
if(gene.type == 'coding') { gene.list <- gene.coding }
if(gene.type == 'lincrna') { gene.list <- gene.lincrna }
data.expr.proc.sub <- data.expr.proc[
row.names(data.expr.proc) %in% row.names(gene.list),]
##-- Expression matrix: sort matrix sample column
##-- to be consistent with sample table
##--(required for DEG analysis)
data.expr.proc.sub <- data.expr.proc.sub[,data.sample.proc.sub$Sample]
# data.expr = data.expr.proc.sub
# save(data.expr, file = paste0(paste0(out.dir, '/', caller, '/', expr.file, '.coding.Data')))
write.table(data.expr.proc.sub,
file = paste0(out.dir, '/', caller, '/', expr.file, '.coding.tsv'),
col.names = TRUE, row.names = TRUE, quote = FALSE, sep = '\t')
print(paste0('Gene list = ',
nrow(gene.list), ' (', gene.type, ')'))
print(paste0('Expression matrix before subsetting: ',
nrow(data.expr.proc), ' genes'))
print(paste0('Expression matrix after subsetting: ',
nrow(data.expr.proc.sub), ' genes'))
In [10]:
out.prefix <- paste0(out.dir, '/', caller, '/',cancer,'.',
gene.type, '.', caller)
##-- DESeq2: note data.sample.proc.sub must
##-- have the same sample column
##-- order as data.expr.proc.sub
Group <- as.factor(data.sample.proc.sub$Group)
##-- DESeq2: covert to matrix data format (previously was data frame)
cds <- as.matrix(data.expr.proc.sub)
##-- DESeq2: build DESeqDataSet object, prepare design matrix
dds <- DESeqDataSetFromMatrix(countData = cds,
colData = data.sample.proc.sub,
design = ~ Group)
##-- DESeq2: note this is just simple filter to reduce mem
##-- no affect on DEG results
# dim(dds)
# dds <- dds[ rowSums(counts(dds)) > 0, ]
# dim(dds)
##-- DESeq2: plot estimated dispersions
# pdf(paste0(out.prefix, '.dispersion.pdf'),width = 7, height = 7)
# plotDispEsts(dds, xlim = c(1,10000), ylim = c(1E-10, 1))
# dev.off()
print(dds)
In [11]:
##-- DESeq2: three normalization algrithms (rld, vsd and vsd.fast)
rld <- rlog(dds, blind=FALSE)
vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
# vsd.fast <- vst(dds, blind=FALSE)
##-- DESeq2: peek into normalized expression matrix
head(assay(rld), 3)
head(assay(vsd), 3)
##-- DESeq2: print normalized expression matrix to local files
write.table(data.frame(Gene = row.names(assay(rld)), assay(rld)),
file = paste0(out.prefix, '.rld.txt'),
sep = '\t', col.names = TRUE, row.names = FALSE, quote = FALSE)
write.table(data.frame(Gene = row.names(assay(vsd)), assay(vsd)),
file = paste0(out.prefix, '.vsd.txt'),
sep = '\t', col.names = TRUE, row.names = FALSE, quote = FALSE)
In [12]:
##-- Set up R plot display options in notebook
options(jupyter.plot_mimetypes = "image/svg+xml")
options(repr.plot.width = 6, repr.plot.height = 5)
##-- DESeq2: remove genes not expressed in any samples
##-- for plottig purposes
notAllZero <- (rowSums(counts(dds))>0)
##-- DESeq2: mean to var plots
# pdf(paste0(out.prefix, '.meanvar.log2.pdf'),width = 7, height = 7)
meanSdPlot(log2(counts(estimateSizeFactors(dds),
normalized=TRUE)[notAllZero,] + 1))
# dev.off()
# pdf(paste0(out.prefix, '.meanvar.rld.pdf'),width = 7, height = 7)
meanSdPlot(assay(rld[notAllZero,]))
# dev.off()
# pdf(paste0(out.prefix, '.meanvar.vsd.pdf'),width = 7, height = 7)
meanSdPlot(assay(vsd[notAllZero,]))
# dev.off()
In [13]:
##-- Set up R plot display options in notebook
options(jupyter.plot_mimetypes = "image/svg+xml")
options(repr.plot.width = 6, repr.plot.height = 5)
##-- DESeq2: sample correlation heatmap
##-- calculate sample distance
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$Group, rld$Libtype, sep="-")
colnames(sampleDistMatrix) <- rownames(sampleDistMatrix)
##-- DESeq2: plot heatmap
heatmap.colors <- rev(cm.colors(32))[1:16]
# pdf(paste0(out.prefix, '.sm_cor.pdf'),width = 7, height = 7)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=heatmap.colors)
# dev.off()
##-- DESeq2: Principal component analysis (PCA) plot of the samples
##-- use ggplot2 to customize the PCA plot
# pdf(paste0(out.prefix, '.pca.pdf'),width = 7, height = 7)
data.pca <- plotPCA(rld, intgroup=c('Group'), returnData=TRUE)
percent.var <- round(100 * attr(data.pca, "percentVar"))
pca.colors <- c(KO = colors[1], WT = colors[2])
p1 <- ggplot(data.pca, aes(PC1, PC2, color = Group)) +
geom_point(size = 5, shape = 17) +
scale_colour_manual(values = pca.colors) +
xlab(paste0("PC1: ",percent.var[1],"% variance")) +
ylab(paste0("PC2: ",percent.var[2],"% variance"))
plot(p1)
In [14]:
##-- Set up R plot display options in notebook
options(jupyter.plot_mimetypes = "image/svg+xml")
options(repr.plot.width = 6, repr.plot.height = 5)
print(paste0('Group 1 = ', group1))
print(paste0('Group 2 = ', group2))
comp <- paste0(group1, 'vs', group2, '.')
out.prefix <- paste0(out.dir, '/', caller,'/',cancer,'.',
gene.type, '.',comp,caller,'.txt')
##-- DESeq2: fit the model and identify DEGs
##-- same as running those three steps:
##-- (1) dds = estimateSizeFactors(dds)
##-- (2) dds = estimateDispersions(dds)
##-- (3) dds = nbinomWaldTest(dds)
dds <- DESeq(dds, test="Wald", betaPrior=TRUE)
res <- results(dds,
contrast=c("Group",group1,group2),
pAdjustMethod ="fdr",
alpha=fdr)
##-- DESeq2: save the result as an R object for reloading
##-- in future analysis
# save(res, file = paste0(out.prefix,'.RData'))
##-- DESeq2: Plot MA to display differential expression
##-- versus expression strength
# pdf(paste0(out.prefix,'.MAplot.pdf'), width=7, height=7)
plotMA(res, main="DESeq2", ylim=c(-2,2))
# dev.off()
##-- DESeq2: peek into DEG data object
summary(res)
mcols(res)$description
res <- as.data.frame(res)
##-- DESeq2: add fold change (via anti-log log2FC)
res$foldChange <- NA
row.pos <- which(! is.na(res$log2FoldChange) &
res$log2FoldChange >= 0)
row.neg <- which(! is.na(res$log2FoldChange) &
res$log2FoldChange < 0)
res$foldChange[row.pos] <- 2^res$log2FoldChange[row.pos]
res$foldChange[row.neg] <- -2^((-1) * res$log2FoldChange[row.neg])
res <- data.frame(id = row.names(res), res)
# print(sum(res$foldChange == 0) == 0)
##-- DESeq2: peek into gene list
res[1:3,]
##-- DESeq2: save DEG output into local file
res.ensembl <- gsub('\\S+[!]', '', res[,1], perl = TRUE)
res.symbol <- gsub('[!]\\S+', '', res[,1], perl = TRUE)
res.print <- data.frame(ENSEMBL = res.ensembl,
SYMBOL = res.symbol,
res[,-c(1:2)])
write.table(res.print,
file = out.prefix,
sep = '\t', col.names = TRUE, row.names = FALSE, quote = FALSE)
In [15]:
##-- Set up R plot display options in notebook
options(jupyter.plot_mimetypes = "image/svg+xml")
options(repr.plot.width = 6, repr.plot.height = 5)
##-- DESeq2: remove nan values for the foldchange == NAN
before <- nrow(res)
res <- res[!is.na(res$foldChange) & ! is.na(res$padj),];
after <- nrow(res)
print(paste0('Genes removed = ', (before - after),
' (fold change is NA)'))
print(paste0('Genes kept = ', after))
print(paste0('Filter DEGs by: fc, ', fc, ', fdr ', fdr))
##-- DESeq2: plot histogram of unadj p-value for sanity check
# pdf(paste0(out.prefix,'.pvalue.hist.pdf'), width=7, height=7)
hist(res$pval, breaks=100,
col="skyblue", border="slateblue",
main="Histogram of unadjusted p-value",
xlab="unajusted p-value")
# dev.off()
##-- DESeq2: filter DEGs
res.flt <- res[(res$foldChange >= fc | res$foldChange <= -fc) &
res$padj < fdr,]
print(paste0('Genes filtered = ', (after - nrow(res.flt)),
' (fc, ', fc, ', fdr ', fdr, ')'))
print(paste0('Genes kept = ', nrow(res.flt)))
##-- DESeq2: peek into filtered gene list
res.flt[1:3,]
##-- DESeq2: save filtered DEG output into local file
if(nrow(res.flt) > 0) {
res.flt.ensembl <- gsub('\\S+[!]', '', res.flt[,1], perl = TRUE)
res.flt.print <- res.print[res.print$ENSEMBL %in% res.flt.ensembl,]
write.table(res.flt.print, paste0(out.prefix,
'.flt.fdr', fdr, '_fc', fc),
col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
}
In [16]:
##-- Set up R plot display options in notebook
options(jupyter.plot_mimetypes = "image/svg+xml")
options(repr.plot.width = 6, repr.plot.height = 5)
##-- DESeq2: continue with significant DEGs from previous step
print(paste0('Genes significant = ', nrow(res.flt),
' (fc, ', fc, ', fdr ', fdr, ')'))
##-- DESeq2: select those sig genes from normalized expression matrix
gene.select <- res.flt$id
data.plot <- assay(rld)
data.plot <- data.plot[row.names(data.plot) %in% gene.select,,
drop = FALSE]
##-- DESeq2: prepare annotation label and colors
data.plot.anno <- as.data.frame(
data.sample.proc.sub[,colnames(data.sample.proc.sub) %in%
c('Group'),
drop = FALSE])
data.plot.anno.colors <- list(Group = c(KO = colors[1], WT = colors[2]))
##-- DESeq2: plot sig DEG heatmap
print(paste0('Heatmap = ', nrow(data.plot), ' genes on the row, ',
ncol(data.plot), ' samples on the column'))
pheatmap(data.plot,
scale = 'row',
cluster_rows = TRUE, cluster_cols = TRUE,
show_rownames = FALSE, show_colnames = TRUE,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "complete",
annotation_row = NA,
annotation_col = data.plot.anno,
annotation_colors = data.plot.anno.colors
)
In [17]:
##-- clusterProfiler: NOTE those commands were tested for clusterProfiler
##-- version 3.0.5 or above. Older versions may not work as they have
##-- options revised/removed in the new versions.
##-- clusterProfiler: prepare input for GO/KEGG enrichment analysis
genes.all <- res.print
genes.sig <- res.flt.print
genes.all$ENSEMBL <- gsub('[.]\\d+', '', genes.all$ENSEMBL, perl = TRUE)
genes.sig$ENSEMBL <- gsub('[.]\\d+', '', genes.sig$ENSEMBL, perl = TRUE)
##-- clusterProfiler: remove genes with fc / pvalue as NA
genes.all <- na.omit(genes.all)
##-- clusterProfiler: add EntrezID
##-- show which keytype is available
keytypes(org.Hs.eg.db)
genes.all.anno <- bitr(geneID = genes.all$ENSEMBL,
fromType = 'ENSEMBL',
toType = c('ENTREZID', 'SYMBOL'),
OrgDb = 'org.Hs.eg.db',
drop = TRUE)
##-- clusterProfiler: save annotated output to local files
write.table(genes.all.anno,
file = paste0(out.prefix,'.anno'),
col.names = TRUE, row.names = FALSE, sep = '\t',
quote = FALSE)
##-- clusterProfiler: remove genes with duplicate EntrezIDs
genes.all.anno <- genes.all.anno[
which(! duplicated(genes.all.anno$ENTREZID)), ]
row.names(genes.all.anno) <- genes.all.anno$ENTREZID
##-- clusterProfiler: add fc and p-value to genes.all.anno
genes.all.anno <- merge(genes.all.anno, genes.all, by = 'ENSEMBL')
row.names(genes.all.anno) <- genes.all.anno$ENTREZID
##-- clusterProfiler: prepare significant gene list
genes.sig.anno <- genes.all.anno[genes.all.anno$ENSEMBL %in%
genes.sig$ENSEMBL,]
print(paste0('Genes significant = ', nrow(genes.sig.anno)))
genes.sig.anno[1:3,1:7]
##-- clusterProfiler: prepare input for GSEA
##-- ranked gene list with decreasing fold change
gene.list <- genes.all.anno$foldChange
names(gene.list) <- genes.all.anno$ENTREZID
gene.list <- sort(gene.list, decreasing = TRUE)
gene.list[1:3]
In [18]:
##-- clusterProfiler: GO over-representation test
ego <- enrichGO( gene = genes.sig.anno$ENTREZID,
universe = genes.all.anno$ENTREZID,
OrgDb = 'org.Hs.eg.db',
ont = "BP",
pAdjustMethod = "fdr",
pvalueCutoff = 0.05,
readable = TRUE)
summary(ego)[1:3,1:7]
##-- clusterProfiler: save GO enrichment output to an R object (faster data import next time!)
save(ego, file = paste0(out.prefix, '.flt.fdr', fdr,'_fc',fc, '.enrichGO.RData'))
In [19]:
##-- Set up R plot display options in notebook
options(jupyter.plot_mimetypes = "image/svg+xml")
options(repr.plot.width = 10, repr.plot.height = 5)
##-- clusterProfiler: if running this cell separately,
##-- load a fresh new copy of ego
load(paste0(out.prefix, '.flt.fdr', fdr,'_fc',fc, '.enrichGO.RData'))
##-- clusterProfiler: enrichGO - drop 5 level of GO terms (too general!)
ego.alllevel <- ego
for(i in 1:5) {
ego <- dropGO(ego, level = i)
}
##-- clusterProfiler: visualization of GO enrichment results
p1 <- barplot(ego, showCategory=20)
p2 <- dotplot(ego, showCategory=20)
# p3 = plotGOgraph(ego, firstSigNodes = 10)
##-- clusterProfiler: show the plots in notebook
plot(p1)
plot(p2)
In [20]:
##-- clusterProfiler: KEGG over-representation test
kk <- enrichKEGG(gene = genes.sig.anno$ENTREZID,
universe = genes.all.anno$ENTREZID,
organism = "human",
pAdjustMethod = "fdr",
pvalueCutoff = 0.05)
summary(kk)[1:3,]
##-- clusterProfiler: save KEGG pathway enrichment output to an R object (faster data import next time!)
save(kk, file = paste0(out.prefix, '.flt.fdr', fdr,'_fc',fc,'.enrichKEGG.RData'))
In [21]:
##-- clusterProfiler: if running this cell separately, load a fresh new copy of kk
load(paste0(out.prefix, '.flt.fdr', fdr,'_fc',fc,'.enrichKEGG.RData'))
##-- clusterProfiler: visualize pathway with all genes
pathway <- 'hsa04010'
plot.title <- paste0(pathway, '.gene_all.pathview.png')
p4 <- pathview(gene.data = gene.list,
pathway.id = pathway,
species = "human",
limit = list(gene=4, cpd=1),
kegg.dir = paste0(out.dir,'/',caller),
out.suffix = paste0('gene_all.pathview'))
<img src='ipynb_data/assets/hsa04010.gene_all.pathview.png', title = 'hsa04010', width = 1000, height = 1000>
In [22]:
proc.time() - ptm
In [23]:
print('Program run finished!')
##-- Print analysis environment (for reproducible research)
sessionInfo()