Riyue Bao, Ph.D. Center for Research Informatics, The University of Chicago. November 13, 2016
The workshop materials are accessible on Github licensed via LGPLv3.
The analysis takes 3 major steps.
<img src='ipynb_data/assets/Figure26.png', title = 'TCGA & Clinical workflow', width = 900, height = 900>
The Cancer Genome Atlas, Ovrian Cancer (TCGA-OV), miRNA gene expression and Clinical data.
The Cancer Genome Atlas Research Network. 2011. [Integrated genomic analyses of ovarian carcinoma](http://www.nature.com/nature/journal/v474/n7353/full/nature10166.html). Nature 474, 609–615)
R 3.3
or above)<img src='ipynb_data/assets/Figure23.2.png', title = 'GDC', width = 600, height = 600>
Commonly used tool / package for survival include survival
.
In this workshop, We will demo how to use survival
to identify if PRDM11 gene expression is a prognosis factor of DLBC.
In [1]:
rm(list=ls())
set.seed(13)
In [2]:
ptm <- proc.time()
In [3]:
##-- Those packages were pre-installed in the AWS machine
pkg.list <- c('ggplot2', 'RColorBrewer', 'reshape', 'broom',
'gplots', 'survival', 'NMF', 'IRdisplay')
for(pkg in pkg.list) {
print(pkg)
suppressMessages(library(pkg, character.only = TRUE))
}
In [4]:
separator <- '========================================'
##-- Parameters
cancer <- 'OV'
gene.top.count <- 150
##-- Set up working directory
work.dir <- '.'
setwd(work.dir)
##-- Input/Output directories
in.dir <- 'ipynb_data/input'
out.dir <- 'ipynb_data/output/tcga_ov'
##-- Input/Output files
expr.file <- paste0('TCGA_', cancer, '.mirna_expression.tsv')
clinical.file <- paste0('TCGA_', cancer, '.clinical.tsv')
In [5]:
print(paste0('Cancer = ', cancer))
print(paste0('Expression file = ', expr.file))
print(paste0('Clinical file = ', clinical.file))
In [6]:
##-- Read files
expr <- read.delim(paste0(in.dir,'/',expr.file),
header = TRUE, stringsAsFactors = FALSE)
clinical <- read.delim(paste0(in.dir,'/',clinical.file),
header = TRUE, stringsAsFactors = FALSE)
clinical <- na.omit(clinical)
print(paste0('Patients with complete clinical = ', nrow(clinical)-1))
print(paste0('Patients with gene expression = ', ncol(expr)-1))
print(paste0('Overlap = ', length(intersect(clinical$sample,
colnames(expr)))))
print(separator)
print('Show the first three rows of clinical file:')
print(clinical[1:3,])
print(separator)
print('Show the first three rows and left five columns of expression file:')
print(expr[1:3,1:5])
In [7]:
##-- Preprocess row.names(clinical) = clinical[,1]
row.names(expr) <- expr[,1]
expr <- as.matrix(expr[,-1])
##-- median-centered normalization by gene (for NMF clustering only!)
expr.centered <- t(apply(expr,1,function(x){x-median(x)}))
##-- calculate variance: MAD
expr.var <- data.frame(mad = apply(expr.centered,1,mad))
##-- sort gene by MAD values (higher to lower)
expr.var <- expr.var[rev(order(expr.var[,1])),,drop=FALSE]
print(paste0('Calcuate and sort gene by Median absolute deviation (MAD):'))
head(expr.var)
##-- select 150 most variable genes
expr.var.top <- expr.var[1:gene.top.count,,drop=FALSE]
gene.top <- data.frame(gene = row.names(expr.var.top))
print(paste0('Select top ', gene.top.count,' most variable genes'))
print(expr.var.top[1:6, , drop = FALSE])
##-- subset expression matrix by genes and samples
expr.sub <- expr.centered[row.names(expr.centered) %in%
gene.top$gene,colnames(expr.centered) %in%
clinical$sample]
##-- make clinical samples consistent with expression
clinical <- clinical[clinical$sample %in%
colnames(expr.sub),]
##-- convert expression matrix to rank matrix (Important for NMF!)
##-- because no negative values are allowed in the matrix
expr.sub <- apply(expr.sub,2,rank) / gene.top.count
In [8]:
print(paste0('Expression matrix ready for NMF clustering: ',
nrow(expr.sub), ' genes, ',
ncol(expr.sub), ' samples'))
print(separator)
##-- run NMF to cluster samples & genes
##-- (use 4 core 'p4', and print more info 'v')
##-- takes 15 minutes to run... skip in workshop
print('NMF clustering start...')
# expr.sub.nmf <- nmf(expr.sub,
# rank = 3,
# method = 'brunet',
# seed = 'random',
# nrun = 100,
# .opt = 'vp4')
##-- workshop only: load already generated result
load(paste0(out.dir,'/',expr.file,'.nmf.RData'))
print('Done!')
In [9]:
##-- Retrieve sample & gene clusters and add to clinical table
##-- retrieve the basis matrix and coef matrix
expr.sub.nmf.w <- basis(expr.sub.nmf)
expr.sub.nmf.h <- coef(expr.sub.nmf)
##-- retrieve gene cluster
expr.sub.nmf.geneclr <- predict(expr.sub.nmf, 'features')
expr.sub.nmf.geneclr <- data.frame(gene = row.names(expr.sub.nmf.w),
cluster = expr.sub.nmf.geneclr)
row.names(expr.sub.nmf.geneclr) <- expr.sub.nmf.geneclr$gene
expr.sub.nmf.geneclr <- expr.sub.nmf.geneclr[
order(expr.sub.nmf.geneclr$cluster),]
print('Gene clusters ... ')
print(table(expr.sub.nmf.geneclr$cluster))
print(expr.sub.nmf.geneclr[1:3,])
##-- retrieve sample cluster
expr.sub.nmf.smclr <- predict(expr.sub.nmf)
expr.sub.nmf.smclr <- data.frame(sample = colnames(expr.sub.nmf.h),
cluster = expr.sub.nmf.smclr)
row.names(expr.sub.nmf.smclr) <- expr.sub.nmf.smclr$sample
expr.sub.nmf.smclr <- expr.sub.nmf.smclr[
order(expr.sub.nmf.smclr$cluster),]
print('Sample clusters ... ')
print(table(expr.sub.nmf.smclr$cluster))
print(expr.sub.nmf.smclr[1:3,])
##-- add sample cluster to clinical table
clinical <- merge(clinical, expr.sub.nmf.smclr, by = 'sample')
clinical$cluster <- as.numeric(clinical$cluster)
clinical <- clinical[order(clinical$cluster),]
clinical$cluster <- as.character(clinical$cluster)
##-- Plot sample correlation and gene expression heatmaps
##-- prepare for plotting heatmaps
gene.counts <- data.frame(table(expr.sub.nmf.geneclr$cluster))
gene.colors <- c(rep('pink',gene.counts[1,2]),
rep('purple',gene.counts[2,2]),
rep('lightgreen',gene.counts[3,2]))
sample.counts <- data.frame(table(expr.sub.nmf.smclr$cluster))
sample.colors <- c(rep('red',sample.counts[1,2]),
rep('blue',sample.counts[2,2]),
rep('green',sample.counts[3,2]))
##-- calculate expression correlation between samples
expr.sub.srt <- expr.sub[,clinical$sample]
expr.sub.srt <- expr.sub.srt[row.names(expr.sub.srt) %in%
expr.sub.nmf.geneclr$gene,]
expr.sub.srt <- expr.sub.srt[as.character(expr.sub.nmf.geneclr$gene),]
expr.sub.cor <- cor(expr.sub.srt)
##-- plot sample correlation heatmap
my.heatcol <- bluered(177)
my.breaks <- sort(unique(c(seq(-1, -0.5, length.out=20),
seq(-0.5, 0.5, length.out=140),
seq(0.5, 1, length.out=20))))
centered <- t(scale(t(expr.sub.cor), scale=FALSE))
##-- skip in workshop!!
# png(file=paste0(out.dir,'/',expr.file,'.nmf.cor_heatmap.png'), width=800, height=800)
# heatmap <- heatmap.2(centered,
# dendrogram='none',
# Rowv=NULL,
# Colv=NULL,
# col=my.heatcol,
# RowSideColors=sample.colors,
# ColSideColors=sample.colors,
# density.info='none',
# trace='none',
# key=TRUE, keysize=1.2,
# labRow=FALSE,labCol=FALSE,
# xlab='Samples',ylab='Samples',
# main = 'Sample correlation heatmap')
# dev.off()
##-- plot gene expression heatmap
my.heatcol <- bluered(177)
centered <- t(scale(t(expr.sub.srt), scale=FALSE))
##-- skip in workshop!!
# png(file=paste0(out.dir,'/',expr.file,'.nmf.gene_heatmap.png'), width=800, height=800)
# heatmap <- heatmap.2(centered,
# dendrogram='none',
# Rowv=NULL,
# Colv=NULL,
# col=my.heatcol,
# RowSideColors=gene.colors,
# ColSideColors=sample.colors,
# density.info='none',
# trace='none',
# key=TRUE, keysize=1.2,
# labRow=FALSE,labCol=FALSE,
# xlab='Samples',ylab='Genes',
# main = 'Gene expression heatmap')
# dev.off()
##-- directly view pre-generated heatmaps (workshop only)
display_png(file='ipynb_data/assets/Figure22.2.png')
print(table(expr.sub.nmf.smclr$cluster))
print(table(expr.sub.nmf.geneclr$cluster))
In [10]:
##-- Set up R plot display options in notebook
options(jupyter.plot_mimetypes = "image/svg+xml")
options(repr.plot.width = 5, repr.plot.height = 5)
##-- add right censoring
table(clinical$vital.status)
clinical$censor <- NA
clinical$censor[which(clinical$vital.status == 'LIVING')] <- 0
clinical$censor[which(clinical$vital.status == 'DECEASED')] <- 1
clinical$censor <- as.numeric(clinical$censor)
##-- KM plot
surv <- Surv(clinical$overall.survival.day, clinical$censor)
surv.fit <- survfit(surv ~ clinical$cluster)
plot(surv.fit, mark=4, col=c('#00CC00','#0000CC','#CC0000'),
lty=1, lwd=1.5,cex=0.8,cex.lab=1.5, cex.axis=1.5, cex.main=1,
main='Kaplan-Meier survival curves for TCGA OV dataset',
xlab='Days to Death',
ylab='Probability of Survival')
text(1500,0.08, labels=paste0('cluster1 (n=',sample.counts[1,2],')'),
cex=1.2, col='#00CC00')
text(650,0.40, labels=paste0('cluster2\n(n=',sample.counts[2,2],')'),
cex=1.2, col='#0000CC')
text(2500,0.68, labels=paste0('cluster3 (n=',sample.counts[3,2],')'),
cex=1.2, col='#CC0000')
separator
##-- log-rank test
print(paste0('Running log-rank test for survival'))
surv <- Surv(clinical$overall.survival.day, clinical$censor)
surv.diff <- survdiff(surv ~ clinical$cluster)
surv.diff
separator
In [11]:
##-- cox pp haz model (comparing cluster 2 and 3 as an example!)
print(paste0('Running Cox proportional hazards model for survival'))
clinical.sub <- clinical[clinical$cluster %in% c(1,2,3),]
surv <- Surv(clinical.sub$overall.survival.day,
clinical.sub$censor)
print('Cluster 1 will be set as baseline ... ')
separator
##-- univariate cox model
print(paste0('(a) Univariate cox model for survival'))
m1 <- coxph(surv~(clinical.sub$cluster))
tidy(m1)
separator
##-- full cox model
print(paste0('(b) Full multivariate cox model for survival'))
m2 <- coxph(surv~(clinical.sub$cluster +
clinical.sub$age.at.diagnosis.year +
clinical.sub$tumor.grade))
tidy(m2)
separator
##-- reduced cox model
##-- Always use the simplest model with the least necessary amount
##-- of covariates to minimize overfitting
print(paste0('(c) Reduced multivariate cox model for survival'))
m3 <- coxph(surv~(clinical.sub$cluster +
clinical.sub$age.at.diagnosis.year))
tidy(m3)
In [12]:
proc.time() - ptm
In [13]:
print('Program run finished!')
##-- Print analysis environment (for reproducible research)
sessionInfo()