Author: K. Fisch Email: Kfisch@ucsd.edu Date: May 2016
Omics pipe is an open-source, modular computational platform that automates ‘best practice’ multi-omics data analysis pipelines.
This Jupyter notebook wraps the functionality of Omics Pipe into an easy-to-use interactive Jupyter notebook and parses
the output for genomic interpretation. Read more about Omics Pipe at https://pythonhosted.org/omics_pipe/.
In [ ]:
#Omics Pipe Overview
from IPython.display import Image
Image(filename='/data/core_analysis_pipelines/RNAseq/Omics_Pipe_RNAseq_count_based_pipeline/images/op_diagram.png', width=500, height=100)
In [ ]:
#Activate Jupyter Notebook Extensions
import notebook
E = notebook.nbextensions.EnableNBExtensionApp()
E.enable_nbextension('usability/codefolding/main')
E.enable_nbextension('usability/comment-uncomment/main')
E.enable_nbextension('usability/datestamper/main')
E.enable_nbextension('usability/dragdrop/main')
E.enable_nbextension('usability/hide_input/main')
#E.enable_nbextension('usability/read-only/main')
E.enable_nbextension('usability/runtools/main')
E.enable_nbextension('usability/search-replace/main')
E.enable_nbextension('usability/toc/main')
#disable extension
#D = notebook.nbextensions.DisableNBExtensionApp()
#D.disable_nbextension('usability/codefolding/main')
In [ ]:
#Import Omics pipe and module dependencies
import yaml
from omics_pipe.parameters.default_parameters import default_parameters
from ruffus import *
import sys
import os
import time
import datetime
import drmaa
import csv
from omics_pipe.utils import *
from IPython.display import IFrame
import pandas
import glob
import os
import matplotlib.pyplot as plt
%matplotlib inline
#%matplotlib notebook
import qgrid
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, precision=4)
from IPython.display import HTML
import mygene
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d %H:%M")
In [ ]:
#Change top directory to locate result files
os.chdir("/data/core_analysis_pipelines/RNAseq/Omics_Pipe_RNAseq_count_based_pipeline")
In [ ]:
#Omics Pipe documentation: Parameters
IFrame("https://pythonhosted.org/omics_pipe/parameter_file.html", width=700, height=250)
In [ ]:
###Customize parameters: Specify sample names and conditions
sample_names = ["468-3_CTRL-2","468-3_LPS-2","468-4_CTRL","468-4_LPS","468-6_CTRL","468-6_LPS","685-1_CTRL","685-1_LPS","685-2_CTRL-3","685-2_LPS-2","685-5_CTRL-3","685-5_LPS-1","685-7_CTRL-3","685-7_LPS-2","697-1_CTRL-3","697-1_LPS-1","697-2_CTRL","697-2_LPS","697-3_CTRL-3","697-3_LPS-2","697-4_CTRL-2","697-4_LPS-2","697-5_CTRL","697-5_LPS"]
condition = ["Control","LPS","Control","LPS","Control","LPS","Control","LPS","Control","LPS","Control","LPS","Control","LPS","Control","LPS","Control","LPS","Control","LPS","Control","LPS","Control","LPS"]
lib_type = ["single_end"]*len(condition)
pair = ["468-3","468-3","468-4","468-4","468-6","468-6","685-1","685-1","685-2","685-2","685-5","685-5","685-7","685-7","697-1","697-1","697-2","697-2","697-3","697-3","697-4","697-4","697-5","697-5"]
genotype = ["het","het","wt","wt","mut","mut","mut","mut","wt","wt","het","het","het","het","wt","wt","mut","mut","wt","wt","het","het","mut","mut"]
In [ ]:
#Update Metadata File
meta = {'Sample': pandas.Series(sample_names), 'condition': pandas.Series(condition) , 'libType': pandas.Series(lib_type),
'pair': pandas.Series(pair), 'genotype': pandas.Series(genotype)}
meta_df = pandas.DataFrame(data = meta)
deseq_meta_new = "/data/mccoy/new_meta.csv"
meta_df.to_csv(deseq_meta_new,index=False)
print meta_df
In [ ]:
###Update parameters, such as GENOME, GTF_FILE, paths, etc
parameters = "/root/src/omics-pipe/tests/test_params_RNAseq_counts_AWS.yaml"
stream = file(parameters, 'r')
params = yaml.load(stream)
params.update({"SAMPLE_LIST": sample_names})
params.update({"DESEQ_META": deseq_meta_new})
params.update({"R_VERSION": '3.2.3'})
params.update({"GENOME": '/database/Mus_musculus/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa'})
params.update({"STAR_INDEX": '/database/Mus_musculus/Mus_musculus/STAR_index'})
params.update({"REF_GENES": '/database/Mus_musculus/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf'})
params.update({"RAW_DATA_DIR": '/data/mccoy/fastq'})
params.update({"TEMP_DIR": '/data/tmp'})
params.update({"PIPE_MULTIPROCESS": 100})
params.update({"STAR_VERSION": '2.4.5a'})
params.update({"PARAMS_FILE": '/data/mccoy/Omics_Pipe_RNAseq_params.yaml'})
params.update({"LOG_PATH": ':/data/mccoy/logs'})
params.update({"QC_PATH": "/data/mccoy/QC"})
params.update({"FLAG_PATH": "/data/mccoy/flags"})
params.update({"DESEQ_RESULTS": "/data/mccoy/deseq"})
params.update({"STAR_OPTIONS": '--readFilesCommand cat --runThreadN 8 --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical'})
params.update({"REPORT_RESULTS": "/data/mccoy/report"})
params.update({"STAR_RESULTS": "/data/mccoy/star"})
params.update({"HTSEQ_RESULTS": "/data/mccoy/counts"})
params.update({"DESIGN": '~condition'})
#update params
default_parameters.update(params)
#write yaml file
stream = file('updated_params.yaml', 'w')
yaml.dump(params,stream)
p = Bunch(default_parameters)
#View Parameters
print "Run Parameters: \n" + str(params)
In [ ]:
### Omics Pipe Pipelines
from IPython.display import Image
Image(filename='/data/core_analysis_pipelines/RNAseq/Omics_Pipe_RNAseq_count_based_pipeline/images/op_pipelines.png', width=700, height=150)
In [ ]:
###Run Omics Pipe from the command line
!omics_pipe RNAseq_count_based /data/mccoy/updated_params.yaml
In [ ]:
#Change top directory to locate result files
os.chdir("/data/mccoy")
In [ ]:
#Display Omics Pipe Pipeline Run Status
#pipeline = './flags/pipeline_combined_%s.pdf' % date
pipeline = './flags/pipeline_combined_2016-05-16 17:41.pdf'
IFrame(pipeline, width=700, height=500)
Quality control of the raw data (fastq files) was assessed using the tool FastQC (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). The results for all samples are summarized below, and samples are given a PASS/FAIL rating.
In [ ]:
###Summarize FastQC raw data QC results per sample
results_dir = './QC/'
# Below is the complete list of labels in the summary file
summary_labels = ["Basic Statistics", "Per base sequence quality", "Per tile sequence quality",
"Per sequence quality scores", "Per base sequence content", "Per sequence GC content",
"Per base N content", "Sequence Length Distribution", "Sequence Duplication Levels",
"Overrepresented sequences", "Adapter Content", "Kmer Content"]
# Below is the list I anticipate caring about; I leave the full list above in case it turns out later
# I anticipated wrong and need to update this one.
labels_of_interest = ["Basic Statistics", "Per base sequence quality"]
# Look for each file named summary.txt in each subdirectory named *_fastqc in the results directory
summary_wildpath = os.path.join(results_dir, '*/*_fastqc', "summary.txt")
summary_filepaths = [x for x in glob.glob(summary_wildpath)]
#print os.getcwd()
# Examine each of these files to find lines starting with "FAIL" or "WARN"
for curr_summary_path in summary_filepaths:
has_error = False
#print(divider)
with open(curr_summary_path, 'r') as f:
for line in f:
if line.startswith("FAIL") or line.startswith("WARN"):
fields = line.split("\t")
if not has_error:
print(fields[2].strip() + ": PASS") # the file name
has_error = True
if fields[1] in labels_of_interest:
print(fields[0] + "\t" + fields[1])
In [ ]:
#Display QC results for individual samples
sample = "468-3_CTRL-2"
name = './QC/%s/%s_fastqc/fastqc_report.html' % (sample,sample)
IFrame(name, width=1000, height=600)
The samples were aligned to the genome with the STAR aligner (https://github.com/alexdobin/STAR). The alignment statistics for all samples are summarized and displayed below. Samples that do not pass the alignment quality filter (Good quality = # aligned reads > 10 million and % aligned > 60%) are excluded from downstream analyses.
In [ ]:
##Summarize Alignment QC Statistics
import sys
from io import StringIO
star_dir = './star/'
# Look for each file named summary.txt in each subdirectory named *_fastqc in the results directory
summary_wildpath = os.path.join(star_dir, '*/', "Log.final.out")
#summary_wildpath = os.path.join(star_dir, "*Log.final.out")
summary_filepaths = [x for x in glob.glob(summary_wildpath)]
#print summary_filepaths
alignment_stats = pandas.DataFrame()
for curr_summary_path in summary_filepaths:
#with open(curr_summary_path, 'r') as f:
filename = curr_summary_path.replace("./star/","")
filename2 = filename.replace("/Log.final.out","")
df = pandas.read_csv(curr_summary_path, sep="\t", header=None)
raw_reads = df.iloc[[4]]
y = raw_reads[1].to_frame()
aligned_reads = df.iloc[[7]]
z = aligned_reads[1].to_frame()
percent_aligned = df.iloc[[8]]
#print percent_aligned
a = percent_aligned[1]
b = a.to_string()
c = b.replace("%","")
c1 = c.replace("8 ","")
e = float(c1)
d = {"Sample": pandas.Series(filename2), "Raw_Reads": pandas.Series(float(y[1])),
"Aligned_Reads": pandas.Series(float(z[1])),
"Percent_Uniquely_Aligned": pandas.Series(e)}
p = pandas.DataFrame(data=d)
alignment_stats = alignment_stats.append(p)
#print alignment_stats
#View interactive table
qgrid.show_grid(alignment_stats, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#Barplot of number of aligned reads per sample
plt.figure(figsize=(10,10))
ax = plt.subplot(111)
alignment_stats.plot(ax=ax, kind='barh', title='# of Reads')
ax.axis(x='off')
ax.axvline(x=10000000, linewidth=2, color='Red', zorder=0)
#plt.xlabel('# Aligned Reads',fontsize=16)
for i, x in enumerate(alignment_stats.Sample):
ax.text(0, i + 0, x, ha='right', va= "bottom", fontsize='medium')
plt.savefig('./alignment_stats_%s' %date ,dpi=300) # save figure
In [ ]:
###Flag samples with poor alignment or low numbers of reads
df = alignment_stats
failed_samples = df.loc[(df.Aligned_Reads < 10000000) | (df.Percent_Uniquely_Aligned < 60), ['Sample','Raw_Reads', 'Aligned_Reads', 'Percent_Uniquely_Aligned']]
#View interactive table
qgrid.show_grid(failed_samples, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#View Alignment Statistics for failed samples
for failed in failed_samples["Sample"]:
#fname = "/data/results/star/%s/Log.final.out" % failed
fname = "./star/%s/Log.final.out" % failed
with open(fname, 'r') as fin:
print failed + fin.read()
In [ ]:
###Samples that passed QC for alignment
passed_samples = df.loc[(df.Aligned_Reads > 10000000) | (df.Percent_Uniquely_Aligned > 60), ['Sample','Raw_Reads', 'Aligned_Reads', 'Percent_Uniquely_Aligned']]
print "Number of samples that passed alignment QC = " + str(len(passed_samples))
#View interactive table
qgrid.show_grid(passed_samples, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#View Alignment Statistics for passed samples
for passed in passed_samples["Sample"]:
#fname = "/data/results/star/%s/Log.final.out" % passed
fname = "./star/%s/Log.final.out" % passed
with open(fname, 'r') as fin:
print passed + fin.read()
In [ ]:
#Create new metadata file with samples that passed QC for differential expression analyses
passed_list = passed_samples["Sample"]
meta_df_passed = meta_df.loc[meta_df.Sample.isin(passed_list2)]
deseq_meta_new2 = "/data/mccoy/new_meta_QCpassed.csv"
meta_df_passed.to_csv(deseq_meta_new2,index=False)
print meta_df_passed
In [ ]:
print passed_list
The aligned reads were quantifed using RefSeq mm10 annotation using HTSeq-count (http://www-huber.embl.de/users/anders/HTSeq/doc/count.html). The counts for all samples are summarized and displayed below.
In [ ]:
##Set working directory
working_dir <- "/data/mccoy"
setwd(working_dir)
date <- Sys.Date()
In [ ]:
#Set R options
options(jupyter.plot_mimetypes = 'image/png')
options(useHTTPS=FALSE)
options(scipen=500)
Differential expression analysis was performed with DESeq2 in Bioconductor (https://bioconductor.org/packages/release/bioc/html/DESeq2.html). The differentially expressed genes and raw counts for all samples are summarized and displayed below.
In [ ]:
#Load custom R scripts
source("/data/ccbb_tickets/20160504_McCoy_Prince_RNAseq_pathways/src/rnaSeq/RNA_seq_DE.R")
In [ ]:
#Load R packages; Execute this twice to clear the log
require(limma)
require(edgeR)
require(DESeq2)
require(RColorBrewer)
require(cluster)
library(gplots)
library(SPIA)
library(graphite)
library(PoiClaClu)
library(ggplot2)
library(pathview)
library(KEGG.db)
library(mygene)
library(splitstackshape)
library(reshape)
library(hwriter)
library(ReportingTools)
library("EnrichmentBrowser")
library(IRdisplay)
library(repr)
In [ ]:
#Compile individual count files
#======================================================================================================================================
#Specify working directory. Should be the name of your project, and there should be a subfolder within this
#directory named "counts" which contains the raw count files in .txt format for all samples (output from htseq in Omics Pipe)
##Set working directory
setwd(working_dir)
name<- "McCoy_Prince_RNAseq_20160517"
#Reads in count files
dir <- paste(getwd(), "/counts", sep="")
countFiles <- paste(dir, "/", dir(dir), sep='')
countNames1 <- gsub('_counts.txt', '', countFiles)
countNames <- gsub(sprintf("%s/", dir), '', countNames1)
countsDf <- NULL
for (i in countFiles) {
dat <- read.csv(i, header=F, sep="\t", na.strings="", as.is=T)
countsDf <- cbind(countsDf, dat[,2])
}
x1 <- dim(countsDf)[1]-4
x2 <- dim(countsDf)[1]
countsDf <- countsDf[-c(x1:x2),] # remove the last 5 lines, they hold no genes
rownames(countsDf) <- read.csv(i, header=F, sep="\t", na.strings="", as.is=T)$V1[-c(x1:x2)]
colnames(countsDf) <- countNames
write.csv(countsDf, sprintf("%s/%s_ALL_counts.csv", working_dir, name)) #Creates file with all counts in one file
df <- countsDf
geneCount <- df
rc <- rowSums(geneCount)
geneCount <- geneCount[rc > 0,]
N <- colSums(geneCount)
names <- names(N)
print("Top of Raw Counts File:")
head(geneCount)
In [ ]:
##Visualize library size distribution (# aligned reads)
par(oma=c(5,1,1,1) + 0.1)
barplot(N*1e-6,
ylab="Library size (millions)",
main=c("Library size distribution"),
names=names(N),
las=2,
cex.names=0.75
)
In [ ]:
# Preprocess data & read in metadata
#=====================================================================================================================================
#Read in design file. Example in s3://ucsd-ccb-data-analysis/Katie/RNAseq_scripts
#meta <- read.csv(sprintf("%s_design.csv",name), header=T, stringsAsFactor=FALSE)
#Read in design file with good quality samples only
meta <- read.csv(sprintf("%s/new_meta_QCpassed.csv",working_dir), header=T, stringsAsFactors=FALSE)
dds <- DESeqDataSetFromMatrix(countData = geneCount,
colData = meta,
design = ~condition)
#Run differential expression analysis
dds <- DESeq(dds)
In [ ]:
#Create MDS plot for all samples
rld <- rlog(dds)
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep="-" )
mds <- data.frame(cmdscale(samplePoisDistMatrix))
mds <- cbind(mds, colData(rld))
mds <- as.data.frame(mds)
qplot(X1,X2,color=condition,data=mds,size=5, shape=genotype)
In [ ]:
##Run all plotting code and save to PDF
pdf(sprintf("%s/%s_all_samples_plots_%s.pdf", working_dir, name,date))
par(oma=c(5,1,1,1) + 0.1)
barplot(N*1e-6,
ylab="Library size (millions)",
main=c("Library size distribution"),
names=names(N),
las=2,
cex.names=0.75
)
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep="-" )
mds <- data.frame(cmdscale(samplePoisDistMatrix))
mds <- cbind(mds, colData(rld))
mds <- as.data.frame(mds)
qplot(X1,X2,color=condition,data=mds,size=5, shape=genotype)
dev.off()
In [ ]:
#Read in design file with good quality samples only
meta <- read.csv(sprintf("%s/new_meta_QCpassed.csv",working_dir), header=T, stringsAsFactors=FALSE)
#Create new meta files with subsets of samples for desired comparisons
#Failed samples 468-4-LPS, 468-6_CTRL, 697-4_LPS-2
#wt to wt LPS vs Control
desired_samples <- c("468-4_CTRL","685-2_CTRL-3","685-2_LPS-2",
"697-1_CTRL-3","697-1_LPS-1","697-3_CTRL-3","697-3_LPS-2") #removed failed sample 468-4 LPS
desired_design <- "~condition + pair"
name2 <- "WTonly_LPSvsControl"
desired_samples
desired_design
name2
In [ ]:
#Reload count files for desired meta file
df <- countsDf
geneCount <- df
rc <- rowSums(geneCount)
geneCount <- geneCount[rc > 0,]
N <- colSums(geneCount)
names <- names(N)
#Update meta data file
#meta <- meta[match(colnames(geneCount),desired_samples),]
meta <- meta[(colnames(geneCount) %in% desired_samples),]
meta <- meta[complete.cases(meta),]
rownames(meta) <- meta$Sample
print("Meta Data File:")
meta
#Subset geneCounts for desired samples
geneCount <- geneCount[,meta$Sample]
df<-df[,meta$Sample]
check <- cbind(meta$Sample, colnames(geneCount))
group <- meta$condition
print("Top of Counts File:")
head(geneCount)
In [ ]:
#Differential expression
#=====================================================================================================================================
#Normalization of raw counts using deseq
trsLength <- NA
if(is.element("length", colnames(df)))
trsLength <- df[rc > 0,"length"]
norm <- getNormData(df, group, trsLength, addRaw=TRUE)
deseq <- log2(norm$DESeq+1)
sf_deseq <- getSizeFactor(df, group)$DESeq # save the size factors of the ref cohort
write.csv(deseq, sprintf("%s/DEseq_normalized_counts_%s.csv",working_dir,name))
# nonspecic Filtering
sds <- apply(deseq,1,sd)
use <- (sds > quantile(sds, 0.75))
deseqNsf <- deseq[use,]
#Create DEseq dataset from matrix
dds <- DESeqDataSetFromMatrix(countData = geneCount,
colData = meta,
design = formula(desired_design))
#Run differential expression analysis
dds <- DESeq(dds)
ddsClean <- replaceOutliersWithTrimmedMean(dds)
ddsClean <- DESeq(ddsClean)
res <- results(ddsClean)
#res <- results(ddsClean, contrast=c("condition", "LPS", "Control")) #Specify conditions for DE comparison here if more than two conditions
res <- res[order(res$padj),]
write.csv(res, sprintf("%s/DE_genes_%s_%s_%s.csv", working_dir, name, name2, date)) #Writes results of differential expression analysis to this file in your working dir
In [ ]:
#Differentially expressed genes
DE <- subset(res, padj < 0.001) #specify level of DE
DE2 <- subset(DE, abs(log2FoldChange) > 1) #specify level of DE
gene_list <- row.names(DE2)
write.csv(gene_list, sprintf("%s/DE_gene_ID_list_%s_%s_%s.csv", working_dir, name, name2, date))
print("Number of Differentially Expressed Genes padj < 0.001:")
nrow(DE)
print("Number of Differentially Expressed Genes padj < 0.001 and log2FoldChange > 1:")
nrow(DE2)
print("Top of Differentially Expressed Genes List:")
head(as.data.frame(DE2))
print("List of Differentially Expressed Genes to Cut and Copy for Enrichment Analyses")
cat(gene_list)
In [ ]:
##Set working directory
setwd(working_dir)
In [ ]:
#Create distance matrix heatmap and clustering
#pdf(sprintf("%s_%s_plots_%s.pdf", name, name2,date)) #Uncomment this to save all plots to a pdf file
#png(sprintf("%s_%s_plots_%s.png", name, name2,date),res=1200, width=4,height=4, units='in') #Uncomment this to save all plots to a pdf file
rld <- rlog(dds)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
rownames(mat) <- colnames(mat) <- with(colData(dds), paste(Sample, genotype, sep=":"))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13))
In [ ]:
#Create MDS plot for samples in desired comparison
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep="-" )
mds <- data.frame(cmdscale(samplePoisDistMatrix))
mds <- cbind(mds, colData(rld))
mds <- as.data.frame(mds)
qplot(X1,X2,color=condition,data=mds,size=5)
In [ ]:
#Create Heatmap of differentially expressed genes
#DE <- subset(res, padj < 0.05) #specify level of DE
#DE <- subset(top$table, FDR < 0.05) #specify level of DE
DE <- subset(res, padj < 0.001) #specify level of DE
DE2 <- subset(DE, abs(log2FoldChange) > 1) #specify level of DE
#DE <- subset(res, pvalue < 0.01)
useHeat <- row.names(DE2)
deseqHeat <- deseq[useHeat,]
colnames(deseqHeat) <- with(colData(dds), paste(Sample, genotype, sep=":"))
#deseqHeat <-deseqHeat[,]
par(oma=c(5,1,1,1) + 0.1)
heatmap.2(deseqHeat,
Rowv=TRUE,
#Colv=hc,
col=rev(redgreen(75)),
scale="row",
#ColSideColors=unlist(sapply(group, mycol)),
trace="none",
key=TRUE,
cexRow=0.35,
cexCol=1,
dendrogram="both"
#labRow=TRUE
)
In [ ]:
#Create Heatmap of top 100 differentially expressed genes
#DE <- subset(res, padj < 0.05) #specify level of DE
#DE <- subset(top$table, FDR < 0.05) #specify level of DE
DE <- subset(res, padj < 0.001) #specify level of DE
DE2 <- subset(DE, abs(log2FoldChange) > 1) #specify level of DE
DE_100 <- DE2[1:100,]
#DE <- subset(res, pvalue < 0.01)
useHeat <- row.names(DE_100)
deseqHeat <- deseq[useHeat,]
colnames(deseqHeat) <- with(colData(dds), paste(Sample, genotype, sep=":"))
#deseqHeat <-deseqHeat[,]
par(oma=c(5,1,1,1) + 0.1)
heatmap.2(deseqHeat,
Rowv=TRUE,
#Colv=hc,
col=rev(redgreen(75)),
scale="row",
#ColSideColors=unlist(sapply(group, mycol)),
trace="none",
key=TRUE,
cexRow=0.35,
cexCol=1,
dendrogram="both"
#labRow=TRUE
)
In [ ]:
##Run all plotting code and save to PDF
##Set working directory
setwd(working_dir)
#Create distance matrix heatmap and clustering
pdf(sprintf("%s/%s_%s_plots_%s.pdf", working_dir, name, name2,date)) #Uncomment this to save all plots to a pdf file
#png(sprintf("%s_%s_plots_%s.png", name, name2,date),res=1200, width=4,height=4, units='in') #Uncomment this to save all plots to a pdf file
#Create distance matrix
rld <- rlog(dds)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
rownames(mat) <- colnames(mat) <- with(colData(dds), paste(Sample, genotype, sep=":"))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13))
#Create MDS plot for samples in desired comparison
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep="-" )
mds <- data.frame(cmdscale(samplePoisDistMatrix))
mds <- cbind(mds, colData(rld))
mds <- as.data.frame(mds)
qplot(X1,X2,color=condition,data=mds,size=5)
#Create Heatmap of differentially expressed genes
#DE <- subset(res, padj < 0.05) #specify level of DE
#DE <- subset(top$table, FDR < 0.05) #specify level of DE
DE <- subset(res, padj < 0.001) #specify level of DE
DE2 <- subset(DE, abs(log2FoldChange) > 1) #specify level of DE
#DE <- subset(res, pvalue < 0.01)
useHeat <- row.names(DE2)
deseqHeat <- deseq[useHeat,]
colnames(deseqHeat) <- with(colData(dds), paste(Sample, genotype, sep=":"))
#deseqHeat <-deseqHeat[,]
par(oma=c(5,1,1,1) + 0.1)
heatmap.2(deseqHeat,
Rowv=TRUE,
#Colv=hc,
col=rev(redgreen(75)),
scale="row",
#ColSideColors=unlist(sapply(group, mycol)),
trace="none",
key=TRUE,
cexRow=0.35,
cexCol=1,
dendrogram="both"
#labRow=TRUE
)
#Create Heatmap of top 100 differentially expressed genes
#DE <- subset(res, padj < 0.05) #specify level of DE
#DE <- subset(top$table, FDR < 0.05) #specify level of DE
DE <- subset(res, padj < 0.001) #specify level of DE
DE2 <- subset(DE, abs(log2FoldChange) > 1) #specify level of DE
DE_100 <- DE2[1:100,]
#DE <- subset(res, pvalue < 0.01)
useHeat <- row.names(DE_100)
deseqHeat <- deseq[useHeat,]
colnames(deseqHeat) <- with(colData(dds), paste(Sample, genotype, sep=":"))
#deseqHeat <-deseqHeat[,]
par(oma=c(5,1,1,1) + 0.1)
heatmap.2(deseqHeat,
Rowv=TRUE,
#Colv=hc,
col=rev(redgreen(75)),
scale="row",
#ColSideColors=unlist(sapply(group, mycol)),
trace="none",
key=TRUE,
cexRow=0.35,
cexCol=1,
dendrogram="both"
#labRow=TRUE
)
dev.off()
In [ ]:
##Set working directory
setwd(working_dir)
In [ ]:
###Annotated differential expression results for all genes
all_results <- as.data.frame(res)
id_list_all <- row.names(res)
out_all<-queryMany(id_list_all, scopes="symbol", fields="entrezgene", species="mouse")
In [ ]:
##Merge annotations with original DE results
merged_all <- merge(all_results, out_all, by.x="row.names", by.y="query", all.x=TRUE)
merged_all_sub <- subset(merged_all, !is.na(merged_all$entrezgene))
head(merged_all_sub)
nrow(merged_all_sub)
In [ ]:
#Prepare Differentially expressed genes and All genes for SPIA input
DE <- subset(res, padj < 0.001) #specify level of DE
DE2 <- subset(DE, abs(log2FoldChange) > 1) #specify level of DE
id_list <- row.names(DE2)
out<-queryMany(id_list, scopes="symbol", fields="entrezgene", species="mouse")
merged <- merge(data.frame(DE2), out, by.x="row.names", by.y="query", all.x=TRUE)
merged_sub <- subset(merged, !is.na(merged$entrezgene))
DE_genes1 <- as.vector(merged_sub$log2FoldChange)
DE_genes2 <- gsub("Inf", 5, DE_genes1)
DE_genes2 <- as.numeric(DE_genes2)
DE_genes <- gsub("-Inf", -5, DE_genes2)
DE_genes <-as.numeric(DE_genes)
names(DE_genes) <- merged_sub$entrezgene
head(DE_genes)
ALL_genes <- merged_all_sub$entrezgene
head(ALL_genes)
In [ ]:
##Set working directory
setwd(working_dir)
In [ ]:
##Run SPIA
res = spia(de=DE_genes, all=ALL_genes, organism="mmu", nB=2000, plots=FALSE, beta=NULL, combine="fisher" ) #MAYBE NEED TO ADD DATADIR
write.csv(res, file = sprintf("%s/spia_output__%s_%s_fisher.csv", working_dir, name, date))
In [ ]:
#View top of the results table
head(res)
In [ ]:
##Set working directory
setwd(working_dir)
In [ ]:
##Download, run, and prepare mmu databases
#setwd(working_dir)
kegg.gs.mmu <- get.kegg.genesets("mmu")
go.gs.mmu <- get.go.genesets(org="mmu", onto="BP", mode="GO.db")
pwys.mmu <- download.kegg.pathways("mmu")
mmu.grn <- compile.grn.from.kegg(pwys.mmu)
In [ ]:
##Set working directory
setwd(working_dir)
In [ ]:
gene_ids_from_merged_all <- merged_all_sub$Row.names
merged_all_sub_unique <- merged_all_sub[!duplicated(merged_all_sub$Row.names),]
unique_genes <- gene_ids_from_merged_all[!duplicated(gene_ids_from_merged_all)]
#length(unique_genes)
exprs1 <- subset(geneCount, rownames(geneCount) %in% unique_genes)
exprs <- as.matrix(exprs1)
row.names(exprs) <- NULL
colnames(exprs) <- NULL
#nrow(exprs)
write.table(exprs, sprintf("/data/mccoy/DE_exprs_%s_%s_%s.tab", name, name2, date), sep="\t",row.names = F,col.names = F)
pdat1 <- data.frame("names" =colnames(geneCount))
#pdat1
meta_merge <- merge(pdat1, meta, by.x = "names", by.y="Sample")
#meta_merge
meta_merge$condition_binary <- ifelse(meta_merge$condition == "LPS", 1, 0)
pdat2 <- data.frame(meta_merge$names, meta_merge$condition_binary, meta_merge$pair)
pdat <- as.matrix(pdat2)
row.names(pdat) <- NULL
colnames(pdat) <- NULL
write.table(pdat, sprintf("/data/mccoy/DE_pdat_%s_%s_%s.tab", name, name2, date), sep="\t",row.names = F,col.names = F)
fdat1 <- data.frame("names"= row.names(exprs1))
fdat2 <- merge(fdat1, merged_all_sub_unique, by.x="names", by.y="Row.names")
fdat <- data.frame(fdat2$entrezgene)
#nrow(fdat)
#head(fdat)
write.table(fdat, sprintf("/data/mccoy/DE_fdat_%s_%s_%s.tab", name, name2, date), sep="\t",row.names = F,col.names = F)
#Create fdat from DE results instead of built in DE function from EnrichmentBrowser
fdat_DE <- data.frame("ENTREZID" = merged_all_sub_unique$entrezgene, "FC" = merged_all_sub_unique$log2FoldChange,
"ADJ.PVAL" = merged_all_sub_unique$padj, "DESeq.STAT" = merged_all_sub_unique$stat)
row.names(fdat_DE) <- fdat_DE$ENTREZID
head(fdat_DE)
write.table(fdat_DE, sprintf("/data/mccoy/DE_fdat_DEresults_%s_%s_%s.tab", name, name2, date), sep="\t",row.names = F,col.names = F)
#Create Expression Set from real data, does not include DE expression results
eset_raw <- read.eset(exprs.file=sprintf("/data/mccoy/DE_exprs_%s_%s_%s.tab", name, name2, date), pdat.file=sprintf("/data/mccoy/DE_pdat_%s_%s_%s.tab", name, name2, date),
fdat.file=sprintf("/data/mccoy/DE_fdat_%s_%s_%s.tab", name, name2, date), data.type='rseq')
#Create ExpressionSet from real data, include DE expression results as fdata
eset_DE <- read.eset(exprs.file=sprintf("/data/mccoy/DE_exprs_%s_%s_%s.tab", name, name2, date), pdat.file=sprintf("/data/mccoy/DE_pdat_%s_%s_%s.tab", name, name2, date),
fdat.file=sprintf("/data/mccoy/DE_fdat_DEresults_%s_%s_%s.tab", name, name2, date), data.type='rseq')
#Fix column names for eset_DE and check
colnames(fData(eset_DE)) <- c("ENTREZID", "FC", "ADJ.PVAL", "DESeq.STAT")
#Recode pvalues to capture desired significance level
fData(eset_DE)$ADJ.PVAL <- as.numeric(fData(eset_DE)$ADJ.PVAL)
fData(eset_DE)$FC <- as.numeric(fData(eset_DE)$FC)
fData(eset_DE)$ADJ.PVAL[is.na(fData(eset_DE)$ADJ.PVAL)] <- 1
fData(eset_DE)$FC[is.na(fData(eset_DE)$FC)] <- 0
class(fData(eset_DE)$ADJ.PVAL)
class(fData(eset_DE)$FC)
head(fData(eset_DE))
#View plots of ExpressionSets
par(mfrow=c(1,2))
pdistr(fData(eset_DE)$ADJ.PVAL)
volcano(fData(eset_DE)$FC, fData(eset_DE)$ADJ.PVAL)
In [ ]:
###Run SBEA kegg for original pvalues
sbea.res.kegg <- sbea(method="ora", eset=eset_DE, gs=kegg.gs.mmu, perm=0, alpha=0.001, beta = 1, padj.method="BH",
out.file=sprintf("%s/SBEA_KEGG_results_%s_%s_%s.txt", working_dir, name, name2, date))
#, out.file="/data/mccoy/SBEA_KEGG_RESULTS_test.txt",beta = 1, sig.stat='&')
#This works and gives good concordance with Webgestalt and ToppGene
#Basic Overrepresentation Analysis
sbea.res.kegg <- sbea(method="ora", eset=eset_DE, gs=kegg.gs.mmu, perm=0, alpha=0.001, beta = 1, padj.method="BH")
gs.ranking(sbea.res.kegg,signif.only=TRUE)
In [ ]:
###Enrichment Browser functions not being recognized for some reason. Defining them here works.
determine.edge.color <- function(edge.cons){
ifelse(edge.cons < 0, rgb(0,0,abs(edge.cons)), rgb(abs(edge.cons),0,0))
}
is.consistent <-function (grn.rel) {
act.cons <- mean(abs(grn.rel[1:2]))
if (length(grn.rel) == 2)
return(act.cons)
if (sum(sign(grn.rel[1:2])) == 0)
act.cons <- -act.cons
return(ifelse(grn.rel[3] == 1, act.cons, -act.cons))
}
In [ ]:
#Create EnrichmentBrowser Html Report with Pathway Viz
setwd("/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports")
ea.browse(sbea.res.kegg)
#Compress and Move html results from default EnrichmentBrowser directory to desired directory
dir.create(sprintf("%s/EnrichmentBrowser", working_dir), showWarnings=FALSE)
zip(zipfile = sprintf("/data/mccoy/EnrichmentBrowser/SBEA_KEGG_results_%s_%s_%s.zip", name, name2, date), "/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports")
In [ ]:
###Run SBEA GO for original pvalues
###sbea.res.go <- sbea(method="ora", eset=eset_DE, gs=go.gs.mmu, perm=0, alpha=0.001, beta = 1, padj.method="BH",
# #out.file=sprintf("%s/SBEA_GO_results_%s_%s_%s.txt", working_dir, name, name2, date))
#, out.file="/data/mccoy/SBEA_KEGG_RESULTS_test.txt",beta = 1, sig.stat='&')
#This works and gives good concordance with Webgestalt and ToppGene
#Basic Overrepresentation Analysis
##sbea.res.go <- sbea(method="ora", eset=eset_DE, gs=go.gs.mmu, perm=0, alpha=0.001, beta = 1, padj.method="BH")
##gs.ranking(sbea.res.go,signif.only=TRUE)
In [ ]:
#Create EnrichmentBrowser Html Report with Pathway Viz
#this works!!
#setwd("/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports")
##ea.browse(sbea.res.go)
#Compress and Move html results from default EnrichmentBrowser directory to desired directory
##zip(zipfile = sprintf("/data/mccoy/EnrichmentBrowser/SBEA_GO_results_%s_%s_%s.zip", name, name2, date), "/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports")
In [ ]:
#Network based enrichment analysis using EnrichmentBrowser
#kegg.gs.mmu <- get.kegg.genesets("mmu")
#go.gs.mmu <- get.go.genesets(org="mmu", onto="BP", mode="GO.db")
#pwys.mmu <- download.kegg.pathways("mmu")
#mmu.grn <- compile.grn.from.kegg(pwys.mmu)
# perform GGEA using the compiled KEGG regulatory network
nbea.res <- nbea(method="ggea", eset=eset_DE, gs=kegg.gs.mmu, grn=mmu.grn)
gs.ranking(nbea.res)
In [ ]:
#View network
par(mfrow=c(1,2))
ggea.graph(
gs=kegg.gs.mmu[["mmu04145_Phagosome"]],
grn=mmu.grn, eset=eset_DE)
In [ ]:
ggea.graph.legend()
In [ ]:
#Combine enrichment results from different analysis methods
res.list <- list(sbea.res.kegg, nbea.res)
comb.res <- comb.ea.results(res.list)
ea.browse(comb.res, graph.view=mmu.grn, nr.show=5)
#Compress and Move html results from default EnrichmentBrowser directory to desired directory
zip(zipfile = sprintf("%s/EnrichmentBrowser/SBEA_Combined_results_%s_%s_%s.zip", working_dir, name, name2, date), "/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports")
In [ ]:
#Display Pathway Results for KEGG pathway Lysosome
display_png(file="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04142_volc.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04142_kpath.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04142_hmap.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04142_hmap2.png")
In [ ]:
#Display Pathway Results for KEGG pathway Phagosome
display_png(file="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04145_volc.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04145_kpath.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04145_hmap.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04145_hmap2.png")
In [ ]:
#Display Pathway Results for KEGG pathway Leukocyte transendothelial migration
display_png(file="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04670_volc.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04670_kpath.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04670_hmap.png")
display_png(file ="/root/anaconda3/lib/R/library/EnrichmentBrowser/results/reports/mmu04670_hmap2.png")