Goal: Create Jupyter notebook cell that runs an enrichment analysis with GSEApy
Steps:
Import gene list from differential expression analysis
Implement GSEApy (http://software.broadinstitute.org/gsea/index.jsp ; http://pythonhosted.org/gseapy/index.html)
Display significant GSEA results and graphs.
In [ ]:
##importing python module
import os
import pandas
import numpy
import gseapy
import mygene
import ipywidgets
import qgrid
import urllib2
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, precision=4)
from IPython.display import IFrame
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
%matplotlib inline
##change directory
os.chdir("/Users/nicole/Documents/CCBB Internship")
In [ ]:
##read in DESeq2 results
genes=pandas.read_csv("/Users/nicole/Documents/CCBB Internship/DE_genes.csv")
##View interactive table
#qgrid.show_grid(genes.sample(1000), grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})
#View top of file
genes.head(10)
In [ ]:
#Extract genes that are differentially expressed with a pvalue less than a certain cutoff (pvalue < 0.05 or padj < 0.05)
genes_DE_only = genes.loc[(genes.padj < 0.05)]
#View top of file
genes_DE_only.head(10)
In [ ]:
#Check how many rows in original genes file
len(genes)
In [ ]:
#Check how many rows in DE genes file
len(genes_DE_only)
In [ ]:
#Extract list of DE genes (Check to make sure this code works, this was adapted from a different notebook)
de_list = genes_DE_only[genes_DE_only.columns[0]]
#Remove .* from end of Ensembl ID
de_list2 = de_list.replace("\.\d","",regex=True)
#Add new column with reformatted Ensembl IDs
genes_DE_only["Full_Ensembl"] = de_list2
#View top of file
genes_DE_only.head(10)
In [ ]:
#Set up mygene.info API and query
mg = mygene.MyGeneInfo()
gene_ids = mg.getgenes(de_list2, 'name, symbol, entrezgene', as_dataframe=True)
gene_ids.index.name = "Ensembl"
gene_ids.reset_index(inplace=True)
#View top of file
gene_ids.head(10)
In [ ]:
#Merge mygene.info query results with original DE genes list
DE_with_ids = genes_DE_only.merge(gene_ids, left_on="Full_Ensembl", right_on="Ensembl")
#Check top of file
DE_with_ids.head(10)
In [ ]:
#Write results to file
DE_with_ids.to_csv("./DE_genes_converted.csv")
In [ ]:
#Convert to .txt file
DE_with_ids.to_csv("./DE_genes_converted.txt", sep='\t')
In [ ]:
#Generate rank file (symbol, padj)
cols = DE_with_ids.columns.tolist()
cols.insert(0, cols.pop(cols.index('symbol')))
cols.insert(1, cols.pop(cols.index('log2FoldChange')))
prerank_file = DE_with_ids.reindex(columns= cols)
#Condense dataframe to contain only symbol & log2FoldChange
prerank_file.drop(prerank_file.columns[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11,12,13]], axis=1, inplace=True)
#Exclude NaN values
prerank_file.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
#Change column titles
prerank_file.columns = ['gene_name', 'rank']
prerank_file[['rank']] = prerank_file[['rank']].astype(float)
prerank_file.head(10)
In [ ]:
#Display data types
prerank_file.dtypes
In [ ]:
#Write results to file
prerank_file.to_csv("./prerank_file.csv", index=False)
#Remove header and separate by tab (.rnk format)
prerank_file.to_csv("./prerank_file.rnk", sep='\t', index=False, float_format='%.25f', header=False)
In [ ]:
#Open .rnk file
pr = pandas.read_csv("/Users/nicole/Documents/CCBB Internship/prerank_file.rnk", sep='\t')
In [ ]:
#gmt = pandas.read_csv("/Users/nicole/Documents/CCBB Internship/msigdb.v5.1.symbols.gmt-2.txt", sep = ' ')
#gmt.to_csv("/Users/nicole/Documents/CCBB Internship/msigdb.v5.1.symbols.gmt", sep = ' ')
In [ ]:
#GSEApy prerank method to calculate es, nes, pval,fdrs, and produce figures
pr_results = gseapy.prerank(rnk = "/Users/nicole/Documents/CCBB Internship/prerank_file.rnk",
gene_sets = "/Users/nicole/Documents/CCBB Internship/msigdb.v5.1.symbols.gmt",
outdir='/Users/nicole/ccbb_internal/interns/Nicole/gseapy_output',
permutation_n=1000, graph_num = 539, format = 'png')
In [ ]:
#Display GSEApy report
pr = pandas.read_csv("/Users/nicole/ccbb_internal/interns/Nicole/gseapy_output/gseapy_reports.csv")
#View interactive table
#qgrid.show_grid(pr, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})
pr.head(15)
In [ ]:
#for loop that iterates through the top 10 enrich_terms and displays the plot
gseapyres = pandas.read_csv("/Users/nicole/ccbb_internal/interns/Nicole/gseapy_output/gseapy_reports.csv")
gseapyres = gseapyres.head(15)
for i in gseapyres.ix[:,0]:
image = i
address = "/Users/nicole/ccbb_internal/interns/Nicole/gseapy_output/%s.png" % image
img = mpimg.imread(address)
plt.imshow(img)
plt.gcf().set_size_inches(10,10)
plt.show()
i = i.split(" ")[1].strip()
geneset = i
print "more info about " + geneset + " available here:"
print 'http://www.broadinstitute.org/gsea/msigdb/cards/%s' % geneset
print ' '
#IFrame('http://www.broadinstitute.org/gsea/msigdb/cards/%s' % geneset, width=900, height=900)
In [ ]: