GSEApy Implementation

Authors: N. Mouchamel & K. Fisch

Email: Kfisch@ucsd.edu

Date: July 2016

Goal: Create Jupyter notebook cell that runs an enrichment analysis with GSEApy

Steps:

  1. Import gene list from differential expression analysis

  2. Implement GSEApy (http://software.broadinstitute.org/gsea/index.jsp ; http://pythonhosted.org/gseapy/index.html)

  3. Display significant GSEA results and graphs.


In [ ]:
##importing python module
import os
import pandas
import numpy
import gseapy
import mygene
import ipywidgets
import qgrid
import urllib2

qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, precision=4)

from IPython.display import IFrame
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
%matplotlib inline

##change directory
os.chdir("/Users/nicole/Documents/CCBB Internship")

Read in differential expression results as a Pandas data frame to get differentially expressed gene list


In [ ]:
##read in DESeq2 results
genes=pandas.read_csv("/Users/nicole/Documents/CCBB Internship/DE_genes.csv")

##View interactive table
#qgrid.show_grid(genes.sample(1000), grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})

#View top of file
genes.head(10)

In [ ]:
#Extract genes that are differentially expressed with a pvalue less than a certain cutoff (pvalue < 0.05 or padj < 0.05)
genes_DE_only = genes.loc[(genes.padj < 0.05)]

#View top of file
genes_DE_only.head(10)

In [ ]:
#Check how many rows in original genes file
len(genes)

In [ ]:
#Check how many rows in DE genes file
len(genes_DE_only)

Translate Ensembl IDs to Gene Symbols and Entrez IDs using mygene.info API


In [ ]:
#Extract list of DE genes  (Check to make sure this code works, this was adapted from a different notebook)
de_list = genes_DE_only[genes_DE_only.columns[0]]

#Remove .* from end of Ensembl ID
de_list2 = de_list.replace("\.\d","",regex=True)

#Add new column with reformatted Ensembl IDs
genes_DE_only["Full_Ensembl"] = de_list2

#View top of file 
genes_DE_only.head(10)

In [ ]:
#Set up mygene.info API and query
mg = mygene.MyGeneInfo()
gene_ids = mg.getgenes(de_list2, 'name, symbol, entrezgene', as_dataframe=True)
gene_ids.index.name = "Ensembl"
gene_ids.reset_index(inplace=True)

#View top of file
gene_ids.head(10)

In [ ]:
#Merge mygene.info query results with original DE genes list
DE_with_ids = genes_DE_only.merge(gene_ids, left_on="Full_Ensembl", right_on="Ensembl")

#Check top of file
DE_with_ids.head(10)

In [ ]:
#Write results to file
DE_with_ids.to_csv("./DE_genes_converted.csv")

In [ ]:
#Convert to .txt file
DE_with_ids.to_csv("./DE_genes_converted.txt", sep='\t')

In [ ]:
#Generate rank file (symbol, padj)
cols = DE_with_ids.columns.tolist()
cols.insert(0, cols.pop(cols.index('symbol')))
cols.insert(1, cols.pop(cols.index('log2FoldChange')))

prerank_file = DE_with_ids.reindex(columns= cols)

#Condense dataframe to contain only symbol & log2FoldChange
prerank_file.drop(prerank_file.columns[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11,12,13]], axis=1, inplace=True) 

#Exclude NaN values
prerank_file.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

#Change column titles
prerank_file.columns = ['gene_name', 'rank']

prerank_file[['rank']] = prerank_file[['rank']].astype(float)

prerank_file.head(10)

In [ ]:
#Display data types
prerank_file.dtypes

In [ ]:
#Write results to file
prerank_file.to_csv("./prerank_file.csv", index=False)

#Remove header and separate by tab (.rnk format)
prerank_file.to_csv("./prerank_file.rnk", sep='\t', index=False, float_format='%.25f', header=False)

In [ ]:
#Open .rnk file
pr = pandas.read_csv("/Users/nicole/Documents/CCBB Internship/prerank_file.rnk", sep='\t')

In [ ]:
#gmt = pandas.read_csv("/Users/nicole/Documents/CCBB Internship/msigdb.v5.1.symbols.gmt-2.txt", sep = ' ')
#gmt.to_csv("/Users/nicole/Documents/CCBB Internship/msigdb.v5.1.symbols.gmt", sep = ' ')

Implement GSEAPY


In [ ]:
#GSEApy prerank method to calculate es, nes, pval,fdrs, and produce figures
pr_results = gseapy.prerank(rnk = "/Users/nicole/Documents/CCBB Internship/prerank_file.rnk", 
                         gene_sets = "/Users/nicole/Documents/CCBB Internship/msigdb.v5.1.symbols.gmt", 
                            outdir='/Users/nicole/ccbb_internal/interns/Nicole/gseapy_output', 
                            permutation_n=1000, graph_num = 539, format = 'png')

In [ ]:
#Display GSEApy report
pr = pandas.read_csv("/Users/nicole/ccbb_internal/interns/Nicole/gseapy_output/gseapy_reports.csv")

#View interactive table
#qgrid.show_grid(pr, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})

pr.head(15)

In [ ]:
#for loop that iterates through the top 10 enrich_terms and displays the plot
gseapyres = pandas.read_csv("/Users/nicole/ccbb_internal/interns/Nicole/gseapy_output/gseapy_reports.csv")
gseapyres = gseapyres.head(15)

for i in gseapyres.ix[:,0]:
    
    image = i
    address = "/Users/nicole/ccbb_internal/interns/Nicole/gseapy_output/%s.png" % image
    
    img = mpimg.imread(address)
    plt.imshow(img)
    plt.gcf().set_size_inches(10,10)
    plt.show()
    
    i = i.split(" ")[1].strip()
    geneset = i 
    print "more info about " + geneset + " available here:"
    print 'http://www.broadinstitute.org/gsea/msigdb/cards/%s' % geneset
    print ' '
    
    
#IFrame('http://www.broadinstitute.org/gsea/msigdb/cards/%s' % geneset, width=900, height=900)

In [ ]: