In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina' # mac
import pandas as pd
import gseapy as gp
import matplotlib.pyplot as plt
Check gseapy version
In [2]:
gp.__version__
Out[2]:
See validated fiters, attributes, datasets
>>> from gseapy.parser import Biomart
>>> bm = Biomart(verbose=False, host="asia.ensembl.org")
>>> ## view validated marts
>>> marts = bm.get_marts()
>>> ## view validated dataset
>>> datasets = bm.get_datasets(mart='ENSEMBL_MART_ENSEMBL')
>>> ## view validated attributes
>>> attrs = bm.get_attributes(dataset='hsapiens_gene_ensembl')
>>> ## view validated filters
>>> filters = bm.get_filters(dataset='hsapiens_gene_ensembl')
>>> ## query results
>>> results = bm.query(dataset='hsapiens_gene_ensembl', attributes=['entrezgene', 'go_id'],
filters={'ensemble_gene_id': [your_input_list]})
In [3]:
# read in an example gene list
gene_list = pd.read_csv("./data/gene_list.txt",header=None, sep="\t")
gene_list.head()
Out[3]:
In [4]:
# convert dataframe or series to list
glist = gene_list.squeeze().str.strip().tolist()
print(glist[:10])
In [5]:
from gseapy.parser import Biomart
In [6]:
bm = Biomart()
In [7]:
## BioMart is slow. Be patient
results = bm.query(dataset='hsapiens_gene_ensembl',
attributes=['external_gene_name','entrezgene', 'go_id'],
filters={'hgnc_symbol': glist},
# save output file
filename="query.results.txt")
In [8]:
# note: pandas read entrez ids as floats, you need to convert them to int
results.head()
Out[8]:
See all supported enrichr library names
Select database from { 'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm' }
Enrichr library could be used for gsea
, ssgsea
, and prerank
, too
In [9]:
names = gp.get_library_name() # default: Human
names[:10]
Out[9]:
In [10]:
yeast = gp.get_library_name(database='Yeast')
yeast[:10]
Out[10]:
In [11]:
# run enrichr
# if you are only intrested in dataframe that enrichr returned, please set no_plot=True
# list, dataframe, series inputs are supported
enr = gp.enrichr(gene_list="./data/gene_list.txt",
# or gene_list=glist
description='test_name',
gene_sets=['KEGG_2016','KEGG_2013'],
outdir='test/enrichr_kegg',
cutoff=0.5 # test dataset, use lower value from range(0,1)
)
In [12]:
# access results through res2d attr
# obj.res2d only stores the last enrichr result in your input libraries
# obj.results stores all results
enr.results.head(5)
Out[12]:
In [13]:
enr2 = gp.enrichr(gene_list="./data/gene_list.txt",
# or gene_list=glist
description='test_name',
gene_sets="./data/genes.gmt",
background='hsapiens_gene_ensembl', # or the number of genes, e.g 20000
outdir='test/enrichr_kegg2',
cutoff=0.5, # only used for testing.
verbose=True)
In [14]:
enr2.results.head(5)
Out[14]:
In [15]:
# simple plotting function
from gseapy.plot import barplot, dotplot
# to save your figure, make sure that ``ofname`` is not None
barplot(enr.res2d,title='KEGG_2013',)
Out[15]:
In [16]:
# to save your figure, make sure that ``ofname`` is not None
dotplot(enr.res2d, title='KEGG_2013',)
Out[16]:
You may also want to use enrichr in command line
the option -v will print out the progress of your job
In [17]:
# !gseapy enrichr -i ./data/gene_list.txt \
# --ds BP2017 \
# -g GO_Biological_Process_2017 \
# -v -o test/enrichr_BP
In [18]:
rnk = pd.read_csv("./data/edb/gsea_data.gsea_data.rnk", header=None, sep="\t")
rnk.head()
Out[18]:
In [19]:
# run prerank
# enrichr libraries are supported by prerank module. Just provide the name
# use 4 process to acceralate the permutation speed
# note: multiprocessing may not work on windows
pre_res = gp.prerank(rnk=rnk, gene_sets='KEGG_2016',
processes=4,
permutation_num=100, # reduce number to speed up testing
outdir='test/prerank_report_kegg', format='png')
Leading edge genes save to the final output results
In [20]:
#access results through obj.res2d attribute or obj.results
pre_res.res2d.head()
Out[20]:
In [21]:
# extract geneset terms in res2d
terms = pre_res.res2d.index
terms
Out[21]:
In [22]:
## easy way
from gseapy.plot import gseaplot
# to save your figure, make sure that ofname is not None
gseaplot(rank_metric=pre_res.ranking, term=terms[0], **pre_res.results[terms[0]])
# save figure
# gseaplot(rank_metric=pre_res.ranking, term=terms[0], ofname='your.plot.pdf', **pre_res.results[terms[0]])
You may also want to use prerank in command line
In [23]:
# ! gseapy prerank -r temp.rnk -g temp.gmt -o prerank_report_temp
In [24]:
phenoA, phenoB, class_vector = gp.parser.gsea_cls_parser("./data/P53.cls")
In [25]:
#class_vector used to indicate group attributes for each sample
print(class_vector)
In [26]:
gene_exp = pd.read_csv("./data/P53.txt", sep="\t")
gene_exp.head()
Out[26]:
In [27]:
print("positively correlated: ", phenoA)
In [28]:
print("negtively correlated: ", phenoB)
In [29]:
# run gsea
# enrichr libraries are supported by gsea module. Just provide the name
gs_res = gp.gsea(data=gene_exp, # or data='./P53_resampling_data.txt'
gene_sets='KEGG_2016', # enrichr library names
cls= './data/P53.cls', # cls=class_vector
# set permutation_type to phenotype if samples >=15
permutation_type='phenotype',
permutation_num=100, # reduce number to speed up test
outdir=None, # do not write output to disk
no_plot=True, # Skip plotting
method='signal_to_noise',
processes=4,
format='png')
In [30]:
#access the dataframe results throught res2d attribute
gs_res.res2d.head()
Out[30]:
In [31]:
from gseapy.plot import gseaplot, heatmap
terms = gs_res.res2d.index
# Make sure that ``ofname`` is not None, if you want to save your figure to disk
gseaplot(gs_res.ranking, term=terms[0], **gs_res.results[terms[0]])
In [32]:
# plotting heatmap
genes = gs_res.res2d.genes[0].split(";")
# Make sure that ``ofname`` is not None, if you want to save your figure to disk
heatmap(df = gs_res.heatmat.loc[genes], z_score=0, title=terms[0], figsize=(18,6))
You may also want to use gsea in command line
In [33]:
# !gseapy gsea -d ./data/P53_resampling_data.txt \
# -g KEGG_2016 -c ./data/P53.cls \
# -o test/gsea_reprot_2 \
# -v --no-plot \
# -t phenotype
In [34]:
# txt, gct file input
ss = gp.ssgsea(data="./data/testSet_rand1200.gct",
gene_sets="./data/randomSets.gmt",
outdir='test/ssgsea_report',
sample_norm_method='rank', # choose 'custom' for your own rank list
permutation_num=0, # skip permutation procedure, because you don't need it
no_plot=True, # skip plotting, because you don't need these figures
processes=4, format='png')
In [35]:
ss.res2d
Out[35]:
In [36]:
from gseapy.plot import heatmap
heatmap(ss.res2d)
In [37]:
# or assign a dataframe, or Series to ssgsea()
ssdf = pd.read_csv("./data/temp.txt", header=None, sep="\t")
ssdf.head()
Out[37]:
In [38]:
# dataframe with one column is also supported by ssGSEA or Prerank
# But you have to set gene_names as index
ssdf2 = ssdf.set_index(0)
ssdf2.head()
Out[38]:
In [39]:
type(ssdf2)
Out[39]:
In [40]:
ssSeries = ssdf2.squeeze()
type(ssSeries)
Out[40]:
In [41]:
# reuse data
df = pd.read_csv("./data/P53_resampling_data.txt", sep="\t")
df.head()
Out[41]:
In [42]:
# Series, DataFrame Example
# supports dataframe and series
ssgs = []
for i, dat in enumerate([ssdf, ssdf2, ssSeries, df]):
sstemp = gp.ssgsea(data=dat,
gene_sets="./data/genes.gmt",
outdir='test/ssgsea_report_'+str(i),
scale=False, # set scale to False to get real original ES
permutation_num=0, # skip permutation procedure, because you don't need it
no_plot=True, # skip plotting, because you don't need these figures
processes=4,
format='png')
ssgs.append(sstemp)
In [43]:
# normalized es save to res2d attri
# one sample input
# NES
ssgs[0].res2d.head()
Out[43]:
Note:
If you want to obtain the real original enrichment score,
you have to set scale=False
In [44]:
# ES
# convert dict to DataFrame
es = pd.DataFrame(ssgs[-1].resultsOnSamples)
es.head()
Out[44]:
In [45]:
# if set scale to True, then
# Scaled ES equal to es/gene_numbers
ses = es/df.shape[0]
ses
Out[45]:
In [46]:
# NES
# scale or no have no affects on final nes value
nes = ssgs[-1].res2d
nes.head()
Out[46]:
In [47]:
# set --no-scale to obtain the real original enrichment score
# !gseapy ssgsea -d ./data/testSet_rand1200.gct \
# -g data/temp.gmt \
# -o test/ssgsea_report2 \
# -p 4 --no-plot --no-scale
In [48]:
# run command inside python console
rep = gp.replot(indir="./data", outdir="test/replot_test")
In [49]:
# !gseapy replot -i data -o test/replot_test