We use data from a 2014 Nature paper:
Computational analysis of cell-to-cell heterogeneity
in single-cell RNA-sequencing data reveals hidden
subpopulations of cells
Note: you must have the Python package, xlrd, installed to run this example.
Note: To create plots, you must have:
In [1]:
# Get http://geneontology.org/ontology/go-basic.obo
from goatools.base import download_go_basic_obo
obo_fname = download_go_basic_obo()
In [2]:
# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
from goatools.base import download_ncbi_associations
fin_gene2go = download_ncbi_associations()
In [3]:
from goatools.obo_parser import GODag
obodag = GODag("go-basic.obo")
In [4]:
from __future__ import print_function
from goatools.anno.genetogo_reader import Gene2GoReader
# Read NCBI's gene2go. Store annotations in a list of namedtuples
objanno = Gene2GoReader(fin_gene2go, taxids=[10090])
# Get namespace2association where:
# namespace is:
# BP: biological_process
# MF: molecular_function
# CC: cellular_component
# assocation is a dict:
# key: NCBI GeneID
# value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()
for nspc, id2gos in ns2assoc.items():
print("{NS} {N:,} annotated mouse genes".format(NS=nspc, N=len(id2gos)))
In [5]:
from genes_ncbi_10090_proteincoding import GENEID2NT as GeneID2nt_mus
print(len(GeneID2nt_mus))
In [6]:
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
goeaobj = GOEnrichmentStudyNS(
GeneID2nt_mus.keys(), # List of mouse protein-coding genes
ns2assoc, # geneid/GO associations
obodag, # Ontologies
propagate_counts = False,
alpha = 0.05, # default significance cut-off
methods = ['fdr_bh']) # defult multipletest correction method
In [7]:
# Data will be stored in this variable
import os
geneid2symbol = {}
# Get xlsx filename where data is stored
ROOT = os.path.dirname(os.getcwd()) # go up 1 level from current working directory
din_xlsx = os.path.join(ROOT, "goatools/test_data/nbt_3102/nbt.3102-S4_GeneIDs.xlsx")
# Read data
if os.path.isfile(din_xlsx):
import xlrd
book = xlrd.open_workbook(din_xlsx)
pg = book.sheet_by_index(0)
for r in range(pg.nrows):
symbol, geneid, pval = [pg.cell_value(r, c) for c in range(pg.ncols)]
if geneid:
geneid2symbol[int(geneid)] = symbol
print('{N} genes READ: {XLSX}'.format(N=len(geneid2symbol), XLSX=din_xlsx))
else:
raise RuntimeError('FILE NOT FOUND: {XLSX}'.format(XLSX=din_xlsx))
In [8]:
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
geneids_study = geneid2symbol.keys()
goea_results_all = goeaobj.run_study(geneids_study)
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
In [9]:
goea_quiet_all = goeaobj.run_study(geneids_study, prt=None)
goea_quiet_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
In [10]:
print('{N} of {M:,} results were significant'.format(
N=len(goea_quiet_sig),
M=len(goea_quiet_all)))
In [11]:
print('Significant results: {E} enriched, {P} purified'.format(
E=sum(1 for r in goea_quiet_sig if r.enrichment=='e'),
P=sum(1 for r in goea_quiet_sig if r.enrichment=='p')))
In [12]:
import collections as cx
ctr = cx.Counter([r.NS for r in goea_quiet_sig])
print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
TOTAL=len(goea_quiet_sig),
BP=ctr['BP'], # biological_process
MF=ctr['MF'], # molecular_function
CC=ctr['CC'])) # cellular_component
In [13]:
goeaobj.wr_xlsx("nbt3102.xlsx", goea_results_sig)
goeaobj.wr_txt("nbt3102.txt", goea_results_sig)
Plotting all significant GO terms produces a messy spaghetti plot. Such a plot can be useful sometimes because you can open it and zoom and scroll around. But sometimes it is just too messy to be of use.
The "{NS}" in "nbt3102_{NS}.png" indicates that you will see three plots, one for "biological_process"(BP), "molecular_function"(MF), and "cellular_component"(CC)
In [14]:
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
plot_results("nbt3102_{NS}.png", goea_results_sig)
In [15]:
# Plot subset starting from these significant GO terms
goid_subset = [
'GO:0003723', # MF D04 RNA binding (32 genes)
'GO:0044822', # MF D05 poly(A) RNA binding (86 genes)
'GO:0003729', # MF D06 mRNA binding (11 genes)
'GO:0019843', # MF D05 rRNA binding (6 genes)
'GO:0003746', # MF D06 translation elongation factor activity (5 genes)
]
plot_gos("nbt3102_MF_RNA_genecnt.png",
goid_subset, # Source GO ids
obodag,
goea_results=goea_results_all) # Use pvals for coloring
In [16]:
plot_gos("nbt3102_MF_RNA_Symbols.png",
goid_subset, # Source GO ids
obodag,
goea_results=goea_results_all, # use pvals for coloring
# We can further configure the plot...
id2symbol=geneid2symbol, # Print study gene Symbols, not Entrez GeneIDs
study_items=6, # Only only 6 gene Symbols max on GO terms
items_p_line=3, # Print 3 genes per line
)
Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved.