We use data from a 2014 Nature paper:
Computational analysis of cell-to-cell heterogeneity
in single-cell RNA-sequencing data reveals hidden
subpopulations of cells
Note: you must have the Python package, xlrd, installed to run this example.
In [2]:
# Get http://geneontology.org/ontology/go-basic.obo
from goatools.base import download_go_basic_obo
obo_fname = download_go_basic_obo()
In [3]:
# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
from goatools.base import download_ncbi_associations
gene2go = download_ncbi_associations()
In [4]:
from goatools.obo_parser import GODag
obodag = GODag("go-basic.obo")
In [5]:
from __future__ import print_function
from goatools.associations import read_ncbi_gene2go
geneid2gos_mouse = read_ncbi_gene2go("gene2go", taxids=[10090])
print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse)))
In [6]:
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
In [7]:
from goatools.go_enrichment import GOEnrichmentStudy
goeaobj = GOEnrichmentStudy(
GeneID2nt_mus.keys(), # List of mouse protein-coding genes
geneid2gos_mouse, # geneid/GO associations
obodag, # Ontologies
propagate_counts = False,
alpha = 0.05, # default significance cut-off
methods = ['fdr_bh']) # defult multipletest correction method
In [8]:
# Data will be stored in this variable
import os
geneid2symbol = {}
# Get xlsx filename where data is stored
ROOT = os.path.dirname(os.getcwd()) # go up 1 level from current working directory
din_xlsx = os.path.join(ROOT, "goatools/test_data/nbt_3102/nbt.3102-S4_GeneIDs.xlsx")
# Read data
if os.path.isfile(din_xlsx):
import xlrd
book = xlrd.open_workbook(din_xlsx)
pg = book.sheet_by_index(0)
for r in range(pg.nrows):
symbol, geneid, pval = [pg.cell_value(r, c) for c in range(pg.ncols)]
if geneid:
geneid2symbol[int(geneid)] = symbol
In [9]:
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
geneids_study = geneid2symbol.keys()
goea_results_all = goeaobj.run_study(geneids_study)
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
In [10]:
goeaobj.wr_xlsx("nbt3102_symbols.xlsx", goea_results_sig, itemid2name=geneid2symbol)
goeaobj.wr_xlsx("nbt3102_geneids.xlsx", goea_results_sig)
Copyright (C) 2016, DV Klopfenstein, H Tang. All rights reserved.