Gene Ontology Enrichment Analysis (GOEA)

This is the same GOEA as in goea_nbt3102.ipynb, but the GOEA results can be obtained by calling a single function.

We use data from a 2014 Nature paper:
Computational analysis of cell-to-cell heterogeneity in single-cell RNA-sequencing data reveals hidden subpopulations of cells


In [4]:
def get_goeaobj_nbt3102(method='fdr_bh'):
    """Return GOEA Object ready to run Nature data."""
    from goatools.obo_parser import GODag
    from goatools.associations import read_ncbi_gene2go
    from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.base import download_go_basic_obo, download_ncbi_associations
    # Load Ontologies
    obo_fname = download_go_basic_obo()
    obodag = GODag("go-basic.obo")
    # Load Associations
    download_ncbi_associations() # Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    geneid2gos_mouse = read_ncbi_gene2go("gene2go", taxids=[10090])
    # GOE Object holds Ontologies, Associations, and Background gene set
    return GOEnrichmentStudy(
        GeneID2nt_mus.keys(), # Background gene set: mouse protein-coding genes
        geneid2gos_mouse, # geneid/GO Associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = [method]) # defult multipletest correction method

In [5]:
def read_data_nbt3102():
    """Read data from Nature paper."""
    import os
    # Data will be stored in this variable
    geneid2symbol = {}
    # Get xlsx filename where data is stored
    ROOT = os.path.dirname(os.getcwd()) # go up 1 level from current working directory
    din_xlsx = os.path.join(ROOT, "goatools/test_data/nbt_3102/nbt.3102-S4_GeneIDs.xlsx")
    # Read data
    if os.path.isfile(din_xlsx):  
        import xlrd
        book = xlrd.open_workbook(din_xlsx)
        pg = book.sheet_by_index(0)
        for r in range(pg.nrows):
            symbol, geneid, pval = [pg.cell_value(r, c) for c in range(pg.ncols)]
            if geneid:
                geneid2symbol[int(geneid)] = symbol
    return geneid2symbol