Gene Ontology Enrichment Analysis (GOEA)

This is the same GOEA as in goea_nbt3102.ipynb, but the GOEA results can be obtained by calling a single function.

We use data from a 2014 Nature paper:
Computational analysis of cell-to-cell heterogeneity in single-cell RNA-sequencing data reveals hidden subpopulations of cells



In [1]:

    
def get_goeaobj_nbt3102(method='fdr_bh'):
    """Return GOEA Object ready to run Nature data."""
    from goatools.obo_parser import GODag
    from genes_ncbi_10090_proteincoding import GENEID2NT as GeneID2nt_mus
    from goatools.base import download_go_basic_obo, download_ncbi_associations
    from goatools.anno.genetogo_reader import Gene2GoReader
    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
    # Load Ontologies
    obo_fname = download_go_basic_obo()
    obodag = GODag("go-basic.obo")
    # Load Associations
    download_ncbi_associations() # Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader("gene2go", taxids=[10090])
    # Get associations for each branch of the GO DAG (BP, MF, CC)
    ns2assoc = objanno.get_ns2assc()
    # GOE Object holds Ontologies, Associations, and Background gene set
    return GOEnrichmentStudyNS(
        GeneID2nt_mus.keys(), # Background gene set: mouse protein-coding genes
        ns2assoc, # geneid/GO Associations for BP, MF, anc CC GODAG branches
        obodag, # Ontologies
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = [method]) # defult multipletest correction method



In [2]:

    
def read_data_nbt3102():
    """Read data from Nature paper."""
    import os
    # Data will be stored in this variable
    geneid2symbol = {}
    # Get xlsx filename where data is stored
    ROOT = os.path.dirname(os.getcwd()) # go up 1 level from current working directory
    din_xlsx = os.path.join(ROOT, "goatools/test_data/nbt_3102/nbt.3102-S4_GeneIDs.xlsx")
    # Read data
    if os.path.isfile(din_xlsx):  
        import xlrd
        book = xlrd.open_workbook(din_xlsx)
        pg = book.sheet_by_index(0)
        for r in range(pg.nrows):
            symbol, geneid, pval = [pg.cell_value(r, c) for c in range(pg.ncols)]
            if geneid:
                geneid2symbol[int(geneid)] = symbol
    return geneid2symbol