Bonus: Each line in the GAF file is stored in a namedtuple:
In [1]:
import gzip
import os
import shutil
import urllib.request

# Fetch the human GO annotation file (GAF) on first use only.
# The guard tests for the *decompressed* file, which is the one opened by
# GafReader afterwards. The compressed archive is not kept on disk, so
# testing for 'goa_human.gaf.gz' would repeat the download on every run.
if not os.path.exists('goa_human.gaf'):
    gaf_url = 'http://current.geneontology.org/annotations/goa_human.gaf.gz'
    # Decompress while streaming and write to a temporary name first, so an
    # interrupted download never leaves a truncated 'goa_human.gaf' behind.
    with urllib.request.urlopen(gaf_url) as response, \
            gzip.GzipFile(fileobj=response, mode='rb') as unzipped, \
            open('goa_human.gaf.part', 'wb') as partial:
        shutil.copyfileobj(unzipped, partial)
    os.replace('goa_human.gaf.part', 'goa_human.gaf')
In [2]:
from goatools.anno.gaf_reader import GafReader
# Read the decompressed human GAF. The resulting reader object is queried
# below for per-namespace associations (get_ns2assc) and for the parsed
# per-line records (associations).
ogaf = GafReader("goa_human.gaf")
In [3]:
# Associations grouped by namespace. As consumed in the next cell, this maps
# namespace -> {protein ID -> collection of GO IDs}.
ns2assc = ogaf.get_ns2assc()
In [4]:
# For every namespace, show the three lowest protein IDs together with
# their GO IDs in sorted order.
for namespace, associations in ns2assc.items():
    # Keys are unique, so ordering by key matches ordering the (key, value) pairs.
    for protein_id in sorted(associations)[:3]:
        go_ids = associations[protein_id]
        go_text = ' '.join(sorted(go_ids))
        print("{NS} {PROT:7} : {GOs}".format(
            NS=namespace, PROT=protein_id, GOs=go_text))
In [5]:
# Order the GAF namedtuples by their DB_ID field
nts = list(ogaf.associations)
nts.sort(key=lambda record: record.DB_ID)
# Show the first namedtuple in that ordering
print(nts[0])
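Fields of each GAF namedtuple (field name # column index, required or optional, cardinality, example value):
DB # 0 required 1 UniProtKB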
DB_ID # 1 required 1 P12345
DB_Symbol # 2 required 1 PHO3
Qualifier # 3 optional 0 or greater NOT
GO_ID # 4 required 1 GO:0003993
DB_Reference # 5 required 1 or greater PMID:2676709
Evidence_Code # 6 required 1 IMP
With_From # 7 optional 0 or greater GO:0000346
Aspect # 8 required 1 F
DB_Name # 9 optional 0 or 1 Toll-like receptor 4
DB_Synonym # 10 optional 0 or greater hToll|Tollbooth
DB_Type # 11 required 1 protein
Taxon # 12 required 1 or 2 taxon:9606
Date # 13 required 1 20090118
Assigned_By # 14 required 1 SGD
Annotation_Extension # 15 optional 0 or greater part_of(CL:0000576)
Gene_Product_Form_ID # 16 optional 0 or 1 UniProtKB:P12345-2
In [6]:
# One output row per record: ID, symbol (padded to 13), GO ID, evidence
# code, date and the assigning database.
fmtpat = '{DB_ID} {DB_Symbol:13} {GO_ID} {Evidence_Code} {Date} {Assigned_By}'
# Print only the first ten records of the sorted list.
for record in nts[:10]:
    field_values = record._asdict()
    print(fmtpat.format(**field_values))
Copyright (C) 2010-2019, DV Klopfenstein, Haibao Tang. All rights reserved.