In [1]:
!python --version

!wget http://geneontology.org/gene-associations/gene_association.sgd.gz -O ./data/gene_association.sgd.gz
!wget http://purl.obolibrary.org/obo/go.obo -O ./data/go.obo


Python 3.5.3 :: Anaconda 4.4.0 (x86_64)
--2017-08-10 17:17:27--  http://geneontology.org/gene-associations/gene_association.sgd.gz
Resolving geneontology.org... 52.27.86.54
Connecting to geneontology.org|52.27.86.54|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1488961 (1.4M) [application/x-gzip]
Saving to: ‘./data/gene_association.sgd.gz’

./data/gene_associa 100%[===================>]   1.42M  4.88MB/s    in 0.3s    

2017-08-10 17:17:28 (4.88 MB/s) - ‘./data/gene_association.sgd.gz’ saved [1488961/1488961]

--2017-08-10 17:17:28--  http://purl.obolibrary.org/obo/go.obo
Resolving purl.obolibrary.org... 52.3.123.63
Connecting to purl.obolibrary.org|52.3.123.63|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://snapshot.geneontology.org/ontology/go.obo [following]
--2017-08-10 17:17:28--  http://snapshot.geneontology.org/ontology/go.obo
Resolving snapshot.geneontology.org... 54.230.84.41, 54.230.84.147, 54.230.84.215, ...
Connecting to snapshot.geneontology.org|54.230.84.41|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36333288 (35M) [application/rdf+xml]
Saving to: ‘./data/go.obo’

./data/go.obo       100%[===================>]  34.65M  48.5MB/s    in 0.7s    

2017-08-10 17:17:29 (48.5 MB/s) - ‘./data/go.obo’ saved [36333288/36333288]

Ontology Tree Generator

This is a notebook to generate tree data file from original table and annotations.


In [2]:
import pandas as pd
from goatools import obo_parser

In [3]:
# Latest data sources

treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
oboUrl = './data/go.obo'
yeastAnnotationUrl = './data/gene_association.sgd.gz'
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'

phenotypeUrl='http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'

In [4]:
# Load the tree data
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, delimiter='\t', names=treeColNames)
tree.tail(10)


Out[4]:
parent child type in_tree
441927 GO:0090150 YHR083W gene NOT_TREE
441928 GO:0005575 YHR083W gene NOT_TREE
441929 GO:0098796 YHR083W gene NOT_TREE
441930 GO:1902589 YHR083W gene NOT_TREE
441931 GO:0044085 YHR083W gene NOT_TREE
441932 GO:0015031 YHR083W gene NOT_TREE
441933 GO:1902582 YHR083W gene NOT_TREE
441934 GO:1902580 YHR083W gene NOT_TREE
441935 GO:0098799 YHR083W gene NOT_TREE
441936 GO:0098798 YHR083W gene NOT_TREE

Parse yeast annotation file


In [5]:
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
print(col_names)


['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB:Reference', 'Evidence', 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'taxon', 'Date', 'Assigned_by', 'Annotation_Extension', 'Gene_Product_Form_ID']

In [6]:
yeastAnnotation = pd.read_csv(yeastAnnotationUrl, delimiter='\t', comment='!', compression='gzip', names=col_names)
yeastAnnotation.head()


Out[6]:
DB DB_Object_ID DB_Object_Symbol Qualifier GO_ID DB:Reference Evidence With_or_From Aspect DB_Object_Name DB_Object_Synonym DB_Object_Type taxon Date Assigned_by Annotation_Extension Gene_Product_Form_ID
0 SGD S000007287 15S_RRNA NaN GO:0005763 SGD_REF:S000073641|PMID:6262728 IDA NaN C Ribosomal RNA of the small mitochondrial ribos... Q0020|14s rRNA|15S_RRNA_2 gene taxon:559292 20150612 SGD NaN NaN
1 SGD S000007287 15S_RRNA NaN GO:0032543 SGD_REF:S000073641|PMID:6262728 IC GO:0005763 P Ribosomal RNA of the small mitochondrial ribos... Q0020|14s rRNA|15S_RRNA_2 gene taxon:559292 20150612 SGD NaN NaN
2 SGD S000007287 15S_RRNA NaN GO:0003735 SGD_REF:S000073641|PMID:6262728 IC GO:0005763 F Ribosomal RNA of the small mitochondrial ribos... Q0020|14s rRNA|15S_RRNA_2 gene taxon:559292 20150612 SGD NaN NaN
3 SGD S000007288 21S_RRNA NaN GO:0005762 SGD_REF:S000073372|PMID:6759872 IDA NaN C Mitochondrial 21S rRNA Q0158|21S_rRNA_3|21S_rRNA_4 gene taxon:559292 20040202 SGD NaN NaN
4 SGD S000007288 21S_RRNA NaN GO:0032543 SGD_REF:S000073372|PMID:6759872 IMP NaN P Mitochondrial 21S rRNA Q0158|21S_rRNA_3|21S_rRNA_4 gene taxon:559292 20100715 SGD NaN NaN

Parse OBO


In [7]:
obo = obo_parser.GODag(oboUrl)


load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2016-12-06) 47,833 GO Terms

In [8]:
obo['GO:1902580'].name


Out[8]:
'single-organism cellular localization'

In [9]:
ph = pd.read_csv(phenotypeUrl, delimiter='\t')
ph.head()


Out[9]:
IMI1 not in systematic sequence of S288C IMI1.1 S000149345 PMID: 26091838|SGD_REF: S000180603 classical genetics null W303 mitochondrial genome maintenance: abnormal .1 .2 .3 .4
0 IMI1 not in systematic sequence of S288C IMI1 S000149345 PMID: 26091838|SGD_REF: S000180603 classical genetics null W303 respiratory metabolism: decreased glycerol (2%) nonfermentable carbon source similar results with ethanol and lactate
1 IMI1 not in systematic sequence of S288C IMI1 S000149345 PMID: 26091838|SGD_REF: S000180603 classical genetics null W303 mitochondrial morphology: abnormal
2 IMI1 not in systematic sequence of S288C IMI1 S000149345 PMID: 26091838|SGD_REF: S000180603 classical genetics null W303 viable
3 MAL62 not in systematic sequence of S288C MAL62 S000029690 PMID: 22669197|SGD_REF: S000149697 classical genetics overexpression Other fermentative growth: increased maltose maltose fementation and leavening ability are ...
4 MATA1 not in systematic sequence of S288C MATA1 S000029660 PMID: 8065362|SGD_REF: S000039420 classical genetics overexpression Other (LL20) killer toxin resistance: increased K. lactis zymocin

Build Base CyJS Network


In [11]:
import json

goTree = {
    'data': {
        'name': 'GO Merged Tree'
    },
    'elements': {
        'nodes': [],
        'edges': []
    }
}

print(json.dumps(goTree, indent=4))


{
    "elements": {
        "edges": [],
        "nodes": []
    },
    "data": {
        "name": "GO Merged Tree"
    }
}

In [12]:
node_set= set()

row = next(tree.iterrows())[1]
print(row)
print(row['parent'])


parent     GO:0046434
child      GO:0009395
type             is_a
in_tree          TREE
Name: 0, dtype: object
GO:0046434

In [13]:
def get_node(id, name):
    node = {
        'data': {
            'id': id
        }
    }
    
    if id in obo.keys():
        go = obo[id]
        node['data']['name'] = go.name
        node['data']['namespace'] = go.namespace
        
    return node

def get_edge(source, target, itr, is_tree):
    edge = {
        'data': {
            'source': source,
            'target': target,
            'interaction': itr,
            'is_tree': is_tree
        }
    }
    
    return edge

In [14]:
edges = []
for data in tree.iterrows():
    row=data[1]
    node_set.add(row['parent'])
    node_set.add(row['child'])
    edges.append(get_edge(row['parent'], row['child'], row['type'], row['in_tree']))

In [ ]:
nodes = []

for id in node_set:
    node = get_node(id)
    nodes.append(node)

In [22]:
print(len(nodes))
print(nodes[0])
print(len(edges))
edges[0]


13037
{'data': {'namespace': 'biological_process', 'id': 'GO:0043648', 'name': 'dicarboxylic acid metabolic process'}}
441937
Out[22]:
{'data': {'interaction': 'is_a',
  'is_tree': 'TREE',
  'source': 'GO:0046434',
  'target': 'GO:0009395'}}

In [23]:
goTree['elements']['nodes'] = nodes
goTree['elements']['edges'] = edges

with open('./data/tree.cyjs', 'w') as outfile:
    json.dump(goTree, outfile)

In [ ]:
yeastAnnotation['DB_Object_Synonym'] = yeastAnnotation['DB_Object_Synonym'].fillna('')

In [46]:
result = yeastAnnotation[yeastAnnotation['DB_Object_Synonym'].str.contains('YHR083W')]
result


Out[46]:
DB DB_Object_ID DB_Object_Symbol Qualifier GO_ID DB:Reference Evidence With_or_From Aspect DB_Object_Name DB_Object_Synonym DB_Object_Type taxon Date Assigned_by Annotation_Extension Gene_Product_Form_ID
76224 SGD S000001125 SAM35 NaN GO:0005739 SGD_REF:S000117178|PMID:16823961 IDA NaN C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20061212 SGD NaN NaN
76225 SGD S000001125 SAM35 NaN GO:0003674 SGD_REF:S000069584 ND NaN F Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20090409 SGD NaN NaN
76226 SGD S000001125 SAM35 NaN GO:0005741 SGD_REF:S000076328|PMID:15067005 IDA NaN C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20040909 SGD NaN NaN
76227 SGD S000001125 SAM35 NaN GO:0001401 SGD_REF:S000076328|PMID:15067005 IDA NaN C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20040909 SGD NaN NaN
76228 SGD S000001125 SAM35 NaN GO:0045040 SGD_REF:S000076328|PMID:15067005 IMP NaN P Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20040909 SGD NaN NaN
76229 SGD S000001125 SAM35 NaN GO:0005739 SGD_REF:S000175707|PMID:24769239 IDA NaN C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20150803 SGD NaN NaN
76230 SGD S000001125 SAM35 NaN GO:0001401 SGD_REF:S000076561|PMID:15205677 IDA NaN C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20040707 SGD NaN NaN
76231 SGD S000001125 SAM35 NaN GO:0001401 SGD_REF:S000076561|PMID:15205677 IMP NaN C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20040707 SGD NaN NaN
76232 SGD S000001125 SAM35 NaN GO:0005741 SGD_REF:S000076561|PMID:15205677 IDA NaN C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20040707 SGD NaN NaN
76233 SGD S000001125 SAM35 NaN GO:0006810 SGD_REF:S000148669 IEA UniProtKB-KW:KW-0813 P Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20161112 UniProt NaN NaN
76234 SGD S000001125 SAM35 NaN GO:0016020 SGD_REF:S000148669 IEA UniProtKB-KW:KW-0472 C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20161112 UniProt NaN NaN
76235 SGD S000001125 SAM35 NaN GO:0015031 SGD_REF:S000148669 IEA UniProtKB-KW:KW-0653 P Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20161112 UniProt NaN NaN
76236 SGD S000001125 SAM35 NaN GO:0005739 SGD_REF:S000148669 IEA UniProtKB-KW:KW-0496 C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20161112 UniProt NaN NaN
76237 SGD S000001125 SAM35 NaN GO:0070096 SGD_REF:S000076328|PMID:15067005 IMP NaN P Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20090409 SGD NaN NaN
76238 SGD S000001125 SAM35 NaN GO:0070096 SGD_REF:S000126649|PMID:18039934 IMP NaN P Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20090409 SGD NaN NaN
76239 SGD S000001125 SAM35 NaN GO:0070096 SGD_REF:S000076561|PMID:15205677 IMP NaN P Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20090409 SGD NaN NaN
76240 SGD S000001125 SAM35 NaN GO:0045040 SGD_REF:S000077462|PMID:15326197 IMP NaN P Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20090409 SGD NaN NaN
76241 SGD S000001125 SAM35 NaN GO:0005741 SGD_REF:S000148671 IEA UniProtKB-SubCell:SL-0172 C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20161112 UniProt NaN NaN
76242 SGD S000001125 SAM35 NaN GO:0045040 SGD_REF:S000126649|PMID:18039934 IMP NaN P Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20090409 SGD NaN NaN
76243 SGD S000001125 SAM35 NaN GO:0005741 SGD_REF:S000148669 IEA UniProtKB-KW:KW-1000 C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20161112 UniProt NaN NaN
76244 SGD S000001125 SAM35 NaN GO:0005741 SGD_REF:S000114057|PMID:16407407 IDA NaN C Component of the sorting and assembly machiner... YHR083W|FMP20|SAM complex subunit SAM35|TOB38|... gene taxon:559292 20060317 SGD NaN NaN