In [ ]:
!python --version

!wget http://geneontology.org/gene-associations/gene_association.sgd.gz -O ./data/gene_association.sgd.gz
!wget http://purl.obolibrary.org/obo/go.obo -O ./data/go.obo

Term Property Generator

Introduction

This is a script to convert public data sets into a searchable, local Elasticsearch DB.

Requirments

  • Python 3.x
  • Elasticsearch 5.x

In [1]:
# Create list of all terms from the GO Tree file
import pandas as pd
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'

# Load the tree data
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, delimiter='\t', names=treeColNames)
tree.tail()


Out[1]:
parent child type in_tree
441932 GO:0015031 YHR083W gene NOT_TREE
441933 GO:1902582 YHR083W gene NOT_TREE
441934 GO:1902580 YHR083W gene NOT_TREE
441935 GO:0098799 YHR083W gene NOT_TREE
441936 GO:0098798 YHR083W gene NOT_TREE

In [2]:
# Extract GO terms in the tree

p_list = tree['parent']
c_list = tree['child']
print(p_list.shape)
print(c_list.shape)

all_list = pd.concat([p_list, c_list])
print(all_list.shape)
all_set = all_list.unique()

print(all_set.shape)

go_set = set()

for t in all_set:
    if t.startswith('GO:'):
        go_set.add(t)

print(len(go_set))


(441937,)
(441937,)
(883874,)
(13037,)
6618

In [3]:
from elasticsearch import Elasticsearch
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Integer, Keyword, Text, Object, Nested, Index
from elasticsearch_dsl.connections import connections
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import Search
import pandas as pd

from elasticsearch_dsl.query import MultiMatch, Match, Q


# Define a default Elasticsearch client
connections.create_connection(hosts=['localhost:9200'])

treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
oboUrl = './data/go.obo'
yeastAnnotationUrl = './data/gene_association.sgd.gz'
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'

phenotypeUrl='http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'

Load gene associations


In [4]:
yeastAnnotationUrl = './data/gene_association.sgd.gz'
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
print(col_names)

yeastAnnotation = pd.read_csv(yeastAnnotationUrl, delimiter='\t', comment='!', compression='gzip', names=col_names)
yeastAnnotation.tail()


['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB:Reference', 'Evidence', 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'taxon', 'Date', 'Assigned_by', 'Annotation_Extension', 'Gene_Product_Form_ID']
Out[4]:
DB DB_Object_ID DB_Object_Symbol Qualifier GO_ID DB:Reference Evidence With_or_From Aspect DB_Object_Name DB_Object_Synonym DB_Object_Type taxon Date Assigned_by Annotation_Extension Gene_Product_Form_ID
111384 SGD S000006732 tX(XXX)L NaN GO:0030533 SGD_REF:S000181097|PMID:9023104 ISM NaN F tRNA of undetermined specificity, predicted by... tX(XXX)L|tS(GCU)L gene taxon:559292 20030507 SGD NaN NaN
111385 SGD S000006732 tX(XXX)L NaN GO:0005829 SGD_REF:S000181097|PMID:9023104 IC GO:0030533 C tRNA of undetermined specificity, predicted by... tX(XXX)L|tS(GCU)L gene taxon:559292 20030507 SGD NaN NaN
111386 SGD S000007338 tY(GUA)Q NaN GO:0070125 SGD_REF:S000181097|PMID:9023104 IC GO:0030533 P Mitochondrial tyrosine tRNA (tRNA-Tyr) tY(GUA)Q gene taxon:559292 20150730 SGD NaN NaN
111387 SGD S000007338 tY(GUA)Q NaN GO:0005739 SGD_REF:S000181097|PMID:9023104 IC GO:0030533 C Mitochondrial tyrosine tRNA (tRNA-Tyr) tY(GUA)Q gene taxon:559292 20030507 SGD NaN NaN
111388 SGD S000007338 tY(GUA)Q NaN GO:0030533 SGD_REF:S000181097|PMID:9023104 ISM NaN F Mitochondrial tyrosine tRNA (tRNA-Tyr) tY(GUA)Q gene taxon:559292 20060721 SGD NaN NaN

Phenotype


In [5]:
pUrl = 'http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'

p_cols = pd.read_csv('./p_cols.txt', names=['col_names'])
p_col_names = p_cols['col_names'].tolist()
print(p_col_names)

phenotype = pd.read_csv(pUrl, delimiter='\t', names=p_col_names)


['Feature_Name', 'Feature_Type', 'Gene_Name', 'SGDID', 'Reference', 'Experiment_Type', 'Mutant_Type', 'Allele', 'Strain_Background', 'Phenotype', 'Chemical', 'Condition', 'Details', 'Reporter']

ID Mapping Table


In [6]:
idmap = pd.read_csv('./yeast_clean4.txt', delimiter='\t')
idmap.head()


Out[6]:
symbol locus_name acc_number swiss-prot sgd sequence_length 3d chromosome
0 AAC1 YMR056C P04710 ADT1_YEAST S000004660 309 13 NaN
1 AAC3 YBR085W P18238 ADT3_YEAST S000000289 307 (3) 2
2 AAD10 YJR155W P47182 AAD10_YEAST S000003916 288 10 NaN
3 AAD14 YNL331C P42884 AAD14_YEAST S000005275 376 14 NaN
4 AAD15 YOL165C Q08361 AAD15_YEAST S000005525 143 15 NaN

In [7]:
# Create usuful map for ID mapping
sgd2info = {}

for idx, row in idmap.iterrows():
    entry = {}
    entry['locus'] = row['locus_name']
    entry['acc'] = row['acc_number']
    entry['swiss'] = row['swiss-prot']
    entry['length'] = row['sequence_length']
    
    symbols = row['symbol'].split(';')
    entry['symbol'] = symbols[0]
    
    if len(symbols) == 1:
        entry['alt_symbols'] = []
    else:
        entry['alt_symbols'] = symbols[1:]
    
    if row['3d'] == '(3)':
        entry['3d_struct_available'] = True
        entry['chromosome'] = row['chromosome']
    else:
        entry['3d_struct_available'] = False
        entry['chromosome'] = row['3d']
    
    sgd2info[row['sgd']] = entry

In [8]:
sgd2info['S000005299']


Out[8]:
{'3d_struct_available': True,
 'acc': 'Q00955',
 'alt_symbols': ['ABP2', 'FAS3', 'MTR7'],
 'chromosome': '14',
 'length': '2233',
 'locus': 'YNR016C',
 'swiss': 'ACAC_YEAST',
 'symbol': 'ACC1'}

Define GO Term Entry


In [ ]:
# Map from GO Term to genes
go2gene = {}

go2idset = {}

for idx, row in yeastAnnotation.iterrows():
    goterm = row['GO_ID']
    gene_id = row['DB_Object_ID']
    symbol = row['DB_Object_Symbol']
    full_name = str(row['DB_Object_Name']).replace('\r\n', '')
    
    
    # for gene info
    if gene_id in sgd2info:
        entry = sgd2info[gene_id]
        entry['name'] = full_name
    
    cur_entry = []
    
    if goterm in go2gene:
        cur_entry = go2gene[goterm]
        gene_set = go2idset[goterm]
    else:
        gene_set = set()
        go2idset[goterm] = gene_set
    
    ids = go2idset[goterm]
    
    if gene_id not in ids:
        gene = {
            'sgdid': gene_id,
            'symbol': symbol,
            'name': full_name
        }
    
        ids.add(gene_id)
        go2idset[goterm] = ids
        
        cur_entry.append(gene)
        go2gene[goterm] = cur_entry

In [ ]:
sgd2info['S000005299']

In [ ]:
class GoTerm(DocType):
    termid = Text(index='not_analyzed')
    name = Text(analyzer='standard')
    namespace = Text(analyzer='standard')
    definition = Text(analyzer='standard')
    parents = Object(multi=True)
    children = Object(multi=True)

    genes = Object(multi=True)
    
    class Meta:
        index = 'terms'

class Gene(DocType):
    id = Text(index='not_analyzed')
    symbol = Text(analyzer='standard')
    name = Text(analyzer='standard')
    synonyms = Text(analyzer='standard', multi=True)
    locus = Text(analyzer='standard')
    
    class Meta:
        index = 'genes'

In [ ]:
GoTerm.init()
Gene.init()

In [ ]:
from goatools import obo_parser
oboUrl = './data/go.obo'
obo = obo_parser.GODag(oboUrl, optional_attrs=['def'])

In [ ]:
def get_go_term(term):
    g = {}
    if term.id in go2gene:
        g = go2gene[term.id]
    
    parents = []
    children = []
    
    for p in term.parents:
        parents.append({'id': p.id, 'name': p.name})
    for c in term.children:
        children.append({'id': c.id, 'name': c.name})
    
    definition = term.defn.split('"')[1]
        
    return GoTerm(
        meta={'id':  term.id},
        termid=term.id,
        name=term.name,
        namespace=term.namespace,
        definition=definition,
        parents=parents,
        children=children,
        genes=g
)

print(connections.get_connection().cluster.health())

In [ ]:
def get_gene(gene, id):
    name = ''
    if 'name' in gene:
        name = gene['name']
    
    return Gene(
        meta={'id':  id},
        id = id,
        symbol = gene['symbol'],
        name = name,
        synonyms = gene['alt_symbols'],
        locus = gene['locus']
)

In [ ]:
es = Elasticsearch(host='localhost', port=9200)
pool = []

In [ ]:
term_ids = obo.keys()
print(len(term_ids))

for id in term_ids:    
    if id not in go_set:
        continue
    
    d = get_go_term(obo[id])
    term = {'_index': getattr(d.meta, 'index', d._doc_type.index), '_type': d._doc_type.name,  '_id': d.termid, '_source': d.to_dict()}
    pool.append(term)
    if len(pool) > 5000:
        print('Bulk add start:')
        helpers.bulk(es, pool)
        print('Bulk add success!')

        pool = []

if len(pool) > 0:
    print('Last: ' + str(len(pool)))
    helpers.bulk(es, pool)
    print('---------------success!')

In [ ]:
ids = sgd2info.keys()

print(len(ids))

for id in ids:    
    d = get_gene(sgd2info[id], id)
    term = {'_index': getattr(d.meta, 'index', d._doc_type.index), '_type': d._doc_type.name, '_id': d.id, '_source': d.to_dict()}
    pool.append(term)
    if len(pool) > 5000:
        print('Bulk add start:')
        helpers.bulk(es, pool)
        print('Bulk add success!')

        pool = []

if len(pool) > 0:
    print('Last: ' + str(len(pool)))
    helpers.bulk(es, pool)
    print('---------------success!')

In [ ]:
s = Search(using=es, index="_all").query("match", name='proteasome')

In [ ]:
response = s.execute()

In [ ]:
import json

for hit in response:
    print(json.dumps(hit.to_dict(), indent=4))