In [ ]:

    
!python --version

!wget http://geneontology.org/gene-associations/gene_association.sgd.gz -O ./data/gene_association.sgd.gz
!wget http://purl.obolibrary.org/obo/go.obo -O ./data/go.obo

Term Property Generator

Introduction

This is a script to convert public data sets into a searchable, local Elasticsearch DB.

Requirments

Python 3.x
Elasticsearch 5.x



In [1]:

    
# Create list of all terms from the GO Tree file
import pandas as pd
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'

# Load the tree data
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, delimiter='\t', names=treeColNames)
tree.tail()









    Out[1]:






  
    
      
      parent
      child
      type
      in_tree
    
  
  
    
      441932
      GO:0015031
      YHR083W
      gene
      NOT_TREE
    
    
      441933
      GO:1902582
      YHR083W
      gene
      NOT_TREE
    
    
      441934
      GO:1902580
      YHR083W
      gene
      NOT_TREE
    
    
      441935
      GO:0098799
      YHR083W
      gene
      NOT_TREE
    
    
      441936
      GO:0098798
      YHR083W
      gene
      NOT_TREE



In [2]:

    
# Extract GO terms in the tree

p_list = tree['parent']
c_list = tree['child']
print(p_list.shape)
print(c_list.shape)

all_list = pd.concat([p_list, c_list])
print(all_list.shape)
all_set = all_list.unique()

print(all_set.shape)

go_set = set()

for t in all_set:
    if t.startswith('GO:'):
        go_set.add(t)

print(len(go_set))









    



(441937,)
(441937,)
(883874,)
(13037,)
6618



In [3]:

    
from elasticsearch import Elasticsearch
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Integer, Keyword, Text, Object, Nested, Index
from elasticsearch_dsl.connections import connections
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import Search
import pandas as pd

from elasticsearch_dsl.query import MultiMatch, Match, Q


# Define a default Elasticsearch client
connections.create_connection(hosts=['localhost:9200'])

treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
oboUrl = './data/go.obo'
yeastAnnotationUrl = './data/gene_association.sgd.gz'
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'

phenotypeUrl='http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'

Load gene associations



In [4]:

    
yeastAnnotationUrl = './data/gene_association.sgd.gz'
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
print(col_names)

yeastAnnotation = pd.read_csv(yeastAnnotationUrl, delimiter='\t', comment='!', compression='gzip', names=col_names)
yeastAnnotation.tail()









    



['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB:Reference', 'Evidence', 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'taxon', 'Date', 'Assigned_by', 'Annotation_Extension', 'Gene_Product_Form_ID']






    Out[4]:






  
    
      
      DB
      DB_Object_ID
      DB_Object_Symbol
      Qualifier
      GO_ID
      DB:Reference
      Evidence
      With_or_From
      Aspect
      DB_Object_Name
      DB_Object_Synonym
      DB_Object_Type
      taxon
      Date
      Assigned_by
      Annotation_Extension
      Gene_Product_Form_ID
    
  
  
    
      111384
      SGD
      S000006732
      tX(XXX)L
      NaN
      GO:0030533
      SGD_REF:S000181097|PMID:9023104
      ISM
      NaN
      F
      tRNA of undetermined specificity, predicted by...
      tX(XXX)L|tS(GCU)L
      gene
      taxon:559292
      20030507
      SGD
      NaN
      NaN
    
    
      111385
      SGD
      S000006732
      tX(XXX)L
      NaN
      GO:0005829
      SGD_REF:S000181097|PMID:9023104
      IC
      GO:0030533
      C
      tRNA of undetermined specificity, predicted by...
      tX(XXX)L|tS(GCU)L
      gene
      taxon:559292
      20030507
      SGD
      NaN
      NaN
    
    
      111386
      SGD
      S000007338
      tY(GUA)Q
      NaN
      GO:0070125
      SGD_REF:S000181097|PMID:9023104
      IC
      GO:0030533
      P
      Mitochondrial tyrosine tRNA (tRNA-Tyr)
      tY(GUA)Q
      gene
      taxon:559292
      20150730
      SGD
      NaN
      NaN
    
    
      111387
      SGD
      S000007338
      tY(GUA)Q
      NaN
      GO:0005739
      SGD_REF:S000181097|PMID:9023104
      IC
      GO:0030533
      C
      Mitochondrial tyrosine tRNA (tRNA-Tyr)
      tY(GUA)Q
      gene
      taxon:559292
      20030507
      SGD
      NaN
      NaN
    
    
      111388
      SGD
      S000007338
      tY(GUA)Q
      NaN
      GO:0030533
      SGD_REF:S000181097|PMID:9023104
      ISM
      NaN
      F
      Mitochondrial tyrosine tRNA (tRNA-Tyr)
      tY(GUA)Q
      gene
      taxon:559292
      20060721
      SGD
      NaN
      NaN

Phenotype



In [5]:

    
pUrl = 'http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'

p_cols = pd.read_csv('./p_cols.txt', names=['col_names'])
p_col_names = p_cols['col_names'].tolist()
print(p_col_names)

phenotype = pd.read_csv(pUrl, delimiter='\t', names=p_col_names)









    



['Feature_Name', 'Feature_Type', 'Gene_Name', 'SGDID', 'Reference', 'Experiment_Type', 'Mutant_Type', 'Allele', 'Strain_Background', 'Phenotype', 'Chemical', 'Condition', 'Details', 'Reporter']

ID Mapping Table



In [6]:

    
idmap = pd.read_csv('./yeast_clean4.txt', delimiter='\t')
idmap.head()









    Out[6]:






  
    
      
      symbol
      locus_name
      acc_number
      swiss-prot
      sgd
      sequence_length
      3d
      chromosome
    
  
  
    
      0
      AAC1
      YMR056C
      P04710
      ADT1_YEAST
      S000004660
      309
      13
      NaN
    
    
      1
      AAC3
      YBR085W
      P18238
      ADT3_YEAST
      S000000289
      307
      (3)
      2
    
    
      2
      AAD10
      YJR155W
      P47182
      AAD10_YEAST
      S000003916
      288
      10
      NaN
    
    
      3
      AAD14
      YNL331C
      P42884
      AAD14_YEAST
      S000005275
      376
      14
      NaN
    
    
      4
      AAD15
      YOL165C
      Q08361
      AAD15_YEAST
      S000005525
      143
      15
      NaN



In [7]:

    
# Create usuful map for ID mapping
sgd2info = {}

for idx, row in idmap.iterrows():
    entry = {}
    entry['locus'] = row['locus_name']
    entry['acc'] = row['acc_number']
    entry['swiss'] = row['swiss-prot']
    entry['length'] = row['sequence_length']
    
    symbols = row['symbol'].split(';')
    entry['symbol'] = symbols[0]
    
    if len(symbols) == 1:
        entry['alt_symbols'] = []
    else:
        entry['alt_symbols'] = symbols[1:]
    
    if row['3d'] == '(3)':
        entry['3d_struct_available'] = True
        entry['chromosome'] = row['chromosome']
    else:
        entry['3d_struct_available'] = False
        entry['chromosome'] = row['3d']
    
    sgd2info[row['sgd']] = entry



In [8]:

    
sgd2info['S000005299']









    Out[8]:





{'3d_struct_available': True,
 'acc': 'Q00955',
 'alt_symbols': ['ABP2', 'FAS3', 'MTR7'],
 'chromosome': '14',
 'length': '2233',
 'locus': 'YNR016C',
 'swiss': 'ACAC_YEAST',
 'symbol': 'ACC1'}

Define GO Term Entry



In [ ]:

    
# Map from GO Term to genes
go2gene = {}

go2idset = {}

for idx, row in yeastAnnotation.iterrows():
    goterm = row['GO_ID']
    gene_id = row['DB_Object_ID']
    symbol = row['DB_Object_Symbol']
    full_name = str(row['DB_Object_Name']).replace('\r\n', '')
    
    
    # for gene info
    if gene_id in sgd2info:
        entry = sgd2info[gene_id]
        entry['name'] = full_name
    
    cur_entry = []
    
    if goterm in go2gene:
        cur_entry = go2gene[goterm]
        gene_set = go2idset[goterm]
    else:
        gene_set = set()
        go2idset[goterm] = gene_set
    
    ids = go2idset[goterm]
    
    if gene_id not in ids:
        gene = {
            'sgdid': gene_id,
            'symbol': symbol,
            'name': full_name
        }
    
        ids.add(gene_id)
        go2idset[goterm] = ids
        
        cur_entry.append(gene)
        go2gene[goterm] = cur_entry



In [ ]:

    
sgd2info['S000005299']



In [ ]:

    
class GoTerm(DocType):
    termid = Text(index='not_analyzed')
    name = Text(analyzer='standard')
    namespace = Text(analyzer='standard')
    definition = Text(analyzer='standard')
    parents = Object(multi=True)
    children = Object(multi=True)

    genes = Object(multi=True)
    
    class Meta:
        index = 'terms'

class Gene(DocType):
    id = Text(index='not_analyzed')
    symbol = Text(analyzer='standard')
    name = Text(analyzer='standard')
    synonyms = Text(analyzer='standard', multi=True)
    locus = Text(analyzer='standard')
    
    class Meta:
        index = 'genes'



In [ ]:

    
GoTerm.init()
Gene.init()



In [ ]:

    
from goatools import obo_parser
oboUrl = './data/go.obo'
obo = obo_parser.GODag(oboUrl, optional_attrs=['def'])



In [ ]:

    
def get_go_term(term):
    g = {}
    if term.id in go2gene:
        g = go2gene[term.id]
    
    parents = []
    children = []
    
    for p in term.parents:
        parents.append({'id': p.id, 'name': p.name})
    for c in term.children:
        children.append({'id': c.id, 'name': c.name})
    
    definition = term.defn.split('"')[1]
        
    return GoTerm(
        meta={'id':  term.id},
        termid=term.id,
        name=term.name,
        namespace=term.namespace,
        definition=definition,
        parents=parents,
        children=children,
        genes=g
)

print(connections.get_connection().cluster.health())



In [ ]:

    
def get_gene(gene, id):
    name = ''
    if 'name' in gene:
        name = gene['name']
    
    return Gene(
        meta={'id':  id},
        id = id,
        symbol = gene['symbol'],
        name = name,
        synonyms = gene['alt_symbols'],
        locus = gene['locus']
)



In [ ]:

    
es = Elasticsearch(host='localhost', port=9200)
pool = []



In [ ]:

    
term_ids = obo.keys()
print(len(term_ids))

for id in term_ids:    
    if id not in go_set:
        continue
    
    d = get_go_term(obo[id])
    term = {'_index': getattr(d.meta, 'index', d._doc_type.index), '_type': d._doc_type.name,  '_id': d.termid, '_source': d.to_dict()}
    pool.append(term)
    if len(pool) > 5000:
        print('Bulk add start:')
        helpers.bulk(es, pool)
        print('Bulk add success!')

        pool = []

if len(pool) > 0:
    print('Last: ' + str(len(pool)))
    helpers.bulk(es, pool)
    print('---------------success!')



In [ ]:

    
ids = sgd2info.keys()

print(len(ids))

for id in ids:    
    d = get_gene(sgd2info[id], id)
    term = {'_index': getattr(d.meta, 'index', d._doc_type.index), '_type': d._doc_type.name, '_id': d.id, '_source': d.to_dict()}
    pool.append(term)
    if len(pool) > 5000:
        print('Bulk add start:')
        helpers.bulk(es, pool)
        print('Bulk add success!')

        pool = []

if len(pool) > 0:
    print('Last: ' + str(len(pool)))
    helpers.bulk(es, pool)
    print('---------------success!')



In [ ]:

    
s = Search(using=es, index="_all").query("match", name='proteasome')



In [ ]:

    
response = s.execute()



In [ ]:

    
import json

for hit in response:
    print(json.dumps(hit.to_dict(), indent=4))

	parent	child	type	in_tree
441932	GO:0015031	YHR083W	gene	NOT_TREE
441933	GO:1902582	YHR083W	gene	NOT_TREE
441934	GO:1902580	YHR083W	gene	NOT_TREE
441935	GO:0098799	YHR083W	gene	NOT_TREE
441936	GO:0098798	YHR083W	gene	NOT_TREE

	DB	DB_Object_ID	DB_Object_Symbol	Qualifier	GO_ID	DB:Reference	Evidence	With_or_From	Aspect	DB_Object_Name	DB_Object_Synonym	DB_Object_Type	taxon	Date	Assigned_by	Annotation_Extension	Gene_Product_Form_ID
111384	SGD	S000006732	tX(XXX)L	NaN	GO:0030533	SGD_REF:S000181097\|PMID:9023104	ISM	NaN	F	tRNA of undetermined specificity, predicted by...	tX(XXX)L\|tS(GCU)L	gene	taxon:559292	20030507	SGD	NaN	NaN
111385	SGD	S000006732	tX(XXX)L	NaN	GO:0005829	SGD_REF:S000181097\|PMID:9023104	IC	GO:0030533	C	tRNA of undetermined specificity, predicted by...	tX(XXX)L\|tS(GCU)L	gene	taxon:559292	20030507	SGD	NaN	NaN
111386	SGD	S000007338	tY(GUA)Q	NaN	GO:0070125	SGD_REF:S000181097\|PMID:9023104	IC	GO:0030533	P	Mitochondrial tyrosine tRNA (tRNA-Tyr)	tY(GUA)Q	gene	taxon:559292	20150730	SGD	NaN	NaN
111387	SGD	S000007338	tY(GUA)Q	NaN	GO:0005739	SGD_REF:S000181097\|PMID:9023104	IC	GO:0030533	C	Mitochondrial tyrosine tRNA (tRNA-Tyr)	tY(GUA)Q	gene	taxon:559292	20030507	SGD	NaN	NaN
111388	SGD	S000007338	tY(GUA)Q	NaN	GO:0030533	SGD_REF:S000181097\|PMID:9023104	ISM	NaN	F	Mitochondrial tyrosine tRNA (tRNA-Tyr)	tY(GUA)Q	gene	taxon:559292	20060721	SGD	NaN	NaN

	symbol	locus_name	acc_number	swiss-prot	sgd	sequence_length	3d	chromosome
0	AAC1	YMR056C	P04710	ADT1_YEAST	S000004660	309	13	NaN
1	AAC3	YBR085W	P18238	ADT3_YEAST	S000000289	307	(3)	2
2	AAD10	YJR155W	P47182	AAD10_YEAST	S000003916	288	10	NaN
3	AAD14	YNL331C	P42884	AAD14_YEAST	S000005275	376	14	NaN
4	AAD15	YOL165C	Q08361	AAD15_YEAST	S000005525	143	15	NaN