In [ ]:
!python --version
!wget http://geneontology.org/gene-associations/gene_association.sgd.gz -O ./data/gene_association.sgd.gz
!wget http://purl.obolibrary.org/obo/go.obo -O ./data/go.obo
In [1]:
# Create list of all terms from the GO Tree file
import pandas as pd

treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'

# Load the tree data: tab-separated, no header row, so supply column names
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, sep='\t', names=treeColNames)
tree.tail()
Out[1]:
In [2]:
# Extract GO terms in the tree
p_list = tree['parent']
c_list = tree['child']
print(p_list.shape)
print(c_list.shape)

# Stack parents and children into one series, then deduplicate
all_list = pd.concat([p_list, c_list])
print(all_list.shape)
all_set = all_list.unique()
print(all_set.shape)

# Keep only the GO accessions (entries starting with 'GO:')
go_set = {t for t in all_set if t.startswith('GO:')}
print(len(go_set))
In [3]:
# Imports and configuration for the Elasticsearch loading pipeline.
# Cleanup: the duplicate `from elasticsearch import Elasticsearch` and the
# redundant second `import pandas as pd` were removed; names are unchanged.
from datetime import datetime

import pandas as pd

from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import DocType, Date, Integer, Keyword, Text, Object, Nested, Index
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.query import MultiMatch, Match, Q

# Define a default Elasticsearch client
connections.create_connection(hosts=['localhost:9200'])

# Data sources (local files were fetched by the download cell above)
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
oboUrl = './data/go.obo'
yeastAnnotationUrl = './data/gene_association.sgd.gz'
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'
phenotypeUrl = 'http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'
In [4]:
# Load the gzipped SGD gene-association file; lines starting with '!' are
# header/comment lines and are skipped. Column names come from a side file.
yeastAnnotationUrl = './data/gene_association.sgd.gz'
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = list(cols['col_names'])
print(col_names)
yeastAnnotation = pd.read_csv(
    yeastAnnotationUrl, sep='\t', comment='!',
    compression='gzip', names=col_names)
yeastAnnotation.tail()
Out[4]:
In [5]:
# Load the SGD phenotype table; column names come from a side file
pUrl = 'http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'
p_cols = pd.read_csv('./p_cols.txt', names=['col_names'])
p_col_names = list(p_cols['col_names'])
print(p_col_names)
phenotype = pd.read_csv(pUrl, sep='\t', names=p_col_names)
In [6]:
# Load the pre-cleaned yeast ID-mapping table (tab-delimited local file)
idmap = pd.read_csv('./yeast_clean4.txt', delimiter='\t')
idmap.head()
Out[6]:
In [7]:
# Create useful map for ID mapping: SGD ID -> gene info dict with keys
# locus, acc, swiss, length, symbol, alt_symbols, 3d_struct_available,
# chromosome.
sgd2info = {}
for idx, row in idmap.iterrows():
    entry = {}
    entry['locus'] = row['locus_name']
    entry['acc'] = row['acc_number']
    entry['swiss'] = row['swiss-prot']
    entry['length'] = row['sequence_length']
    # Guard: a missing symbol comes through as NaN (a float), on which
    # .split() would raise AttributeError. Fall back to an empty string.
    raw_symbol = row['symbol'] if isinstance(row['symbol'], str) else ''
    # First symbol is the primary one; the rest are alternates
    symbols = raw_symbol.split(';')
    entry['symbol'] = symbols[0]
    entry['alt_symbols'] = symbols[1:] if len(symbols) > 1 else []
    # The '3d' column holds the marker '(3)' when a 3D structure exists;
    # otherwise it holds the chromosome value (columns are shifted).
    if row['3d'] == '(3)':
        entry['3d_struct_available'] = True
        entry['chromosome'] = row['chromosome']
    else:
        entry['3d_struct_available'] = False
        entry['chromosome'] = row['3d']
    sgd2info[row['sgd']] = entry
In [8]:
# Spot-check a single entry of the ID map
sgd2info['S000005299']
Out[8]:
In [ ]:
# Map from GO Term to genes
go2gene = {}   # GO ID -> list of gene dicts ({sgdid, symbol, name})
go2idset = {}  # GO ID -> set of SGD IDs already added (dedup helper)
for idx, row in yeastAnnotation.iterrows():
    goterm = row['GO_ID']
    gene_id = row['DB_Object_ID']
    symbol = row['DB_Object_Symbol']
    # DB_Object_Name may be NaN -> str() first; strip embedded CRLFs
    full_name = str(row['DB_Object_Name']).replace('\r\n', '')
    # for gene info: enrich the ID-map entry with the full gene name
    if gene_id in sgd2info:
        entry = sgd2info[gene_id]
        entry['name'] = full_name
    cur_entry = []
    if goterm in go2gene:
        # Term seen before: reuse its gene list and ID set
        cur_entry = go2gene[goterm]
        gene_set = go2idset[goterm]
    else:
        gene_set = set()
        go2idset[goterm] = gene_set
    ids = go2idset[goterm]
    # Only append each gene once per GO term
    if gene_id not in ids:
        gene = {
            'sgdid': gene_id,
            'symbol': symbol,
            'name': full_name
        }
        ids.add(gene_id)
        # NOTE(review): this re-assignment is redundant (ids IS the set
        # already stored in go2idset) but harmless
        go2idset[goterm] = ids
        cur_entry.append(gene)
    go2gene[goterm] = cur_entry
In [ ]:
# Re-check the same entry: the annotation pass above should have added 'name'
sgd2info['S000005299']
In [ ]:
class GoTerm(DocType):
    """Elasticsearch document for one GO term, with tree neighbors and genes."""
    # Keyword is the exact-match (non-analyzed) field type; the original
    # `Text(index='not_analyzed')` is pre-ES5 mapping syntax that is no
    # longer valid for the ES5+ elasticsearch_dsl that provides DocType.
    termid = Keyword()
    name = Text(analyzer='standard')
    namespace = Text(analyzer='standard')
    definition = Text(analyzer='standard')
    parents = Object(multi=True)
    children = Object(multi=True)
    genes = Object(multi=True)

    class Meta:
        # Documents are stored in the 'terms' index
        index = 'terms'
class Gene(DocType):
    """Elasticsearch document for one yeast gene."""
    # Keyword is the exact-match (non-analyzed) field type; the original
    # `Text(index='not_analyzed')` is pre-ES5 mapping syntax that is no
    # longer valid for the ES5+ elasticsearch_dsl that provides DocType.
    id = Keyword()
    symbol = Text(analyzer='standard')
    name = Text(analyzer='standard')
    synonyms = Text(analyzer='standard', multi=True)
    locus = Text(analyzer='standard')

    class Meta:
        # Documents are stored in the 'genes' index
        index = 'genes'
In [ ]:
# Create the 'terms' and 'genes' index mappings in Elasticsearch
GoTerm.init()
Gene.init()
In [ ]:
from goatools import obo_parser

# Parse the GO DAG; optional_attrs=['def'] loads term definitions
# (consumed below via the `defn` attribute in get_go_term)
oboUrl = './data/go.obo'
obo = obo_parser.GODag(oboUrl, optional_attrs=['def'])
In [ ]:
def get_go_term(term):
    """Convert a goatools GO term node into a GoTerm Elasticsearch document.

    term: a node from the parsed OBO DAG (has id, name, namespace, defn,
    parents, children attributes).
    Returns a GoTerm with its annotated genes looked up from go2gene.
    """
    # Fix: default to an empty LIST, not {}. go2gene values are lists, so
    # the original produced an inconsistently-typed `genes` field for
    # terms with no annotations.
    g = go2gene.get(term.id, [])
    parents = [{'id': p.id, 'name': p.name} for p in term.parents]
    children = [{'id': c.id, 'name': c.name} for c in term.children]
    # OBO 'def' lines wrap the definition text in double quotes; fall back
    # to the raw string instead of raising IndexError when quotes are absent.
    defn_parts = term.defn.split('"')
    definition = defn_parts[1] if len(defn_parts) > 1 else term.defn
    return GoTerm(
        meta={'id': term.id},
        termid=term.id,
        name=term.name,
        namespace=term.namespace,
        definition=definition,
        parents=parents,
        children=children,
        genes=g
    )
# Sanity check: confirm the Elasticsearch cluster is reachable and report health
print(connections.get_connection().cluster.health())
In [ ]:
def get_gene(gene, id):
    """Build a Gene Elasticsearch document from one sgd2info entry.

    gene: info dict with 'symbol', 'alt_symbols', 'locus' and optionally
    'name' keys; id: the SGD identifier used as the document ID.
    """
    # 'name' is only present for genes seen in the annotation file;
    # default to an empty string otherwise.
    return Gene(
        meta={'id': id},
        id=id,
        symbol=gene['symbol'],
        name=gene.get('name', ''),
        synonyms=gene['alt_symbols'],
        locus=gene['locus']
    )
In [ ]:
# Low-level client used for bulk indexing; `pool` accumulates bulk actions
es = Elasticsearch(host='localhost', port=9200)
pool = []
In [ ]:
# Bulk-index every GO term that appears in the collapsed tree.
term_ids = obo.keys()
print(len(term_ids))
for id in term_ids:
    # Skip OBO terms that are not part of the tree
    if id not in go_set:
        continue
    d = get_go_term(obo[id])
    # Raw bulk-action dict for elasticsearch.helpers.bulk
    term = {'_index': getattr(d.meta, 'index', d._doc_type.index), '_type': d._doc_type.name, '_id': d.termid, '_source': d.to_dict()}
    pool.append(term)
    # Flush in chunks of ~5000 actions to bound memory use
    if len(pool) > 5000:
        print('Bulk add start:')
        helpers.bulk(es, pool)
        print('Bulk add success!')
        pool = []
# Flush whatever remains after the loop
if len(pool) > 0:
    print('Last: ' + str(len(pool)))
    helpers.bulk(es, pool)
print('---------------success!')
In [ ]:
# Bulk-index every gene from the ID map (same chunked pattern as the
# term-indexing cell above).
ids = sgd2info.keys()
print(len(ids))
for id in ids:
    d = get_gene(sgd2info[id], id)
    # Raw bulk-action dict for elasticsearch.helpers.bulk
    term = {'_index': getattr(d.meta, 'index', d._doc_type.index), '_type': d._doc_type.name, '_id': d.id, '_source': d.to_dict()}
    pool.append(term)
    # Flush in chunks of ~5000 actions to bound memory use
    if len(pool) > 5000:
        print('Bulk add start:')
        helpers.bulk(es, pool)
        print('Bulk add success!')
        pool = []
# Flush whatever remains after the loop
if len(pool) > 0:
    print('Last: ' + str(len(pool)))
    helpers.bulk(es, pool)
print('---------------success!')
In [ ]:
# Build a match query on the 'name' field across all indices (terms + genes)
s = Search(using=es, index="_all").query("match", name='proteasome')
In [ ]:
# Run the search against Elasticsearch
response = s.execute()
In [ ]:
import json

# Pretty-print each search hit as indented JSON
for hit in response:
    print(json.dumps(hit.to_dict(), indent=4))