In [ ]:
!python --version
!wget http://geneontology.org/gene-associations/gene_association.sgd.gz -O ./data/gene_association.sgd.gz
!wget http://purl.obolibrary.org/obo/go.obo -O ./data/go.obo
!wget http://chianti.ucsd.edu/~kono/ci/data/deep-cell/raw-interactions_clixo_intTable_kei_pm_ranks.txt -O ./data/interaction-table-atgo.txt
In [2]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import numpy as np
from goatools import obo_parser
# Annotation file for the CLIXO terms
clixo_mapping = './data/alignments_FDR_0.1_t_0.1'
oboUrl = './data/go.obo'
clixo_align = pd.read_csv(clixo_mapping, sep='\t', names=['term', 'go', 'score', 'fdr', 'genes'])
print(clixo_align['score'].max())
clixo_align.tail(10)
Out[2]:
In [3]:
clixo2go = {}
for row in clixo_align.itertuples():
    c = str(row[1])
    go = row[2]
    val = {
        'go': go,
        'score': row[3].item(),
        'fdr': row[4].item(),
        'genes': row[5].item()
    }
    clixo2go[c] = val
print(clixo2go['10552'])
# Save to file (for updating CyJS file)
import json
with open('./data/clixo-mapping.json', 'w') as outfile:
    json.dump(clixo2go, outfile)
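A quick round-trip sketch (not part of the original run) to confirm the saved JSON is readable:
In [ ]:
# reload the saved mapping and spot-check one term
with open('./data/clixo-mapping.json') as infile:
    reloaded = json.load(infile)
print(reloaded['10552'])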
In [4]:
obo = obo_parser.GODag(oboUrl, optional_attrs=['def'])
In [5]:
# test data
obo['GO:0006563'].defn
Out[5]:
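The defn string follows the OBO convention: a quoted definition followed by bracketed references. The indexing code near the end of this notebook extracts just the quoted text; a minimal sketch of that parsing:
In [ ]:
# OBO definitions look like: "Some definition text." [GOC:ai]
def_raw = obo['GO:0006563'].defn
def_str = def_raw.split('"')[1]  # keep only the quoted definition text
print(def_str)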
In [6]:
# This directory should contain all of the YeastNet interaction files
# data_path = './data/raw-interactions'
# files = [f for f in listdir(data_path) if isfile(join(data_path, f))]
# files
In [7]:
# columns = ['source', 'target', 'score']
# all_interactions = pd.DataFrame(columns=columns)
# for f in files:
#     if not f.startswith('INT'):
#         continue
#     int_type = f.split('.')[1]
#     df = pd.read_csv(data_path + '/' + f, delimiter='\t', names=columns)
#     df['interaction'] = int_type
#     all_interactions = pd.concat([all_interactions, df])
# print(all_interactions.shape)
# all_interactions.head(10)
In [8]:
# all_interactions.to_csv('./data/raw-interactions/all.txt', sep='\t')
In [9]:
# CLIXO term to gene mapping
mapping = pd.read_csv('./data/raw-interactions/preds_yeastnet_no_gi_0.04_0.5.txt.propagate.mapping', delimiter='\t', names=['gene', 'term'])
mapping.head()
Out[9]:
In [10]:
mapping['term'].unique().shape # Number of terms
Out[10]:
In [11]:
# All ORF names in CLIXO
mixed_ids = mapping['gene'].unique()
print(mixed_ids.shape)
geneset = set()
for row in mapping.itertuples():
    geneset.add(row[1])
print(len(geneset))
In [12]:
# Import gene association file
yeastAnnotationUrl = './data/gene_association.sgd.gz'
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
yeastAnnotation = pd.read_csv(yeastAnnotationUrl, delimiter='\t', comment='!', compression='gzip', names=col_names)
yeastAnnotation.tail()
Out[12]:
In [13]:
# Mapping object: from any type of ID to SGD ID
to_sgd = {}
# Annotation for genes
sgd2fullname = {}
sgd2symbol = {}
for row in yeastAnnotation.itertuples():
    sgd = row[2]
    orf = row[3]
    full_name = str(row[10]).replace('\r\n', '')
    syn = str(row[11])
    syns = syn.split('|')
    to_sgd[orf] = sgd
    for synonym in syns:
        to_sgd[synonym] = sgd
    sgd2fullname[sgd] = full_name
    sgd2symbol[sgd] = orf
# Special case
to_sgd['AAD16'] = 'S000001837'
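Several cells below repeat the same normalization pattern: map an ID to SGD when a mapping exists, otherwise keep the raw ID. A small helper sketch capturing it (to_sgd_id is a hypothetical name, not used elsewhere in this notebook):
In [ ]:
# hypothetical helper mirroring the ID-normalization pattern used below
def to_sgd_id(gene_id):
    return to_sgd.get(gene_id, gene_id)

print(to_sgd_id('AAD16'))        # special case mapped above -> S000001837
print(to_sgd_id('NOT_A_GENE'))   # unknown IDs fall through unchanged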
In [14]:
normalized_map = []
for row in mapping.itertuples():
    gene = row[1]
    term = str(row[2])
    # Convert to SGD if a mapping exists; otherwise keep the original ID
    sgd = gene
    if gene in to_sgd:
        sgd = to_sgd[gene]
    entry = (sgd, term)
    normalized_map.append(entry)
In [15]:
# All ORF to SGD
all_sgd = [to_sgd.get(x, x) for x in mixed_ids]
if len(all_sgd) == len(mixed_ids):
    print('All mapped!')
# This contains all gene IDs (SGD ID)
uniq_genes = set(all_sgd)
len(uniq_genes)
Out[15]:
In [16]:
geneset_sgd = set()
for gene in geneset:
    if gene not in to_sgd:
        geneset_sgd.add(gene)
    else:
        geneset_sgd.add(to_sgd[gene])
len(geneset_sgd)
Out[16]:
In [17]:
df_genes = pd.DataFrame(list(uniq_genes))
# Save as text file and use it in UNIPROT ID Mapper
df_genes.to_csv('./data/all_sgd.txt', sep='\t', index=False, header=False)
In [18]:
uniprot = pd.read_csv('./data/uniprot-idmapping.txt', delimiter='\t')
print(uniprot.shape)
uniprot.head()
Out[18]:
In [19]:
sgd2orf = {}
for row in uniprot.itertuples():
    sgd = row[2]
    orf = row[11]
    sgd2orf[sgd] = orf
# Test: find SGD IDs that the UniProt mapping is missing
missing = set()
for sgd in uniq_genes:
    if sgd not in sgd2orf:
        missing.add(sgd)
print(len(missing))
print(missing)
In [20]:
idmap = pd.read_csv('./yeast_clean4.txt', delimiter='\t')
idmap.head()
Out[20]:
In [21]:
sgd2orf2 = {}
for row in idmap.itertuples():
    sgd = row[5]
    orf = row[2]
    sgd2orf2[sgd] = orf
# Fill in the IDs that were missing from the UniProt mapping
for sgd in missing:
    sgd2orf[sgd] = sgd2orf2[sgd]
# Test again: everything should be mapped now
missing = set()
for sgd in uniq_genes:
    if sgd not in sgd2orf:
        missing.add(sgd)
print(len(missing))
print(len(sgd2orf))
In [22]:
# Note: gene_map is declared here but never populated in this notebook,
# so the lookup against it further below finds nothing.
gene_map = {}
missing_count = 0
all_orf = set(sgd2orf.values())
len(all_orf)
Out[22]:
In [23]:
all_interactions = pd.read_csv('./data/interaction-table-atgo.txt', sep="\t")
all_interactions.head()
Out[23]:
In [24]:
## Normalize edge endpoint IDs
print('All interaction count: ' + str(all_interactions.shape))
# Register any source/target ID without an SGD mapping as an identity mapping
for row in all_interactions.itertuples():
    if row[1] not in to_sgd:
        to_sgd[row[1]] = row[1]
    if row[2] not in to_sgd:
        to_sgd[row[2]] = row[2]
In [25]:
term2gene = {}
for row in mapping.itertuples():
    term = str(row[2])
    # Normalize the gene ID to SGD, registering unknowns as identity
    if row[1] not in to_sgd:
        gene = row[1]
        to_sgd[gene] = gene
    else:
        gene = to_sgd[row[1]]
    assigned = term2gene.get(term, set())
    assigned.add(gene)
    term2gene[term] = assigned
print(len(term2gene))
print(term2gene['10000'])
In [26]:
original_cols = all_interactions.columns
original_names = original_cols[3:10].tolist()
col_names = ['source', 'target', 'interaction', 'score', 'reference']
print(original_names)
In [31]:
network = []
for row in all_interactions.itertuples():
    # Columns 3-9 hold the per-interaction-type scores
    for idx, col in enumerate(row):
        if idx < 3 or idx > 9:
            continue
        score = col.item()
        if score == 0:
            continue
        interaction = original_names[idx - 3]
        # The matching reference column is offset by 8 positions
        pub = str(row[idx + 8])
        new_row = (to_sgd[row[1]], to_sgd[row[2]], interaction, score, pub)
        network.append(new_row)
In [32]:
net_df = pd.DataFrame(network, columns=col_names)
net_df.head()
Out[32]:
In [53]:
print(net_df['score'].max())
print(net_df['score'].min())
In [35]:
net_df.to_csv('./data/clixo-raw-interactions.txt', sep='\t', encoding='utf-8', index=False)
In [36]:
# Create graph
import networkx as nx
# networkx >= 2.0 renamed from_pandas_dataframe to from_pandas_edgelist
g = nx.from_pandas_edgelist(net_df, source='source', target='target', edge_attr=['interaction', 'score', 'reference'])
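A quick sanity check on the assembled graph (a sketch, not in the original run). Note that a plain Graph collapses parallel interactions between the same pair into a single edge, so the edge count can be smaller than the number of rows in net_df:
In [ ]:
# compare graph size against the edge table
print(g.number_of_nodes(), 'nodes')
print(g.number_of_edges(), 'edges')
print(net_df.shape[0], 'rows in net_df')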
In [37]:
print(list(g.nodes())[10])
print(list(term2gene['10275']))
sub = g.subgraph(list(term2gene['10275']))
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
nx.draw_circular(sub)
list(sub.edges())[1]
Out[37]:
In [38]:
term2interaction = {}
for term in term2gene.keys():
    gset = term2gene[term]
    glist = list(gset)
    sub = g.subgraph(glist)
    edges = sub.edges(data=True)
    itrs = []
    # Skip huge terms to keep document size manageable
    if len(edges) > 8000:
        term2interaction[term] = itrs
        continue
    for e in edges:
        data = {
            'source': e[0],
            'target': e[1],
            'interaction': e[2]['interaction'],
            'score': e[2]['score']
        }
        itrs.append(data)
    term2interaction[term] = itrs
In [39]:
counts = []
for key in term2interaction.keys():
    counts.append(len(term2interaction[key]))
max(counts)
Out[39]:
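To see how those counts spread across terms, a quick histogram sketch (not in the original run; matplotlib was already imported above):
In [ ]:
# distribution of per-term interaction counts; terms over the 8000-edge
# cutoff above show up in the zero bin since their lists were left empty
plt.hist(counts, bins=50)
plt.xlabel('interactions per term')
plt.ylabel('number of terms')
plt.show()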
In [47]:
clixo_genes = {}
missing_name = set()
print(len(normalized_map))
for row in normalized_map:
    sgd = row[0]
    term = str(row[1])
    orf = sgd2orf[sgd]
    name = orf
    symbol = ''
    if sgd not in sgd2fullname:
        # No annotation entry: fall back to the ORF name
        missing_name.add(orf)
        name = orf
        symbol = orf
    else:
        name = sgd2fullname[sgd]
        symbol = sgd2symbol[sgd]
    entry = {
        'sgdid': sgd,
        'orf': orf,
        'name': name,
        'symbol': symbol
    }
    assigned_genes = []
    if term in clixo_genes:
        assigned_genes = clixo_genes[term]['genes']
    assigned_genes.append(entry)
    clixo_genes[term] = {
        'genes': assigned_genes
    }
print(missing_name)
print(len(clixo_genes))
In [48]:
for key in clixo_genes.keys():
    raw_interactions = []
    gene_list = clixo_genes[key]['genes']
    for gene in gene_list:
        sgd = gene['sgdid']
        if sgd in sgd2orf:
            orf = sgd2orf[sgd]
            if orf in gene_map:  # gene_map is empty; see note above
                raw_interactions.append(gene_map[orf])
    # Attach the subnetwork computed earlier for this term
    clixo_genes[key]['interactions'] = term2interaction[key]
In [ ]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(clixo_genes['10000'])
In [49]:
from datetime import datetime
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import DocType, Date, Integer, Keyword, Text, Object, Nested, Index, Double
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch, Match, Q
# Define a default Elasticsearch client
connections.create_connection(hosts=['localhost:9200'])
Out[49]:
In [50]:
# Class which represents a CLIXO term
class ClixoTerm(DocType):
    termid = Text(index='not_analyzed')
    name = Text(analyzer='standard')
    go = Object()
    gene_count = Integer(index='not_analyzed')
    genes = Object(multi=True)
    interactions = Object(multi=True)

    class Meta:
        index = 'terms'

def get_clixo_term(key, term):
    term_id = 'CLIXO:' + key
    name = term_id
    go = {
        'goid': 'N/A',
        'name': 'N/A',
        'definition': 'N/A',
    }
    gene_count = 0
    if key in clixo2go:
        go_alignment = clixo2go[key]
        goid = go_alignment['go']
        # The artificial root term has no real GO entry
        if goid == 'GO:00SUPER':
            return ClixoTerm(
                meta={'id': term_id},
                termid=term_id,
                name=name,
                go=go,
                gene_count=gene_count,
                genes=term['genes'],
                interactions=term2interaction[key])
        # Strip the quotes and references from the OBO definition string
        def_raw = obo[goid].defn
        def_str = def_raw.split('"')[1]
        go['goid'] = goid
        go['score'] = go_alignment['score']
        go['fdr'] = go_alignment['fdr']
        gene_count = go_alignment['genes']
        go['name'] = obo[goid].name
        go['definition'] = def_str
        name = obo[goid].name
    return ClixoTerm(
        meta={'id': term_id},
        termid=term_id,
        name=name,
        go=go,
        gene_count=gene_count,
        genes=term['genes'],
        interactions=term2interaction[key]
    )
In [51]:
print('init start==================')
ClixoTerm.init()
print('Init done ==================')
es = Elasticsearch(host='localhost', port=9200)
pool = []
print('Add start==================')
for term_key in clixo_genes.keys():
    d = get_clixo_term(term_key, clixo_genes[term_key])
    term = {
        '_index': getattr(d.meta, 'index', d._doc_type.index),
        '_type': d._doc_type.name,
        '_id': d.termid,
        '_source': d.to_dict()
    }
    pool.append(term)
    # Flush in batches of ~500 documents
    if len(pool) > 500:
        print('Bulk add start:')
        helpers.bulk(es, pool)
        print('Bulk add success!')
        pool = []
# Index any remaining documents
if len(pool) > 0:
    print('Last: ' + str(len(pool)))
    helpers.bulk(es, pool)
print('---------------success!')
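As a final check, a sketch querying the freshly built 'terms' index with the already-imported Search API (the query string 'transport' is just an example, not from the original run):
In [ ]:
# verify the documents are searchable
s = Search(using=es, index='terms').query('match', name='transport')
response = s.execute()
print(response.hits.total)
for hit in response[:5]:
    print(hit.termid, hit.name)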