import sys sys.path.append('./../') %load_ext autoreload %autoreload 2
In [1]:
from gensim.models import word2vec, doc2vec
# Pre-trained word2vec model (filename suggests 100-dim vectors, min word
# count 10, context window 5 — TODO confirm against training script).
model_trigram = word2vec.Word2Vec.load('models/trigram_100features_10minwords_5context')
# Pre-trained doc2vec model for document-level similarity.
model_doc2vec = doc2vec.Doc2Vec.load('models/doc2vec')
In [31]:
from threading import Thread
from time import sleep
from wiki_pubmed_fuzzy.ontology import get_ontology
import fuzzywuzzy.process as fuzzy_process
from fuzzywuzzy import fuzz
from wiki_pubmed_fuzzy import wiki
from wiki_pubmed_fuzzy import pubmed
from bot.lookup import search_doid
import NLP
#from xxx import xxx
from bot import lookup
# Result slot for fn_get_q: written from a worker thread, read after join().
query_results = None


def fn_get_q(query, names):
    """Fuzzy-match `query` against the ontology `names` and store the best
    (name, score) pair in the global `query_results`.

    Returns True on success, False if the match attempt raised.
    """
    global query_results
    try:
        query_results = fuzzy_process.extractOne(query, names, scorer=fuzz.WRatio)
        return True
    except Exception:
        # A bare `except:` here would also swallow KeyboardInterrupt/SystemExit.
        return False
# Result slot for fn_get_wiki: written from a worker thread, read after join().
wiki_results = None


def fn_get_wiki(query, names):
    """Look up the top Wikipedia header for `query`, fuzzy-match it against
    `names`, and store the best (name, score) pair in the global `wiki_results`.

    Returns True on success, False if the lookup or match raised.
    """
    global wiki_results
    try:
        header = wiki.get_top_headers(query, 1)[0]
        wiki_results = fuzzy_process.extractOne(header, names, scorer=fuzz.ratio)
        return True
    except Exception:
        # A bare `except:` here would also swallow KeyboardInterrupt/SystemExit.
        return False
# Result slot for fn_get_pubmed: written from a worker thread, read after join().
pubmed_results = None


def fn_get_pubmed(query, names):
    """Fetch the top PubMed title for `query`, fuzzy-match it against `names`,
    and store the best (name, score) pair in the global `pubmed_results`.

    Returns True on success, False if nothing was found or the call raised.
    """
    global pubmed_results
    try:
        titles = pubmed.get(query, topK=1)
        if titles is None:
            return False
        pubmed_results = fuzzy_process.extractOne(titles[0], names,
                                                  scorer=fuzz.partial_ratio)
        return True
    except Exception:
        # Consistent with fn_get_q/fn_get_wiki: an unhandled exception here
        # would otherwise die silently inside the worker thread.
        return False
'''main'''
## from bot
query = 'degenerative disease'


def find_answer(query, model_trigram, model_doc2vec):
    """Resolve a free-text disease query to a DOID ontology term.

    Returns (string, graph): a human-readable summary and a graph placeholder
    (currently always None). Exact name matches get confidence 100; otherwise
    three fuzzy back-ends (direct fuzzy match, Wikipedia, PubMed) run in
    parallel and the highest-scoring candidate wins.
    """
    global query_results, wiki_results, pubmed_results
    query = query.lower()
    # Load ontology and build name <-> DOID lookup tables.
    ontology = get_ontology('data/doid.obo')
    name2doid = {term.name: term.id for term in ontology.get_terms()}
    names = name2doid.keys()
    doid2name = {term.id: term.name for term in ontology.get_terms()}

    if query in name2doid:
        ## exact match
        doid = name2doid[query]
        confidence = 100
    else:
        # No exact match: fan out to the three fuzzy back-ends concurrently.
        # Reset the shared result slots so stale values from a previous call
        # are never mistaken for fresh ones.
        query_results = wiki_results = pubmed_results = None
        th_get_q = Thread(target=fn_get_q, args=(query, names))
        th_get_wiki = Thread(target=fn_get_wiki, args=(query, names))
        th_get_pubmed = Thread(target=fn_get_pubmed, args=(query, names))
        th_get_q.start()
        th_get_wiki.start()
        th_get_pubmed.start()

        ## search engine query --> vertices, p=100(NLP??); synonyms
        doids = set()
        doid_exact_results = search_doid(query, False, doids)
        print(doids)
        ## synonyms NLP
        #synonyms = NLP(query)
        ## new thread for NLP
        ## tree search on vertices (returned + synonyms)
        th_get_q.join()
        print(query_results)
        th_get_wiki.join()
        print(wiki_results)
        th_get_pubmed.join()
        print(pubmed_results)
        #prob_vec = NLP2(query, synonyms)

        ## final answer
        # extractOne returns (name, score); pick the highest-scoring
        # candidate. Previously `confidence` was left undefined on this path
        # and `doid` stayed None, crashing the summary formatting below.
        candidates = [r for r in (query_results, wiki_results, pubmed_results)
                      if r is not None]
        if candidates:
            best_name, confidence = max(candidates, key=lambda r: r[1])
            doid = name2doid.get(best_name)
        else:
            doid, confidence = None, 0

    ## draw graph
    graph = None
    # Guard the reverse lookup: doid may legitimately be None on this path.
    name = doid2name[doid] if doid is not None else None
    string = ("Query: {:}\n".format(query) +
              "{:}\n".format(doid) +
              "Name: {:}\n".format(name) +
              "Confidence: {:}%\n".format(confidence))
    return string, graph
In [29]:
# find_answer requires the two models as well; calling it with only the query
# raised TypeError (missing positional arguments).
print(find_answer(query, model_trigram, model_doc2vec)[0])
In [ ]:
In [ ]:
# Load the DOID ontology and build bidirectional name <-> DOID lookup tables.
ontology = get_ontology('../data/doid.obo')
name2doid = {}
doid2name = {}
for term in ontology.get_terms():
    name2doid[term.name] = term.id
    doid2name[term.id] = term.name
In [ ]:
In [3]:
import numpy as np
import re
In [3]:
# Collect the Wikipedia links referenced by the ontology and show one sample.
lst = wiki.get_links_from_ontology(ontology)
print('example:{:}'.format(repr(lst[10])))
In [4]:
# Fetch the raw HTML of one linked page and preview its first 1000 characters.
sample_link = lst[101]
page = wiki.get_html(sample_link)
page[:1000]
Out[4]:
In [17]:
string = "ventricular arrhythmia"
names = np.sort(name2doid.keys())
print fuzzy_process.extractOne(string, names, scorer=fuzz.token_set_ratio)
In [139]:
string = "Complete remission of hairy cell leukemia variant (HCL-v) complicated by red cell aplasia post treatment with rituximab."
print fuzzy_process.extractOne(string, names, scorer=fuzz.partial_ratio)
In [ ]:
In [18]:
query = "ventricular arrhythmia"
top = wiki.get_top_headers(query)
top
Out[18]:
In [20]:
# Fuzzy-match each candidate header against the ontology names.
for candidate in top:
    results = fuzzy_process.extractOne(candidate, names, scorer=fuzz.token_set_ratio)
    print(results)
In [ ]:
In [59]:
# NOTE(review): `wikipedia` is not imported anywhere in this file as visible
# here — presumably imported in a lost cell or via `wiki`; verify.
page = wikipedia.WikipediaPage(title='Cell_proliferation')
page.summary
Out[59]:
In [ ]:
# Ontology names with more than three space-separated tokens.
[nm for nm in names if len(nm.split(' ')) > 3]
In [49]:
# Fetch PubMed titles for an abbreviation query and fuzzy-match each title
# against the ontology names.
query = 'hcl-v'
titles = pubmed.get(query)
titles_len = [len(t) for t in titles]
for idx, title in enumerate(titles, start=1):
    print("%d) %s" % (idx, title))
    print(fuzzy_process.extractOne(title, names, scorer=fuzz.partial_ratio))
    print('')
def find_synonym(s_ref, s):
    """Given a title `s_ref` containing abbreviation `s` in parentheses,
    return the spelled-out phrase immediately preceding "(s)", or None.

    Heuristic: the phrase starts at the n-th-from-last uppercase letter
    before the parenthesis, where n is the number of uppercase letters in
    the abbreviation.
    """
    last = s_ref.find('(' + s + ')')
    if last == -1:
        return None
    n_upper = sum(1 for c in s if c.isupper())
    uppers = [i for i, c in enumerate(s_ref[:last]) if c.isupper()]
    # Guard: previously an all-lowercase abbreviation hit the [-0] == [0]
    # pitfall, and a prefix with too few uppercase letters raised IndexError.
    if n_upper == 0 or len(uppers) < n_upper:
        return None
    first = uppers[-n_upper]
    return s_ref[first:last - 1]


print(find_synonym('Wolff-Parkinson-White syndrome (WPW) and athletes: Darwin at play?', 'WPW'))
In [27]:
import utils
# Exercise the utils version on an uppercase and a mixed-case abbreviation.
for ref, abbr in (
    ('Wolff-Parkinson-White syndrome (WPW) and athletes: Darwin at play?', 'WPW'),
    ('Complete remission of hairy cell leukemia variant (HCL-v)...', 'hcl-v'),
):
    print(utils.find_synonym(ref, abbr))
In [29]:
# Sample asymmetric-distance check between a term and a reference phrase.
s_ref, s = 'artery disease', 'nonartery'
print(utils.assym_dist(s, s_ref))
In [30]:
# Compare average ontology-name length with average PubMed-title length.
mean_name_len = np.mean([len(term.name) for term in ontology.get_terms()])
print('Mean term name length: {}'.format(mean_name_len))
print('Mean article title length: {}'.format(np.mean(titles_len)))
In [31]:
# Build a deduplicated vocabulary of ontology-name tokens (split on space
# or hyphen), keeping only tokens of length >= 4.
tokenized = [re.split(' |-', term.name) for term in ontology.get_terms()]
flat = [tok for toks in tokenized for tok in toks if tok]
words = np.unique(flat)
words = [w for w in words if len(w) >= 4]
words[:10]
Out[31]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: