import sys sys.path.append('./../') %load_ext autoreload %autoreload 2


In [1]:
from gensim.models import word2vec, doc2vec
# Load pre-trained models from disk. NOTE(review): this raises
# AttributeError: 'call_on_class_only' (see traceback below) — the pickles
# were presumably saved under a different gensim version; re-save the models
# or pin gensim to the version that produced them.
model_trigram = word2vec.Word2Vec.load('models/trigram_100features_10minwords_5context')
model_doc2vec = doc2vec.Doc2Vec.load('models/doc2vec')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-1-398a1a2ddf0c> in <module>()
      1 from gensim.models import word2vec, doc2vec
----> 2 model_trigram = word2vec.Word2Vec.load('models/trigram_100features_10minwords_5context')
      3 model_doc2vec = doc2vec.Doc2Vec.load('models/doc2vec')

C:\Python27\lib\site-packages\gensim\models\word2vec.pyc in load(cls, *args, **kwargs)
   1483     @classmethod
   1484     def load(cls, *args, **kwargs):
-> 1485         model = super(Word2Vec, cls).load(*args, **kwargs)
   1486         # update older models
   1487         if hasattr(model, 'table'):

C:\Python27\lib\site-packages\gensim\utils.pyc in load(cls, fname, mmap)
    246         compress, subname = SaveLoad._adapt_by_suffix(fname)
    247 
--> 248         obj = unpickle(fname)
    249         obj._load_specials(fname, mmap, compress, subname)
    250         return obj

C:\Python27\lib\site-packages\gensim\utils.pyc in unpickle(fname)
    910     with smart_open(fname) as f:
    911         # Because of loading from S3 load can't be used (missing readline in smart_open)
--> 912         return _pickle.loads(f.read())
    913 
    914 

AttributeError: 'module' object has no attribute 'call_on_class_only'

Threading


In [31]:
from threading import Thread
from time import sleep

from wiki_pubmed_fuzzy.ontology import get_ontology
import fuzzywuzzy.process as fuzzy_process
from fuzzywuzzy import fuzz
from wiki_pubmed_fuzzy import wiki
from wiki_pubmed_fuzzy import pubmed

from bot.lookup import search_doid
import NLP
#from xxx import xxx 

from bot import lookup

query_results = None  # (best_name, score) from the direct fuzzy match, set by fn_get_q
def fn_get_q(query, names):
    """Fuzzy-match `query` directly against the ontology `names`.

    Worker for a Thread: stores fuzzywuzzy's (name, score) result in the
    global `query_results` so the caller can read it after join().
    Returns True on success, False on any failure.
    """
    try:
        global query_results
        query_results = fuzzy_process.extractOne(query, names, scorer=fuzz.WRatio)
        return True
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not silently swallowed inside the worker thread
        return False

wiki_results = None  # (best_name, score) via the top Wikipedia header, set by fn_get_wiki
def fn_get_wiki(query, names):
    """Search Wikipedia for `query` and fuzzy-match its top page title
    against the ontology `names`.

    Worker for a Thread: stores the (name, score) result in the global
    `wiki_results`. Returns True on success, False on any failure
    (e.g. no search hits).
    """
    try:
        global wiki_results
        header = wiki.get_top_headers(query, 1)[0]
        wiki_results = fuzzy_process.extractOne(header, names, scorer=fuzz.ratio)
        #sleep(0.1)
        return True
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not silently swallowed inside the worker thread
        return False

pubmed_results = None  # (best_name, score) via the top PubMed title, set by fn_get_pubmed
def fn_get_pubmed(query, names):
    """Fetch the top PubMed title for `query` and fuzzy-match it against
    the ontology `names`.

    Worker for a Thread: stores the (name, score) result in the global
    `pubmed_results`. Returns True on success, False otherwise.
    """
    global pubmed_results
    try:
        string = pubmed.get(query, topK=1)
        if string is not None:
            string = string[0]
            pubmed_results = fuzzy_process.extractOne(string, names, scorer=fuzz.partial_ratio)
            return True
        else:
            return False
    except Exception:
        # consistent with fn_get_q/fn_get_wiki: a network/parse error must
        # not kill the worker thread with an uncaught exception
        return False

'''main'''
## from bot
# example query used by the demo cells below
query = 'degenerative disease'

def find_answer(query, model_trigram, model_doc2vec):
    query = query.lower()
    
    # load ontology
    ontology = get_ontology('data/doid.obo')
    name2doid = {term.name: term.id for term in ontology.get_terms()}
    names = name2doid.keys()
    doid2name = {term.id: term.name for term in ontology.get_terms()}
    
    ## exact match
    if query in name2doid.keys():
        doid = name2doid[query]
        confidence = 100
    else:
        # no exact match
        th_get_q = Thread(target = fn_get_q, args = (query,names,))
        th_get_wiki = Thread(target = fn_get_wiki, args = (query,names,))
        th_get_pubmed = Thread(target = fn_get_pubmed, args = (query,names,))

        th_get_q.start()
        th_get_wiki.start()
        th_get_pubmed.start()

        ## search engine query --> vertices, p=100(NLP??); synonyms
        doids = set()
        doid_exact_results = search_doid(query, False, doids)
        print doids

        ## synonyms NLP
        
        #synonyms = NLP(query)

        ## new thread for NLP

        ## tree search on vertices (returned + synonyms)
        
        ## sleep ?

        th_get_q.join()
        print query_results

        th_get_wiki.join()
        print wiki_results

        th_get_pubmed.join()
        print pubmed_results

        
        #prob_vec = NLP2(query, synonyms)
        
        ## final answer
        ## draw graph

        doid = None
    
    graph = None
    string = ("Query: {:}\n".format(query) + 
              "{:}\n".format(doid) + 
              "Name: {:}\n".format(doid2name[doid]) + 
              "Confidence: {:}\%\n".format(confidence))
    return string, graph

In [29]:
# Smoke-test of find_answer; as written it crashes for non-exact matches
# (KeyError: None — see the traceback in this cell's output below)
print find_answer(query)[0]


set([u'DOID:8398', u'DOID:11829', u'DOID:90', u'DOID:0060844', u'HP:0005237', u'DOID:9799', u'HP:0001379', u'DOID:1289', u'DOID:10120'])
('degenerative disc disease', 95)
('eye degenerative disease', 91)
('disease', 100)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-29-53857e14ee66> in <module>()
----> 1 print find_answer(query)[0]

<ipython-input-28-2485d8c88b6b> in find_answer(query)
    106     string = ("Query: {:}\n".format(query) + 
    107               "{:}\n".format(doid) +
--> 108               "Name: {:}\n".format(doid2name[doid]) +
    109               "Confidence: {:}\%\n".format(confidence))
    110     return string, graph

KeyError: None

In [ ]:

Read ontology


In [ ]:
# Load the DOID ontology and build name<->id lookup tables
ontology = get_ontology('../data/doid.obo')
name2doid = {term.name: term.id for term in ontology.get_terms()}
doid2name = {term.id: term.name for term in ontology.get_terms()}

In [ ]:


In [3]:
import numpy as np
import re

Wiki links from obo descriptions


In [3]:
# Collect the Wikipedia URLs referenced by the ontology's term definitions
lst = wiki.get_links_from_ontology(ontology)
print r'example:{:}'.format(repr(lst[10]))


example:'http://en.wikipedia.org/wiki/Abetalipoproteinemia'

urllib2 to read page in html


In [4]:
# Fetch raw HTML for one ontology-linked Wikipedia page; preview first 1000 chars
page = wiki.get_html(lst[101])
page[:1000]


Out[4]:
'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Ameloblastoma - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Ameloblastoma","wgTitle":"Ameloblastoma","wgCurRevisionId":766170591,"wgRevisionId":766170591,"wgArticleId":2020081,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with dead external links","Articles with dead external links from February 2017","Articles with contributors link","Articles needing additional references from March 2009","All articles needing additional references","Commons category with local link same as on Wikidata","Odontogenic tumors"],"wgBreakFrames":fals'

Fuzzy logic


In [17]:
# Fuzzy-match a free-text phrase against all sorted ontology names
string = "ventricular arrhythmia"
names = np.sort(name2doid.keys())  # Py2: dict.keys() returns a list
print fuzzy_process.extractOne(string, names, scorer=fuzz.token_set_ratio)


('arrhythmogenic right ventricular cardiomyopathy', 67)

In [139]:
# partial_ratio lets a short ontology name match inside a long article title
string = "Complete remission of hairy cell leukemia variant (HCL-v) complicated by red cell aplasia post treatment with rituximab."
print fuzzy_process.extractOne(string, names, scorer=fuzz.partial_ratio)


('hairy cell leukemia', 100)

In [ ]:

Wikipedia search engine: headers


In [18]:
# Top Wikipedia page titles returned for the query
query = "ventricular arrhythmia"

top = wiki.get_top_headers(query)
top


Out[18]:
[u'Cardiac arrhythmia',
 u'Re-entry ventricular arrhythmia',
 u'Ventricular fibrillation']

In [20]:
# Fuzzy-match each returned Wikipedia header against the ontology names
for header in top:
    results = fuzzy_process.extractOne(header, names, scorer=fuzz.token_set_ratio)
    print results


('cardiac arrest', 75)
('arrhythmogenic right ventricular cardiomyopathy', 67)
('atrial fibrillation', 79)

In [ ]:


In [59]:
# NOTE(review): `wikipedia` is not imported in any visible cell — this relies
# on kernel state from a cell not shown here; confirm the import exists.
page = wikipedia.WikipediaPage(title='Cell_proliferation')
page.summary


Out[59]:
u'The term cell growth is used in the contexts of biological cell development and cell division (reproduction). When used in the context of cell division, it refers to growth of cell populations, where a cell, known as the "mother cell", grows and divides to produce two "daughter cells" (M phase). When used in the context of cell development, the term refers to increase in cytoplasmic and organelle volume (G1 phase), as well as increase in genetic material (G2 phase) following the replication during S phase.'

In [ ]:

# Inspection helper: ontology names longer than three space-separated words
[name for name in names if len(re.split(' ', name)) > 3]

PubMed


In [49]:
# Query PubMed, then fuzzy-match each returned title against ontology names
query = 'hcl-v'
titles = pubmed.get(query)
titles_len = [len(title) for title in titles]  # reused by the stats cell below
for i, string in enumerate(titles):
    print("%d) %s" % (i+1, string))
    print fuzzy_process.extractOne(string, names, scorer=fuzz.partial_ratio)
    print


1) Complete remission of hairy cell leukemia variant (HCL-v) complicated by red cell aplasia post treatment with rituximab.
('hairy cell leukemia', 100)

def find_synonym(s_ref, s):
    """Recover the spelled-out form of abbreviation `s` from `s_ref`.

    Looks for the literal "(s)" in `s_ref`; the expansion is assumed to
    start at the n-th uppercase letter counting back from the "(", where
    n is the number of uppercase letters in `s` (e.g. 'WPW' -> the three
    capitalized words of 'Wolff-Parkinson-White syndrome').

    Returns the expansion substring, or None when "(s)" does not occur.
    (The original cell had this code flattened onto broken lines;
    reconstructed per the demo call and its printed output.)
    """
    last = s_ref.find('(' + s + ')')
    if last == -1:
        return None

    n_upper = len(''.join([c for c in s if c.isupper()]))
    # position of the n_upper-th uppercase character before the "("
    first = [(i, c) for i, c in enumerate(s_ref[:last]) if c.isupper()][-n_upper][0]
    return s_ref[first:last - 1]

# expected output: 'Wolff-Parkinson-White syndrome'
print find_synonym('Wolff-Parkinson-White syndrome (WPW) and athletes: Darwin at play?', 'WPW')

synonyms


In [27]:
import utils

# utils.find_synonym expands an abbreviation from the text preceding "(ABBR)";
# outputs below show normalized (lowercased) expansions
print utils.find_synonym('Wolff-Parkinson-White syndrome (WPW) and athletes: Darwin at play?', 'WPW')
print utils.find_synonym('Complete remission of hairy cell leukemia variant (HCL-v)...', 'hcl-v')


wolff parkinson white 
hairy cell leukemia variant 

Asymmetric distance


In [29]:
# Asymmetric string distance demo — semantics defined in utils (not shown here)
s_ref = 'artery disease'
s = 'nonartery'
print utils.assym_dist(s, s_ref)


7

Length statistics


In [30]:
# Compare typical lengths: ontology term names (~28 chars) vs PubMed titles (~120)
print 'Mean term name length:', np.mean([len(term.name) for term in ontology.get_terms()])
print 'Mean article title length:', np.mean(titles_len)


Mean term name length: 27.5502935797
Mean article title length: 120.0

Unique words


In [31]:
# Tokenize every term name on spaces/hyphens, drop empties, deduplicate,
# and keep only tokens of at least 4 characters; preview the first ten.
tokens = []
for term in ontology.get_terms():
    tokens.extend(re.split(' |-', term.name))
words = [w for w in np.unique([t for t in tokens if len(t) > 0]) if len(w) >= 4]
words[:10]


Out[31]:
['(+)ssrna',
 '(1p)',
 '(atp',
 '(perianal)',
 ')ssrna',
 '1.4mb',
 '10q23',
 '13q14',
 '14q11',
 '15q11.2']

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: