In [1]:
import sys
sys.path.append('./../')
%load_ext autoreload
%autoreload 2

In [2]:
from ontology import get_ontology

# Parse the ontology file, then index its terms in both directions
# (name -> id and id -> name) while walking the term list once.
ontology = get_ontology('../data/doid.obo')
name2doid, doid2name = {}, {}
for term in ontology.get_terms():
    name2doid[term.name] = term.id
    doid2name[term.id] = term.name

In [ ]:


In [3]:
import numpy as np
import re

Wiki links from obo descriptions


In [3]:
import wiki

# Collect the links that wiki.get_links_from_ontology returns for the ontology
# and show one of them (index 10) as a sample.
lst = wiki.get_links_from_ontology(ontology)
print('example:{!r}'.format(lst[10]))


example:'http://en.wikipedia.org/wiki/Abetalipoproteinemia'

Use urllib2 to read a page as HTML


In [4]:
# Fetch the HTML of one of the collected links (index 101) through
# wiki.get_html and preview only its first 1000 characters.
page = wiki.get_html(lst[101])
page[:1000]


Out[4]:
'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Ameloblastoma - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Ameloblastoma","wgTitle":"Ameloblastoma","wgCurRevisionId":766170591,"wgRevisionId":766170591,"wgArticleId":2020081,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with dead external links","Articles with dead external links from February 2017","Articles with contributors link","Articles needing additional references from March 2009","All articles needing additional references","Commons category with local link same as on Wikidata","Odontogenic tumors"],"wgBreakFrames":fals'

Fuzzy string matching


In [6]:
import fuzzywuzzy.process as fuzzy_process
from fuzzywuzzy import fuzz

In [17]:
string = "ventricular arrhythmia"
# Sorted array of every ontology term name; later cells and the worker
# functions reuse `names` as the candidate list for fuzzy matching.
names = np.sort(name2doid.keys())
# Best single candidate for the query under the token-set ratio scorer.
print fuzzy_process.extractOne(string, names, scorer=fuzz.token_set_ratio)


('arrhythmogenic right ventricular cardiomyopathy', 67)

In [139]:
# A full article title used as the query: with the partial_ratio scorer the
# term name embedded in the long title is still found (see the output below).
string = "Complete remission of hairy cell leukemia variant (HCL-v) complicated by red cell aplasia post treatment with rituximab."
print fuzzy_process.extractOne(string, names, scorer=fuzz.partial_ratio)


('hairy cell leukemia', 100)

In [ ]:

Wikipedia search engine: headers


In [18]:
query = "ventricular arrhythmia"

# Ask the wiki helper for the top article headers matching the query.
top = wiki.get_top_headers(query)
top


Out[18]:
[u'Cardiac arrhythmia',
 u'Re-entry ventricular arrhythmia',
 u'Ventricular fibrillation']

In [20]:
# Fuzzy-match every returned header against the ontology term names and
# print the best (name, score) pair for each.
for header in top:
    results = fuzzy_process.extractOne(header, names, scorer=fuzz.token_set_ratio)
    print results


('cardiac arrest', 75)
('arrhythmogenic right ventricular cardiomyopathy', 67)
('atrial fibrillation', 79)

In [ ]:


In [59]:
# Imported here because no earlier cell brings `wikipedia` into scope; without
# it this cell fails with a NameError after Restart Kernel -> Run All.
import wikipedia

# Look the article up by title and display its summary text.
page = wikipedia.WikipediaPage(title='Cell_proliferation')
page.summary


Out[59]:
u'The term cell growth is used in the contexts of biological cell development and cell division (reproduction). When used in the context of cell division, it refers to growth of cell populations, where a cell, known as the "mother cell", grows and divides to produce two "daughter cells" (M phase). When used in the context of cell development, the term refers to increase in cytoplasmic and organelle volume (G1 phase), as well as increase in genetic material (G2 phase) following the replication during S phase.'

In [ ]:

# Term names made of more than three space-separated pieces, i.e. names that
# contain at least three spaces.
[name for name in names if name.count(' ') >= 3]

PubMed


In [49]:
import pubmed

query = 'hcl-v'
# Titles returned by pubmed.get for the query (see the printed output below).
titles = pubmed.get(query)
# Lengths are kept because the "Length statistics" cell averages them.
titles_len = [len(title) for title in titles] 
for i, string in enumerate(titles):
    # 1-based rank followed by the title itself.
    print("%d) %s" % (i+1, string))
    # Closest ontology term name for this title.
    print fuzzy_process.extractOne(string, names, scorer=fuzz.partial_ratio)
    # Blank line between entries.
    print


1) Complete remission of hairy cell leukemia variant (HCL-v) complicated by red cell aplasia post treatment with rituximab.
('hairy cell leukemia', 100)

def find_synonym(s_ref, s):
    """Return the phrase that the parenthesised abbreviation ``s`` stands for.

    Searches ``s_ref`` for ``(s)`` and pairs the capitals of ``s`` with the
    last capitals that precede the parenthesis; the phrase starts at the
    capital paired with the first capital of ``s``.

    Args:
        s_ref: Text that may contain the abbreviation in parentheses.
        s: Abbreviation to look up; the search is case-sensitive.

    Returns:
        The slice of ``s_ref`` from that capital up to (not including) the
        single character in front of ``(``, or None when ``(s)`` is absent,
        when ``s`` contains no capitals, or when fewer capitals precede the
        parenthesis than ``s`` contains.
    """
    last = s_ref.find('(' + s + ')')
    if last == -1:
        return None

    n_upper = sum(1 for c in s if c.isupper())
    upper_positions = [i for i, c in enumerate(s_ref[:last]) if c.isupper()]

    # Guard the negative index below: with n_upper == 0 it would become [-0]
    # (the first capital of the whole text), and with too few capitals in the
    # prefix it would raise IndexError.
    if n_upper == 0 or n_upper > len(upper_positions):
        return None

    first = upper_positions[-n_upper]
    # last - 1 drops the one separator character in front of '('.
    return s_ref[first:last - 1]


print(find_synonym('Wolff-Parkinson-White syndrome (WPW) and athletes: Darwin at play?', 'WPW'))

synonyms


In [27]:
import utils

# Same kind of lookup through the helper in utils; the second call passes the
# abbreviation in lower case although the title spells it "HCL-v".
print utils.find_synonym('Wolff-Parkinson-White syndrome (WPW) and athletes: Darwin at play?', 'WPW')
print utils.find_synonym('Complete remission of hairy cell leukemia variant (HCL-v)...', 'hcl-v')


wolff parkinson white 
hairy cell leukemia variant 

Asymmetric distance


In [29]:
s_ref = 'artery disease'
s = 'nonartery'
# Distance between the candidate `s` and the reference `s_ref` as computed by
# utils.assym_dist (presumably order-dependent, per the section title — verify
# against utils).
print utils.assym_dist(s, s_ref)


7

Length statistics


In [30]:
# Compare typical string lengths: ontology term names versus the article
# titles; `titles_len` is defined in the PubMed cell, which must run first.
print 'Mean term name length:', np.mean([len(term.name) for term in ontology.get_terms()])
print 'Mean article title length:', np.mean(titles_len)


Mean term name length: 27.5502935797
Mean article title length: 120.0

Unique words


In [31]:
# Break every term name on spaces and hyphens, dropping empty pieces.
tokens = []
for term in ontology.get_terms():
    tokens.extend(piece for piece in re.split(' |-', term.name) if piece)

# Sorted distinct tokens, keeping only those at least four characters long.
words = [w for w in np.unique(tokens) if len(w) >= 4]
words[:10]


Out[31]:
['(+)ssrna',
 '(1p)',
 '(atp',
 '(perianal)',
 ')ssrna',
 '1.4mb',
 '10q23',
 '13q14',
 '14q11',
 '15q11.2']

In [ ]:


In [ ]:

Threading


In [4]:
from threading import Thread
from time import sleep

from ontology import get_ontology

# Result slot written by fn_get_q; find_answer prints it after joining the thread.
query_results = None
def fn_get_q(query):
    """Fuzzy-match the raw query against the ontology term names.

    Stores the best match returned by ``fuzzy_process.extractOne`` (scored
    with ``fuzz.ratio`` over the notebook-level ``names``) in the global
    ``query_results``. Always returns True.
    """
    global query_results
    query_results = fuzzy_process.extractOne(query, names, scorer=fuzz.ratio)
    return True

# Result slot written by fn_get_wiki; find_answer prints it after joining the thread.
wiki_results = None
def fn_get_wiki(query):
    """Fuzzy-match the top wiki header for ``query`` against the term names.

    Args:
        query: Search string passed to ``wiki.get_top_headers``.

    Returns:
        True when a header was found and its best match was stored in the
        global ``wiki_results``; False when the search returned no headers,
        in which case ``wiki_results`` is left as None.
    """
    global wiki_results
    # Clear first so a failed lookup never leaves a previous query's match.
    wiki_results = None

    headers = wiki.get_top_headers(query, 1)
    if headers is None or len(headers) == 0:
        # Indexing [0] on an empty result would raise inside the worker
        # thread and silently kill it.
        return False

    wiki_results = fuzzy_process.extractOne(headers[0], names, scorer=fuzz.ratio)
    return True

# Result slot written by fn_get_pubmed; find_answer prints it after joining the thread.
pubmed_results = None
def fn_get_pubmed(query):
    """Fuzzy-match the top PubMed hit for ``query`` against the term names.

    Args:
        query: Search string passed to ``pubmed.get`` (with ``topK=1``).

    Returns:
        True when a title was returned and its best match (scored with
        ``fuzz.partial_ratio``) was stored in the global ``pubmed_results``;
        False when nothing was returned, in which case ``pubmed_results``
        is left as None.
    """
    global pubmed_results
    # Clear first so the "no result" path does not expose a stale match from
    # an earlier query.
    pubmed_results = None

    titles = pubmed.get(query, topK=1)
    # Treat an empty result like None: titles[0] would otherwise raise
    # IndexError inside the worker thread.
    if titles is None or len(titles) == 0:
        return False

    title = titles[0]
    print(title)
    pubmed_results = fuzzy_process.extractOne(title, names, scorer=fuzz.partial_ratio)
    return True

# --- main ---
## from bot
query = 'valve disease'

def find_answer(query):
    """Resolve a free-text query to an ontology term id.

    The query is lower-cased and looked up among the ontology term names.
    On an exact hit the term id is returned. Otherwise three worker threads
    (direct fuzzy match, wiki header match, PubMed title match) are run and
    their results are printed; no id is chosen from them yet.

    Args:
        query: Free-text name to look up.

    Returns:
        A ``(doid, graph)`` tuple. ``doid`` is the matching term id for an
        exact name match and None otherwise; ``graph`` is always None for now.
    """
    global query_results, wiki_results, pubmed_results

    query = query.lower()

    # load ontology
    # NOTE(review): the file is re-parsed on every call — consider caching.
    ontology = get_ontology('../data/doid.obo')
    name2doid = {term.name: term.id for term in ontology.get_terms()}

    ## exact match
    if query in name2doid:
        doid = name2doid[query]
    else:
        # exact match -- no
        # Reset the shared result slots so that a worker that dies or finds
        # nothing cannot cause a previous query's match to be printed below.
        query_results = wiki_results = pubmed_results = None

        th_get_q = Thread(target=fn_get_q, args=(query,))
        th_get_wiki = Thread(target=fn_get_wiki, args=(query,))
        th_get_pubmed = Thread(target=fn_get_pubmed, args=(query,))

        for worker in (th_get_q, th_get_wiki, th_get_pubmed):
            worker.start()

        ## search engine query --> vertices, p=100(NLP??); synonyms

        ## new thread for synonyms???

        ## synonyms NLP

        ## new thread for NLP

        ## tree search on vertices (returned + synonyms)

        ## sleep ?

        # Join in a fixed order so the three results are always printed in
        # the same sequence regardless of which worker finishes first.
        th_get_q.join()
        print(query_results)

        th_get_wiki.join()
        print(wiki_results)

        th_get_pubmed.join()
        print(pubmed_results)

        ## final answer
        ## draw graph

        doid = None

    graph = None
    return doid, graph

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: