In [ ]:
import cPickle as pickle, requests, codecs, time
from bs4 import BeautifulSoup
from bs4.element import Tag

from sqlalchemy import create_engine
from connect import mysqlusername, mysqlpassword, mysqlserver, mysqldbname
from collections import defaultdict

#from sqlalchemy.orm import sessionmaker
#from db_models import Base, ClinicalStudy, Facilities, Sponsors

from sqlalchemy.sql import func, select, and_, or_, not_, desc, bindparam
from db_tables import metadata, ConditionDescription

In [ ]:
# Connect to the local MySQL instance and ensure all tables exist.
# NOTE(review): this shadows the `mysqlserver` imported from connect above --
# confirm overriding to localhost is intentional for this run.
mysqlserver = 'localhost'
engine = create_engine('mysql://%s:%s@%s/%s' % (mysqlusername, mysqlpassword, mysqlserver, mysqldbname))
conn = engine.connect()
# Create any missing tables defined on `metadata` (no-op for existing tables).
metadata.create_all(engine)

In [ ]:
need_desc = conn.execute(ConditionDescription.select().where(ConditionDescription.c.description == '')).fetchall()

In [ ]:
to_get = {t[1]: t[0] for t in need_desc}

In [ ]:
# Lookup URL templates for the sources tried below ('%s' is the term):
enc = 'http://encyclopedia.thefreedictionary.com/%s'  # TheFreeDictionary encyclopedia
dic = 'http://medical-dictionary.thefreedictionary.com/%s'  # TheFreeDictionary medical dictionary
nlm = 'http://wsearch.nlm.nih.gov/ws/query?db=healthTopics&term=%s'  # NLM health-topics search (XML)
wiki_rev = 'http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=%s&prop=revisions&rvprop=content'  # page wikitext (used to detect redirects)
wiki_ext = 'http://en.wikipedia.org/w/api.php?format=xml&action=query&prop=extracts&exintro=&titles=%s'  # intro extract

In [ ]:
# Accumulators shared by the scraping passes below:
descriptions = {}  # term -> (description text/soup, source label)
not_found = []     # terms the current source had no entry for
do_over = []       # terms that hit a transient error and should be retried

In [ ]:
# Work list of terms to look up (Python 2: .keys() returns a list).
to_do = to_get.keys()
print len(to_do)

In [ ]:
for i in range(len(to_do)): 
    
    term = to_do[i]
    
    if term not in descriptions and term not in not_found:
        
        r = requests.get(nlm % term)
        if r.status_code == 429:
            sleep(310)
            r = requests.get(nlm % term)
        soup = BeautifulSoup(r.text)
        
        if int(soup.nlmsearchresult.count.contents[0]) > 0:
            summary = BeautifulSoup(soup.list.document.find('content', {'name': 'FullSummary'}).contents[0])
            descriptions[term] = (summary,'NLM')
            time.sleep(1)
        else:
            not_found.append(term)
    
    if i % 10 == 0: print '**************** %d ****************' % i

In [ ]:
# Tally after the NLM pass: hits, misses, and expected hits.
print len(descriptions)
print len(not_found)
print len(to_do) - len(not_found)

In [ ]:
# Retry the misses from the previous pass; start a fresh miss list.
to_do = list(not_found)
not_found = []

In [ ]:
for i in range(len(to_do)):
    
    term = to_do[i]
    
    if term not in descriptions and term not in not_found:
        
        r = requests.get(wiki_rev % term)
        soup = BeautifulSoup(r.text)
        
        if soup.page.revisions:
            for rev in soup.page.revisions:
                txt = rev.contents[0]
                if '#REDIRECT' in txt:
                    term = txt[12:txt.index(']]')]
            
            r = requests.get(wiki_ext % term)
            soup = BeautifulSoup(r.text)
            
            if soup.page.find('extract'):
                descriptions[to_do[i]] = (soup.page.find('extract').get_text().replace('\n',''),'WIKIPEDIA')
                time.sleep(2)
                continue
        
        not_found.append(to_do[i])
    
    if i % 10 == 0: print '**************** %d ****************' % i

In [ ]:
# redirects = {}
# for i in range(len(to_do)):
    
#     term = to_do[i]
    
#     if term not in descriptions and term not in not_found:
        
#         r = requests.get(wiki_rev % term)
#         soup = BeautifulSoup(r.text)
        
#         if soup.page.revisions:
#             for rev in soup.page.revisions:
#                 txt = rev.contents[0]
#                 if '#REDIRECT' in txt:
#                     term = txt[12:txt.index(']]')]
#                     redirects[term] = to_do[i]
        
#     if i % 10 == 0: print '**************** %d ****************' % i

In [ ]:
#pickle.dump(descriptions,open('../data/condition_description_extra.pkl','wb'))
# BUG FIX: `new_wiki` was never defined in this notebook (it leaked from a
# deleted cell), so a fresh run raised NameError. Rebuild it here as the
# Wikipedia-sourced subset of `descriptions`, which is what the later
# "wikionly" pickle load expects.
new_wiki = {t: descriptions[t] for t in descriptions if descriptions[t][1] == 'WIKIPEDIA'}
pickle.dump(new_wiki,open('../data/condition_description_wikionly.pkl','wb'))

In [ ]:
from IPython.display import HTML, Javascript, display
import random
for c in random.sample(descriptions,100):
    print c
    print display(HTML(unicode(descriptions[c][0])))
    print 
    print

Update the database with the scraped condition descriptions


In [ ]:
# Reload previously scraped descriptions from disk.
# NOTE: pickle.load on files you did not create yourself can execute
# arbitrary code -- these are assumed to be locally produced above.
desc_ext = pickle.load(open('../data/condition_description_extra.pkl','rb'))
desc_wiki = pickle.load(open('../data/condition_description_wikionly.pkl','rb'))

In [ ]:
# Sanity check: sizes of the two pickles and of their union of terms.
print len(desc_ext)
print len(desc_wiki)
print len(set(desc_ext) | set(desc_wiki))

In [ ]:
# Parameterized UPDATE: set the description on the row whose mesh_term
# matches the bound 'm' parameter.
match_term = ConditionDescription.c.mesh_term == bindparam('m')
stmt = ConditionDescription.update().where(match_term).values(description=bindparam('descrip'))

In [ ]:
# Build the executemany parameter list for the Wikipedia descriptions,
# dropping empty paragraphs and appending a source attribution.
to_up = []
for term in desc_wiki:
    text = desc_wiki[term][0].replace('<p></p>','')
    to_up.append({'m': term,
                  'descrip': '%s<p>Source: Wikipedia</p>' % text})

In [ ]:
conn.execute(stmt, to_up)

In [ ]:
# Same parameterized UPDATE as above, re-created so this section of the
# notebook can be run on its own.
stmt = ConditionDescription.update().\
            where(ConditionDescription.c.mesh_term == bindparam('m')).\
            values(description=bindparam('descrip'))

In [ ]:
# Build the executemany parameter list for the NLM descriptions, skipping
# any term already covered by a Wikipedia description.
to_up = []
for term in desc_ext:
    if term in desc_wiki or desc_ext[term][1] != 'NLM':
        continue
    clean = unicode(desc_ext[term][0]).replace('<p></p>','')
    to_up.append({'m': term, 'descrip': clean})

In [ ]:
conn.execute(stmt, to_up)

In [ ]:
conn.close()

Fallback: scrape TheFreeDictionary encyclopedia and medical dictionary for the remaining terms


In [ ]:
for i in range(len(to_do)): 
    
    term = to_do[i]
    
    if term not in descriptions and term not in not_found:
        
        r = requests.get(enc % term)
        soup = BeautifulSoup(r.text)
        if soup.find(id="mainTxt"):
            count_tags = 0
            out_str = ''

            for l in list(soup.find(id="mainTxt").children):

                if type(l) is Tag: 
                    if l.name == 'table':
                        if count_tags == 0:
                            pass
                        else:
                            break
                    elif l.name == 'h2':
                        break
                    elif l.name == 'p':
                        if l.sup: 
                            sup = l.sup.extract()
                        this_str = l.get_text().strip()
                        if this_str: 
                            out_str += '<p>%s</p>' % this_str
                    elif l.name == 'ul':
                        out_str += '<ul>'
                        for li in l.findAll('li'):
                            if li.sup: 
                                sup = li.sup.extract()
                            this_str = li.get_text().strip()
                            if this_str: 
                                out_str += '<li>%s</li>' % this_str
                        out_str += '</ul>'

                    count_tags += 1

            descriptions[term] = (out_str,'ENCYCLOPEDIA')
            time.sleep(1)

        else:
            r = requests.get(dic % term)
            soup = BeautifulSoup(r.text)
            if soup.find("div", {'class': "pseg"}):
                out_str = ''
                defn = soup.find("div", {'class': "pseg"}).find("div").get_text().strip()
                out_str += '<p>%s</p>' % defn
                descriptions[term] = (out_str,'DICTIONARY')
                time.sleep(1)
            elif r.status_code == 200:
                print 'COULD NOT FIND %s' % term
                not_found.append(term)
            elif r.status_code == 403:
                print 'STOPPING FOR NOW'
                break
            else:
                print 'ERROR CODE: %d' % r.status_code
                do_over.append(term)
                time.sleep(10)
    
    if i % 10 == 0: print '**************** %d ****************' % i