In [ ]:
import cPickle as pickle, requests, codecs, time
from bs4 import BeautifulSoup
from bs4.element import Tag
from sqlalchemy import create_engine
from connect import mysqlusername, mysqlpassword, mysqlserver, mysqldbname
from collections import defaultdict
#from sqlalchemy.orm import sessionmaker
#from db_models import Base, ClinicalStudy, Facilities, Sponsors
from sqlalchemy.sql import func, select, and_, or_, not_, desc, bindparam
from db_tables import metadata, ConditionDescription
In [ ]:
# Connect to the local MySQL database and ensure all tables exist.
mysqlserver = 'localhost'  # override the imported server setting for a local run
engine = create_engine('mysql://{0}:{1}@{2}/{3}'.format(
    mysqlusername, mysqlpassword, mysqlserver, mysqldbname))
conn = engine.connect()
metadata.create_all(engine)
In [ ]:
# Pull every condition row whose description column is still empty.
missing_desc_query = ConditionDescription.select().where(
    ConditionDescription.c.description == '')
need_desc = conn.execute(missing_desc_query).fetchall()
In [ ]:
# Map mesh term (column 1) -> row id (column 0) for rows needing a description.
to_get = dict((row[1], row[0]) for row in need_desc)
In [ ]:
# URL templates for each description source; each takes one %s-interpolated term.
enc = 'http://encyclopedia.thefreedictionary.com/%s'  # TheFreeDictionary encyclopedia page
dic = 'http://medical-dictionary.thefreedictionary.com/%s'  # TheFreeDictionary medical dictionary page
nlm = 'http://wsearch.nlm.nih.gov/ws/query?db=healthTopics&term=%s'  # NLM health-topics search (XML)
wiki_rev = 'http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=%s&prop=revisions&rvprop=content'  # raw wikitext of latest revision (used to detect redirects)
wiki_ext = 'http://en.wikipedia.org/w/api.php?format=xml&action=query&prop=extracts&exintro=&titles=%s'  # intro-section extract
In [ ]:
descriptions = {}  # term -> (description text/soup, source tag e.g. 'NLM')
not_found = []     # terms the current source had no entry for
do_over = []       # terms that hit a transient HTTP error and should be retried
In [ ]:
# Work queue: every term that still lacks a description.
to_do = list(to_get)
print(len(to_do))
In [ ]:
# First pass: query the NLM health-topics search service for each term and
# keep the 'FullSummary' field of the first result document.
# Skips terms already resolved or already marked missing, so the cell can be
# re-run after an interruption.
for i, term in enumerate(to_do):
    if term not in descriptions and term not in not_found:
        r = requests.get(nlm % term)
        if r.status_code == 429:
            # Rate-limited: back off, then retry once.
            # BUG FIX: the original called bare sleep(310) -- a NameError,
            # since only the `time` module is imported.
            time.sleep(310)
            r = requests.get(nlm % term)
        soup = BeautifulSoup(r.text)
        if int(soup.nlmsearchresult.count.contents[0]) > 0:
            summary = BeautifulSoup(soup.list.document.find('content', {'name': 'FullSummary'}).contents[0])
            descriptions[term] = (summary, 'NLM')
            time.sleep(1)  # be polite between successful requests
        else:
            not_found.append(term)
    if i % 10 == 0:
        print('**************** %d ****************' % i)
In [ ]:
# Progress summary for the NLM pass: hits, misses, and queue minus misses.
print(len(descriptions))
print(len(not_found))
print(len(to_do) - len(not_found))
In [ ]:
# Requeue only the misses for the next source, and reset the miss list.
to_do, not_found = list(not_found), []
In [ ]:
# Second pass: try Wikipedia for the terms NLM could not resolve.
# Fetch the raw wikitext of the latest revision; if the page is a redirect,
# replace `term` with the redirect target, then request the plain-text intro
# extract for the (possibly redirected) title.
# NOTE(review): indentation was lost in this export; the nesting below is a
# reconstruction -- confirm against the original notebook.
for i in range(len(to_do)):
    term = to_do[i]
    if term not in descriptions and term not in not_found:
        r = requests.get(wiki_rev % term)
        soup = BeautifulSoup(r.text)
        if soup.page.revisions:
            for rev in soup.page.revisions:
                txt = rev.contents[0]
                if '#REDIRECT' in txt:
                    # Slice the target title out of "#REDIRECT [[Target]]".
                    # The fixed offset 12 assumes exactly that prefix --
                    # TODO confirm it holds for all redirect spellings.
                    term = txt[12:txt.index(']]')]
        r = requests.get(wiki_ext % term)
        soup = BeautifulSoup(r.text)
        if soup.page.find('extract'):
            # Key by the ORIGINAL queried term (to_do[i]), not the redirect target.
            descriptions[to_do[i]] = (soup.page.find('extract').get_text().replace('\n',''),'WIKIPEDIA')
            time.sleep(2)
            continue
        not_found.append(to_do[i])
    if i % 10 == 0: print '**************** %d ****************' % i
In [ ]:
# redirects = {}
# for i in range(len(to_do)):
# term = to_do[i]
# if term not in descriptions and term not in not_found:
# r = requests.get(wiki_rev % term)
# soup = BeautifulSoup(r.text)
# if soup.page.revisions:
# for rev in soup.page.revisions:
# txt = rev.contents[0]
# if '#REDIRECT' in txt:
# term = txt[12:txt.index(']]')]
# redirects[term] = to_do[i]
# if i % 10 == 0: print '**************** %d ****************' % i
In [ ]:
#pickle.dump(descriptions,open('../data/condition_description_extra.pkl','wb'))
# Persist the Wikipedia-sourced descriptions so the slow crawl need not be
# repeated.
# BUG FIX: `new_wiki` was never defined anywhere in this notebook (NameError
# as written).  Reconstruct the Wikipedia-only subset from `descriptions`
# via its 'WIKIPEDIA' source tag -- TODO confirm this matches the variable
# originally built in a deleted cell.
new_wiki = dict((t, d) for t, d in descriptions.items() if d[1] == 'WIKIPEDIA')
with open('../data/condition_description_wikionly.pkl', 'wb') as fh:
    pickle.dump(new_wiki, fh)
In [ ]:
from IPython.display import HTML, Javascript, display
import random
# Spot-check 100 random scraped descriptions by rendering them as HTML.
# BUG FIX: random.sample() requires a sequence or set; passing the dict
# itself fails (it is indexed by integer positions), so sample the keys.
for c in random.sample(list(descriptions), 100):
    print(c)
    # display() renders the HTML and returns None; the trailing None print
    # matches the original cell's output.
    print(display(HTML(unicode(descriptions[c][0]))))
    print('')
    print('')
In [ ]:
# Reload the previously scraped description sets from disk.
# Use context managers so the handles are closed deterministically (the
# originals opened files without ever closing them).
with open('../data/condition_description_extra.pkl', 'rb') as fh:
    desc_ext = pickle.load(fh)
with open('../data/condition_description_wikionly.pkl', 'rb') as fh:
    desc_wiki = pickle.load(fh)
In [ ]:
# Sizes of the two description sets and of their union of terms.
print(len(desc_ext))
print(len(desc_wiki))
print(len(set(desc_ext) | set(desc_wiki)))
In [ ]:
# Prepared UPDATE: set `description` on the row whose mesh_term matches the
# bound parameter "m"; executed later with a list of parameter dicts.
stmt = (
    ConditionDescription.update()
    .where(ConditionDescription.c.mesh_term == bindparam('m'))
    .values(description=bindparam('descrip'))
)
In [ ]:
# Parameter dicts for executemany: one per Wikipedia-sourced term, with a
# provenance footer appended and empty paragraphs stripped.
to_up = []
for term in desc_wiki:
    body = desc_wiki[term][0].replace('<p></p>', '')
    to_up.append({'m': term,
                  'descrip': '%s<p>Source: Wikipedia</p>' % body})
In [ ]:
# executemany: apply the prepared UPDATE once per parameter dict in to_up.
conn.execute(stmt, to_up)
In [ ]:
# Rebuild the same prepared UPDATE (description keyed by mesh_term) for the
# next batch of parameter dicts.
stmt = (
    ConditionDescription.update()
    .where(ConditionDescription.c.mesh_term == bindparam('m'))
    .values(description=bindparam('descrip'))
)
In [ ]:
# Parameter dicts for the NLM-sourced terms, skipping anything Wikipedia
# already covered and anything not tagged 'NLM'.
to_up = []
for term in desc_ext:
    if term in desc_wiki or desc_ext[term][1] != 'NLM':
        continue
    to_up.append({'m': term,
                  'descrip': unicode(desc_ext[term][0]).replace('<p></p>', '')})
In [ ]:
# executemany: apply the prepared UPDATE once per parameter dict in to_up.
conn.execute(stmt, to_up)
In [ ]:
# Release the database connection back to the engine's pool.
conn.close()
In [ ]:
# Third pass: TheFreeDictionary encyclopedia, falling back to its medical
# dictionary, for terms still unresolved.
# NOTE(review): indentation was lost in this export; the nesting below is a
# reconstruction -- confirm against the original notebook.
for i in range(len(to_do)):
    term = to_do[i]
    if term not in descriptions and term not in not_found:
        r = requests.get(enc % term)
        soup = BeautifulSoup(r.text)
        if soup.find(id="mainTxt"):
            # Encyclopedia hit: rebuild the lead section as HTML, stopping at
            # the first h2 heading or at any table that is not the very first
            # child element.
            count_tags = 0
            out_str = ''
            for l in list(soup.find(id="mainTxt").children):
                if type(l) is Tag:
                    if l.name == 'table':
                        if count_tags == 0:
                            pass  # leading table (infobox-like): skip, keep scanning
                        else:
                            break  # a later table marks the end of the lead
                    elif l.name == 'h2':
                        break  # first section heading ends the lead
                    elif l.name == 'p':
                        if l.sup:
                            sup = l.sup.extract()  # drop footnote/citation markers
                        this_str = l.get_text().strip()
                        if this_str:
                            out_str += '<p>%s</p>' % this_str
                    elif l.name == 'ul':
                        out_str += '<ul>'
                        for li in l.findAll('li'):
                            if li.sup:
                                sup = li.sup.extract()  # drop footnote/citation markers
                            this_str = li.get_text().strip()
                            if this_str:
                                out_str += '<li>%s</li>' % this_str
                        out_str += '</ul>'
                    count_tags += 1
            descriptions[term] = (out_str,'ENCYCLOPEDIA')
            time.sleep(1)
        else:
            # No encyclopedia entry: try the medical dictionary instead.
            r = requests.get(dic % term)
            soup = BeautifulSoup(r.text)
            if soup.find("div", {'class': "pseg"}):
                # Keep only the first definition block.
                out_str = ''
                defn = soup.find("div", {'class': "pseg"}).find("div").get_text().strip()
                out_str += '<p>%s</p>' % defn
                descriptions[term] = (out_str,'DICTIONARY')
                time.sleep(1)
            elif r.status_code == 200:
                # Page loaded but has no definition: a genuine miss.
                print 'COULD NOT FIND %s' % term
                not_found.append(term)
            elif r.status_code == 403:
                # Presumably blocked for scraping -- abort the whole pass.
                print 'STOPPING FOR NOW'
                break
            else:
                # Transient error: queue the term for a retry and back off.
                print 'ERROR CODE: %d' % r.status_code
                do_over.append(term)
                time.sleep(10)
    if i % 10 == 0: print '**************** %d ****************' % i