In [ ]:
import cPickle as pickle, requests, codecs
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from connect import mysqlusername, mysqlpassword, mysqlserver, mysqldbname
from collections import defaultdict
#from sqlalchemy.orm import sessionmaker
#from db_models import Base, ClinicalStudy, Facilities, Sponsors
from sqlalchemy.sql import func, select, and_, or_, not_, desc
from db_tables import metadata, Reference, TrialPublications
In [ ]:
publications = pickle.load(open('../data/pub_lookup_dict.pkl','rb'))
In [ ]:
pmids = publications.keys()
In [ ]:
print len(pmids)
In [ ]:
template = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml&retmax=10000'
bad = []
ok = {}
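EFetch takes a comma-separated list of IDs, so each batch is joined before being substituted into the template. A quick sanity check with made-up PMIDs (hypothetical values, just to show the URL shape):
In [ ]:
# the PMIDs below are placeholders, not real records
print template % ','.join(['12345678', '23456789'])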
In [ ]:
for i in range(0, len(pmids), 100):
    # fetch records in batches of 100, skipping PMIDs we already have
    these_pubs = [p for p in pmids[i:i+100] if p not in ok]
    if these_pubs:
        # EFetch expects a comma-separated list of IDs
        r = requests.get(template % ','.join(these_pubs))
        if r.status_code == 200:
            soup = BeautifulSoup(r.text)
            for a in soup.findAll('pubmedarticle'):
                this_pmid = a.pmid.contents[0]
                if this_pmid in these_pubs:
                    title = a.article.articletitle.contents[0]
                    # prefer the ISO journal abbreviation, fall back to the full title
                    if a.article.journal.isoabbreviation:
                        journal = a.article.journal.isoabbreviation.contents[0]
                    else:
                        journal = a.article.journal.title.contents[0]
                    if journal[-1] != '.':
                        journal += '.'
                    if a.article.journal.journalissue.volume:
                        vol = a.article.journal.journalissue.volume.contents[0]
                    else:
                        vol = ''
                    if a.article.journal.journalissue.issue:
                        iss = '(%s)' % a.article.journal.journalissue.issue.contents[0]
                    else:
                        iss = ''
                    pubdate = ' '.join([c for c in a.article.journal.journalissue.pubdate.strings if c != '\n'])
                    if a.article.pagination.medlinepgn.contents:
                        pages = ':%s' % a.article.pagination.medlinepgn.contents[0]
                    else:
                        pages = ''
                    authors = []
                    for t in a.article.authorlist.findAll('author'):
                        if t.lastname:
                            if t.initials:
                                authors.append(t.lastname.contents[0] + ' ' + t.initials.contents[0])
                            else:
                                authors.append(t.lastname.contents[0])
                    ok[this_pmid] = {'authors': ', '.join(authors),
                                     'title': title,
                                     'citation': '%s %s;%s%s%s.' % (journal,
                                                                    pubdate,
                                                                    vol,
                                                                    iss,
                                                                    pages),
                                     }
        else:
            # request failed; remember these PMIDs so they can be retried or inspected
            bad += these_pubs
    print i
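Each entry in ok maps a PMID to the pieces of a display citation (authors, title, citation). A quick peek at one record:
In [ ]:
# show a single fetched record, if any were returned
if ok:
    sample_pmid = ok.keys()[0]
    print sample_pmid, ok[sample_pmid]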
In [ ]:
print len(ok)
print len(bad)
In [ ]:
f = codecs.open('../data/pub_lookup_journals.txt','w','utf-8')
for p in ok:
    to_write = '||'.join([p, ok[p]['authors'], ok[p]['title'].replace(u'\u2029', ' '), ok[p]['citation']]) + '\n'
    f.write(to_write)
f.close()
In [ ]:
investigators_trials = pickle.load(open('../data/trial_invest_pub_match_dict.pkl','rb'))
publications = pickle.load(open('../data/pub_lookup_dict.pkl','rb'))
In [ ]:
journals = {}
for row in codecs.open('../data/pub_lookup_journals.txt', 'r', 'utf-8').readlines():
    k, a, t, c = row.strip().split('||')
    journals[k] = {'authors': a, 'title': t, 'citation': c}
In [ ]:
mysqlserver = 'localhost'  # override the imported value; run against the local database
engine = create_engine('mysql://%s:%s@%s/%s' % (mysqlusername, mysqlpassword, mysqlserver, mysqldbname))
conn = engine.connect()
metadata.create_all(engine)
Set up PMID to NCT_ID dictionary
In [ ]:
pmid_trials = defaultdict(dict)
for i in investigators_trials:
    for n in investigators_trials[i]:
        for p in investigators_trials[i][n]:
            # publication-trial pairs inferred via investigators start at confidence 0.0
            pmid_trials[p][n] = 0.0
Get the current reference table and split each citation into the fields we need
In [ ]:
cur_ref = conn.execute(Reference.select()).fetchall()
In [ ]:
def split_cite(full_cite):
    # split a full citation string into (authors, title, citation)
    if full_cite[:20] == '[No authors listed] ':
        authors = None
        full_cite = full_cite[20:]
    else:
        authors = full_cite[:full_cite.find('.')]
        full_cite = full_cite[full_cite.find('.')+2:]
    title = full_cite[:full_cite.find('.')]
    citation = full_cite[full_cite.find('.')+2:]
    return authors, title, citation

for ref_id, nct_id, typ, full_cite, pmid in cur_ref:
    if pmid:
        # references already linked in the database get full confidence
        pmid_trials[pmid][nct_id] = 1.0
        if pmid not in journals:
            authors, title, citation = split_cite(full_cite)
            journals[pmid] = {'authors': authors,
                              'title': title,
                              'citation': citation}
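A quick check of split_cite on a made-up citation string (a hypothetical example, not taken from the reference table):
In [ ]:
# expected output: ('Smith J, Jones K', 'A trial of something', 'J Clin Stud. 2014;10(2):100-10.')
print split_cite('Smith J, Jones K. A trial of something. J Clin Stud. 2014;10(2):100-10.')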
Write to the database
In [ ]:
def fix_authors(author_str):
    # truncate long author lists to the first ten names plus 'et al.'
    if author_str:
        author_list = author_str.split(', ')
        if len(author_list) > 10:
            return ', '.join(author_list[:10] + ['et al.'])
        else:
            return author_str
    else:
        return ''
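A quick check of fix_authors on a synthetic author string with fifteen names (hypothetical data):
In [ ]:
# should print the first ten names followed by 'et al.'
print fix_authors(', '.join(['Author%d X' % j for j in range(1, 16)]))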
In [ ]:
insert_obj = [{'nct_id': n,
               'pubmed_id': p,
               'authors': fix_authors(journals[p]['authors']),
               'title': journals[p]['title'],
               'citation': journals[p]['citation'],
               'confidence': pmid_trials[p][n]}
              for p in journals
              for n in pmid_trials[p]]
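A quick look at one row before the bulk insert (the keys should line up with the TrialPublications columns):
In [ ]:
# inspect a single row of the prepared insert list
if insert_obj:
    print insert_obj[0]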
In [ ]:
for k in range(0, len(insert_obj), 5000):
    # insert in chunks of 5000 rows
    print k
    conn.execute(TrialPublications.insert(), insert_obj[k:k+5000])
In [ ]:
conn.close()