In [1]:
# -*- coding: utf-8 -*-
In [1]:
import requests
from bs4 import BeautifulSoup
import codecs
import json
import string
import pandas as pd
from string import punctuation
import cPickle as pickle
import os
import re
import unicodedata
import time
In [3]:
# -- Load the investigators extract: pipe-delimited, no header row --
column_names = ['Investigator_id', "facility_id", "nct_id", 'Name', 'Investigator_Type', 'Null']
# quoting=3 (csv.QUOTE_NONE): names can contain stray quote characters
investigators_df = pd.read_csv('data/investigators.txt', names=column_names, sep="|", encoding='utf-8', quoting=3)
# drop the trailing 'Null' column produced by the line-ending '|'
investigators_df = investigators_df[investigators_df.columns[:-1]]
# keep only the part before the first comma to strip titles/credentials
# ("Smith, MD" -> "Smith"); the .str accessor is NaN-safe, unlike the
# previous apply(lambda x: x.split(',')[0]) which raised on missing names
investigators_df.Name = investigators_df.Name.str.split(',').str[0]
# different counts of unique ids vs. unique names hints at duplicate investigators
print('# of unique facility ids: %d' % len(set(investigators_df.Investigator_id)))
print('# of unique investigator names: %d' % len(set(investigators_df.Name)))
In [4]:
len(set(investigators_df.Investigator_id))
Out[4]:
In [5]:
# Build a lookup of every unique investigator name -> their ids and trials.
# The groupby key *is* the name, so there is no need to re-derive it from the
# group's rows (the old list(data.Name)[0] recomputed it three times per group).
investigator_dict = {}
for name, group in investigators_df.groupby('Name'):
    # placeholder rows such as "Site Reference ID 12345" are not real people
    if 'Site Reference ID' in name:
        continue
    investigator_dict[name] = {
        'id': list(set(group.Investigator_id)),
        'Trials': list(set(group.nct_id)),
    }
In [6]:
len(investigator_dict)
Out[6]:
In [9]:
investigator_dict['Thasarat Vajaranant']
Out[9]:
In [8]:
# Persist the investigator lookup; use a context manager so the file handle
# is flushed and closed deterministically (the previous bare
# pickle.dump(obj, open(...)) relied on garbage collection to close it).
with open('data/investigator_dict.pkl', 'wb') as fout:
    pickle.dump(investigator_dict, fout)
In [ ]:
len(set(map(lambda x: x.lower(), investigator_dict.keys())))
In [19]:
# Resume support: gather the names already fetched in previous runs so we
# only queue names that have not been processed yet.
# NOTE(review): chunk files are read from '../data' here, while the dict
# above was saved under 'data/' -- confirm which directory is correct.
done = set()
files = os.listdir('../data')
chunk_files = [f for f in files if 'investigator_dict_' in f]
for chunk in chunk_files:
    done.update(pickle.load(open('../data/' + chunk, 'rb')).keys())
# next output chunk index continues after the files already on disk
listnum = len(chunk_files) + 1
# queue the next batch of up to 20000 not-yet-fetched names
todo = [i for i in investigator_dict.keys() if i not in done][:20000]
In [ ]:
def get_publications(name_list, list_num):
    """Fetch raw PubMed publication XML for each investigator name.

    For every name: query NCBI esearch for the author's publication ids,
    then efetch the full records as XML.  The whitespace-collapsed XML is
    accumulated in a dict keyed by name and pickled to
    'investigator_dict_<list_num>.pkl' every 500 fetched names, after which
    the file index advances and the accumulator resets.  Returns None; the
    pickle files are the output.

    Fixes vs. the previous version:
    - the author name goes through `params`, so requests URL-encodes
      spaces/accents instead of concatenating them raw into the URL;
    - the final partial chunk is always flushed after the loop (before,
      any name skipped for having no ids meant `count` never reached
      len(name_list), so the tail of the results was silently dropped).

    Parameters
    ----------
    name_list : iterable of str -- investigator names to look up
    list_num : int -- starting index for the output pickle files
    """
    count = 0
    result_dict = {}
    for name in name_list:
        start_time = time.time()
        # 1) look up the author's publication ids
        search_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        r = requests.get(search_url,
                         params={'db': 'pubmed',
                                 'term': '%s[Author]' % name,
                                 'retmax': 100000})
        soup = BeautifulSoup(r.text)
        ids = [s.text for s in soup.find_all('id')]
        # skip the second call entirely when the author has no publications
        if len(ids) == 0:
            continue
        # 2) fetch the full records for all ids in one request
        fetch_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        r = requests.get(fetch_url,
                         params={'db': 'pubmed',
                                 'id': ','.join(ids),
                                 'retmode': 'xml',
                                 'retmax': 10000})
        # store collapsed raw text (BeautifulSoup objects do not pickle)
        result_dict[name] = ' '.join(r.text.split())
        # rate-limit: keep at most ~1 request cycle per second
        elapsed = time.time() - start_time
        if elapsed < 1:
            time.sleep(1 - elapsed)
        count += 1
        # checkpoint every 500 successful fetches
        if count % 500 == 0:
            with open('investigator_dict_%d.pkl' % list_num, 'wb') as fout:
                pickle.dump(result_dict, fout)
            print(count)
            list_num += 1
            result_dict = {}
    # flush whatever is left -- the last chunk is almost never exactly 500
    if result_dict:
        with open('investigator_dict_%d.pkl' % list_num, 'wb') as fout:
            pickle.dump(result_dict, fout)
        print(count)
In [ ]:
pub_dict = get_publications(todo, listnum)
In [ ]: