In [1]:
# -*- coding: utf-8 -*-

In [1]:
import requests
from bs4 import BeautifulSoup
import codecs
import json
import string
import pandas as pd
from string import punctuation
import cPickle as pickle
import os
import re
import unicodedata
import time

Load Investigator Data


In [3]:
column_names = ['Investigator_id', "facility_id", "nct_id", 'Name', 'Investigator_Type', 'Null']

investigators_df = pd.read_csv('data/investigators.txt', names=column_names, sep="|", encoding='utf-8', quoting=3)
investigators_df = investigators_df[investigators_df.columns[:-1]]

# remove everything after the comma in names to get rid of Dr. ect
investigators_df.Name = investigators_df.Name.apply(lambda x: x.split(',')[0])

#different number of unique ids to unique names
print '# of unique facility ids: ', len(set(investigators_df.Investigator_id))

print '# of unique investigator names: ', len(set(investigators_df.Name))


# of unique facility ids:  147021
# of unique investigator names:  98129

In [4]:
len(set(investigators_df.Investigator_id))


Out[4]:
147021

Investigator Id actually = facility id


In [5]:
#get a dict of all unique investigator names associated with their ids and studies
# Map each unique investigator name to its associated ids and trials.
# The groupby key `invest` IS the name, so there is no need to re-extract
# it via list(data.Name)[0] on every iteration as the original did.
investigator_dict = {}
for invest, data in investigators_df.groupby('Name'):
    # Placeholder rows such as 'Site Reference ID 12345' are not real
    # investigator names; skip them.
    if 'Site Reference ID' in invest:
        continue
    investigator_dict[invest] = {'id': list(set(data.Investigator_id)),
                                 'Trials': list(set(data.nct_id))}

In [6]:
len(investigator_dict)


Out[6]:
91029

In [9]:
investigator_dict['Thasarat Vajaranant']


Out[9]:
{'Trials': [u'NCT01630551'], 'id': [80886]}

In [8]:
#save investigator dict
pickle.dump(investigator_dict, open('data/investigator_dict.pkl','wb'))

In [ ]:
len(set(map(lambda x: x.lower(), investigator_dict.keys())))

Pull publications


In [19]:
# load in already run investigators
# Resume support: collect names already fetched in previous runs so we do
# not re-query PubMed for them.
# NOTE(review): this cell reads '../data' while earlier cells use 'data/';
# confirm which directory actually holds the chunk pickles.
done = set()
files = os.listdir('../data')
chunk_files = [f for f in files if 'investigator_dict_' in f]   # compute once, reuse below
for f in chunk_files:
    done.update(pickle.load(open('../data/' + f, 'rb')).keys())

# Continue numbering the output pickles after the existing chunk files.
listnum = len(chunk_files) + 1

# Next batch: up to 20000 names that have not been fetched yet.
todo = [i for i in investigator_dict.keys() if i not in done][:20000]

In [ ]:
def get_publications(name_list, list_num):
    count=0
    result_dict = {}
    for name in name_list:
        start_time = time.time()
        #get publication ids for investigator
        search_start = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed'
        investigator = name

        r = requests.get(search_start + '&term=%s[Author]&retmax=100000' % investigator)
        soup = BeautifulSoup(r.text)
        ids = [s.text for s in soup.find_all('id')]
        
        #don't do second call if there are no ids
        if len(ids) == 0:
            continue
        
        #get publications from ids
        summary_start = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed'
        id_list = ','.join(ids)

        r = requests.get(summary_start + '&id=%s&retmode=xml&retmax=10000' % id_list)

        #pickle does not work with soup objects
        #soup = BeautifulSoup(r.text)
        
        #add publications to the dict
        result_dict[name] = ' '.join(r.text.split())
        
        #time taken
        if (time.time() - start_time) < 1:
            time.sleep(1-(time.time() - start_time))

        count += 1

        if (count % 500) == 0 or count == len(name_list):
            pickle.dump(result_dict, open('investigator_dict_%d.pkl' % (list_num),'wb'))
            print count
            list_num += 1
            result_dict = {}

In [ ]:
pub_dict = get_publications(todo, listnum)

In [ ]: