In [1]:
# -*- coding: utf-8 -*-

In [1]:
import requests
from bs4 import BeautifulSoup
import codecs
import json
import string
import pandas as pd
from string import punctuation
import cPickle as pickle
import os
import re
import unicodedata
import time

Load Investigator Data


In [3]:
column_names = ['Investigator_id', "facility_id", "nct_id", 'Name', 'Investigator_Type', 'Null']

investigators_df = pd.read_csv('data/investigators.txt', names=column_names, sep="|", encoding='utf-8', quoting=3)
investigators_df = investigators_df[investigators_df.columns[:-1]]

# remove everything after the comma in names to get rid of Dr. ect
investigators_df.Name = investigators_df.Name.apply(lambda x: x.split(',')[0])

#different number of unique ids to unique names
print '# of unique facility ids: ', len(set(investigators_df.Investigator_id))

print '# of unique investigator names: ', len(set(investigators_df.Name))


# of unique facility ids:  147021
# of unique investigator names:  98129

In [4]:
len(set(investigators_df.Investigator_id))


Out[4]:
147021

Investigator Id actually = facility id


In [5]:
#get a dict of all unique investigator names associated with their ids and studies
# Map each unique investigator name to its associated ids and trials.
# The groupby key `invest` IS the name, so there is no need to re-extract
# it via list(data.Name)[0] on every iteration as the original did.
investigator_dict = {}
for invest, data in investigators_df.groupby('Name'):
    # Placeholder rows such as 'Site Reference ID 12345' are not real
    # investigator names; skip them.
    if 'Site Reference ID' in invest:
        continue
    investigator_dict[invest] = {'id': list(set(data.Investigator_id)),
                                 'Trials': list(set(data.nct_id))}

In [6]:
len(investigator_dict)


Out[6]:
91029

In [9]:
investigator_dict['Thasarat Vajaranant']


Out[9]:
{'Trials': [u'NCT01630551'], 'id': [80886]}

In [8]:
#save investigator dict
pickle.dump(investigator_dict, open('data/investigator_dict.pkl','wb'))

In [ ]:
len(set(map(lambda x: x.lower(), investigator_dict.keys())))

Pull publications


In [19]:
# load in already run investigators
# Resume support: collect names already fetched in previous runs so we do
# not re-query PubMed for them.
# NOTE(review): this cell reads '../data' while earlier cells use 'data/';
# confirm which directory actually holds the chunk pickles.
done = set()
files = os.listdir('../data')
chunk_files = [f for f in files if 'investigator_dict_' in f]   # compute once, reuse below
for f in chunk_files:
    done.update(pickle.load(open('../data/' + f, 'rb')).keys())

# Continue numbering the output pickles after the existing chunk files.
listnum = len(chunk_files) + 1

# Next batch: up to 20000 names that have not been fetched yet.
todo = [i for i in investigator_dict.keys() if i not in done][:20000]

In [ ]:
def get_publications(name_list, list_num):
    count=0
    result_dict = {}
    for name in name_list:
        start_time = time.time()
        #get publication ids for investigator
        search_start = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed'
        investigator = name

        r = requests.get(search_start + '&term=%s[Author]&retmax=100000' % investigator)
        soup = BeautifulSoup(r.text)
        ids = [s.text for s in soup.find_all('id')]
        
        #don't do second call if there are no ids
        if len(ids) == 0:
            continue
        
        #get publications from ids
        summary_start = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed'
        id_list = ','.join(ids)

        r = requests.get(summary_start + '&id=%s&retmode=xml&retmax=10000' % id_list)

        #pickle does not work with soup objects
        #soup = BeautifulSoup(r.text)
        
        #add publications to the dict
        result_dict[name] = ' '.join(r.text.split())
        
        #time taken
        if (time.time() - start_time) < 1:
            time.sleep(1-(time.time() - start_time))

        count += 1

        if (count % 500) == 0 or count == len(name_list):
            pickle.dump(result_dict, open('investigator_dict_%d.pkl' % (list_num),'wb'))
            print count
            list_num += 1
            result_dict = {}

In [ ]:
pub_dict = get_publications(todo, listnum)

In [ ]: