In [41]:
import codecs, requests, nltk, json, string, cPickle as pickle, Levenshtein as L, random, collections
import pandas as pd
from string import punctuation
from connect import gkey

Loading data

Facilities


In [162]:
# Column layout applied to the pipe-delimited facilities export; the names are
# supplied explicitly rather than being read from the file.
column_names = ["facility_id", "nct_id", "status", "facility_name",
                "city", "state", "zipcode", "country"]

facilities = pd.read_csv(
    '../data/facilities.txt',
    sep="|",
    names=column_names,
    encoding='utf-8',
    quoting=3  # 3 == csv.QUOTE_NONE: quote characters are treated as ordinary text
)

# Same rows keyed by trial id, so a trial's facilities can be pulled with .loc.
facilities_nct = facilities.set_index('nct_id')

Sponsors


In [151]:
# Column layout applied to the pipe-delimited sponsors export; the names are
# supplied explicitly rather than being read from the file.
column_names = ["sponsor_id", "nct_id", "sponsor_type", "agency", "agency_class"]

sponsors = pd.read_csv(
    '../data/sponsors.txt',
    sep="|",
    names=column_names,
    encoding='utf-8',
    quoting=3  # 3 == csv.QUOTE_NONE: quote characters are treated as ordinary text
)

# Same rows keyed by trial id.
sponsors_nct = sponsors.set_index('nct_id')

# Number of rows with a non-null nct_id for each agency name.
sponsor_count = sponsors.groupby('agency')['nct_id'].count()

Freebase lookups


In [5]:
# Candidate Freebase matches per sponsor, read from a local pickle.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so only
# load a pickle that was produced locally by a trusted run.
# The `with` block closes the file handle as soon as loading finishes.
with open('../data/sponsor_guess.pickle', 'rb') as sponsor_file:
    sponsor_poss = pickle.load(sponsor_file)

List of acceptable Freebase categories


In [6]:
# Freebase "notable" type names treated as acceptable for a trial sponsor.
# The matching loop auto-accepts a sponsor's top candidate when the candidate's
# notable name is a member of this set.
cat_ok = set([
    "College/University",
    "Business Operation",
    "Hospital",
    "Organization",
    "Nonprofit organization",
    "Pharmaceutical Preparations Business",
    "Government Agency",
    "Pharmaceutical industry Business",
    "Biotechnology Business",
    "Health care Business",
    "Biological Product (except Diagnostic) Manufacturing Business",
    "Employer",
    "Biomedical research Organization",
    "Venture Funded Company",
    "Biomedical research",
    "Clinical and Research Cancer Center",
    "Surgical and Medical Instruments and Apparatus Business",
    "Drug manufacturer",
    "Educational Institution",
    "Private university",
    "Physician",
    "Disease or medical condition",
    "Internal medicine Hospital",
    "Academic",
    "Electromedical and Electrotherapeutic Apparatus Business",
    "Government Office or Title",
    "In Vitro and In Vivo Diagnostic Substances Business",
    "Medical Equipment and Supplies Manufacturing Business",
    "Drug",
    "Medical specialty",
    "Commercial Physical and Biological Research Business",
    "Orthopedic, Prosthetic, and Surgical Appliances and Supplies Business",
    "Anatomical structure",
    "Medical school",
    "Chemical industry Business",
    "Public school",
    "Hospital and Medical Service Plans Business",
    "Analytical Laboratory Instrument Manufacturing Business",
    "Medical Laboratories Business",
    "Consumer product",
    "Surgeon",
    "Non-profit organization",
    "Educational Institution Campus",
    "Scientist",
    "School district",
    "Food Manufacturing Business",
    "Manufacturing Business",
    "Veterans' benefits Organization",
    "Autism Organization",
    "Diabetes mellitus Organization",
    "School",
    "Think tank",
    "Venture Investor",
    "Crude Petroleum and Natural Gas Extraction Business",
    "Cancer Center Constituent",
    "US State",
    "Dentistry",
    "Military post",
    "General Medical and Surgical Hospitals Business",
    "Drug brand",
    "Financial Services Business",
    "Consumer electronics Business",
    "Household Audio and Video Equipment Business",
    "Bottled and Canned Soft Drinks and Carbonated Waters Business",
    "Alternative medicine Organization",
    "Philanthropy Organization",
    "Petroleum industry Business",
    "Alzheimer's disease Organization",
    "Conglomerate Business",
    "Surgery Organization",
    "Public health Organization",
    "Drug policy Organization",
    "Pharmacies and Drug Stores Business",
    "Agricultural science Organization",
    "Photographic Equipment and Supplies Business",
    "National university",
    "Conservation Organization",
    "Medical trial sponsor",
    "University system",
    "Research Business",
    "Information technology Business",
    "Inventor",
    "Stem Cell Research Business",
    "Perfumes, Cosmetics, and Other Toilet Preparations Business",
    "Defunct Organization",
    "Computer hardware Business",
    "Chemist",
    "Physicist",
    "International development Organization",
    "Department",
    "Foundation",
    "Medicinal and Botanical Manufacturing Business",
    "Psychologist",
    "Cancer Center",
    "Telecom Equipment Vendor Business",
    "Land-grant university",
    "Cooperative",
    "Sports equipment Business",
    "Neurologist",
    "Drugs and Druggists' Sundries Merchant Wholesalers Business",
    "Engineering and Construction Services Business",
    "Aerospace Business",
    "Mining Business",
    "Pediatrics Hospital",
    "Internet Publishing and Broadcasting and Web Search Portals Business",
    "Medical Treatment",
    "Blood donation Organization",
    "Instruments for Measuring and Testing of Electricity and Electrical Signals Business",
    "Cause Of Death",
    "Women's rights Organization",
    "Consumer company",
    "Private equity Business",
    "Plastics Material and Resin Manufacturing Business",
    "Agricultural marketing Organization",
    "Natural history Museum",
    "Institute of technology",
    "Toy Business",
    "Anti-epileptic Agent Drug",
    "Macrolide Antibiotic Drug",
    "Vaccination Organization",
    "Life Insurance Business",
    "Physical medicine and rehabilitation Hospital",
    "Palliative care Organization",
    "Mental health Organization",
    "Biotechnology Organization",
])

Picking best match based on Freebase category


In [7]:
# Accumulators filled by the matching loop. Re-running this cell throws away
# every decision made so far.
best_match = dict()  # sponsor name -> chosen Freebase candidate
not_found = list()   # sponsor names with no acceptable candidate

In [ ]:
# First matching pass over every distinct sponsor name, in value_counts() order.
# A sponsor whose top Freebase candidate has an acceptable "notable" category is
# matched automatically; otherwise the operator picks from the top candidates or
# the sponsor is recorded as not found. Sponsors already present in best_match or
# not_found are skipped, so the cell can be re-run to resume a review.
for a in sponsors['agency'].value_counts().index:
    if a not in best_match and a not in not_found:
        # sponsor_poss is looked up with the UTF-8 encoded sponsor name
        aa = a.encode('utf-8')
        # A sponsor missing from the lookup is treated like one with no candidates
        # instead of aborting the whole pass with a KeyError.
        # NOTE(review): assumes sponsor_poss is a dict (supports .get) - confirm.
        candidates = sponsor_poss.get(aa, [])
        if len(candidates) > 0:
            top = candidates[0]
            if 'notable' in top and top['notable']['name'] in cat_ok:
                best_match[a] = top
            else:
                print(u'Sponsor: %s' % a)
                shown = candidates[:5]
                for i, s in enumerate(shown):
                    print('%d.  %s' % (i, str(s)))
                same = raw_input('Go with one of these? (enter number) ')
                # Only accept numbers that were actually listed: fewer than five
                # candidates may have been shown.
                if same in [str(i) for i in range(len(shown))]:
                    best_match[a] = shown[int(same)]
                else:
                    not_found.append(a)
                # print('') emits the same blank line as a bare Python 2 `print`
                print('')
                print('')
        else:
            not_found.append(a)

In [104]:
# Checkpoint the current matching state to disk. The `with` blocks make sure each
# file is flushed and closed as soon as its dump completes.
with open('../data/facility_match_good.pkl', 'wb') as good_file:
    pickle.dump(best_match, good_file)
with open('../data/facility_match_bad.pkl', 'wb') as bad_file:
    pickle.dump(not_found, bad_file)

Getting data for good matches


In [43]:
# procedure get data
def query_url(url, param_data=None, retries=5):
    '''
    Fetch a URL and decode its JSON body, retrying on failure.

    url: full URL from which to request data
    param_data: optional dict of query parameters handed to requests.get
    retries: maximum number of attempts before moving on

    Returns the decoded JSON of the first response with status code 200, or
    None when every attempt either raised or returned another status code.
    '''
    last_status = None
    while retries > 0:
        # every attempt counts against the budget, whether it raised or not
        retries -= 1
        try:
            if param_data:
                r = requests.get(url, params=param_data)
            else:
                r = requests.get(url)
        except Exception:
            # Best effort: report and try again. Catching Exception rather than
            # using a bare except leaves Ctrl-C (KeyboardInterrupt) working.
            print('  Got error querying %s. Retrying.' % url)
            continue
        if r.status_code == 200:
            return json.loads(r.text)
        last_status = r.status_code

    # if we've tried enough times, print problem and return nothing
    if last_status is None:
        # no attempt produced a response, so there is no status code to report
        print('  Unable to query %s. No response received.' % url)
    else:
        print('  Unable to query %s. Status code %d.' % (url, last_status))
    return None

In [143]:
# procedure to get type and location data
def query_freebase(mid, name):
    '''
    Look up the Freebase types and a geocode for one entity.

    mid: string identifying the Freebase topic; it is inserted after
         '/topic/m/' in the request URL
    name: string of entity name, e.g. 'University of California, Berkeley';
          used as the address for the Google Maps fallback

    Returns a tuple (type_data, loc_data):
      type_data: list of type names, leaving out types whose id starts with
                 '/base', '/user' or '/comm'; None if no type data came back
      loc_data:  dict with 'lat', 'lng' and 'source' (plus 'address' when the
                 Google Maps geocoder supplied it); None if nothing was found
    '''
    # type information
    type_data = None
    url = 'https://www.googleapis.com/freebase/v1/topic/m/%s?filter=/type/object/type&key=%s' % (mid, gkey)
    type_info = query_url(url)
    if type_info and 'property' in type_info and '/type/object/type' in type_info['property']:
        type_data = []
        for entry in type_info['property']['/type/object/type']['values']:
            if entry['id'][:5] in ('/base', '/user', '/comm'):
                continue
            type_data.append(entry['text'])

    # location information
    loc_data = None
    url = 'https://www.googleapis.com/freebase/v1/topic/m/%s?filter=/location/location/geolocation&key=%s' % (mid, gkey)
    geo_info = query_url(url)
    if geo_info and 'property' in geo_info and '/location/location/geolocation' in geo_info['property']:
        for value in geo_info['property']['/location/location/geolocation']['values']:
            props = value['property']
            if '/type/object/type' not in props:
                continue
            if props['/type/object/type']['values'][0]['text'] != 'Geocode':
                continue
            # when several geocodes are listed, the last one wins
            loc_data = {'lat': props['/location/geocode/latitude']['values'][0]['value'],
                        'lng': props['/location/geocode/longitude']['values'][0]['value'],
                        'source': 'Freebase'}

    if loc_data:
        return (type_data, loc_data)

    # backfilling with gmaps
    geocoded = query_url('https://maps.googleapis.com/maps/api/geocode/json',
                         param_data={'address': name, 'key': gkey})
    if geocoded and len(geocoded['results']) > 0:
        first_hit = geocoded['results'][0]
        # the location dict from the response is extended in place
        loc_data = first_hit['geometry']['location']
        loc_data['address'] = first_hit['formatted_address']
        loc_data['source'] = 'Google Maps'

    return (type_data, loc_data)

In [64]:
# Sponsors whose match gets rejected during the interactive review loop.
to_remove = list()

In [ ]:
# Interactive review pass. For every matched sponsor that has not been reviewed
# yet (no 'types' key and not in to_remove), show the candidate Freebase entity
# with its types and geocode, then let the operator accept it, accept it without
# the geocode, swap in another candidate, or reject it.
for b in sponsors['agency'].value_counts().index:
    if b in best_match and b not in to_remove and 'types' not in best_match[b]:
        # print the basics
        print('Sponsor name: %s' % b)
        print('Freebase name: %s' % best_match[b]['name'])
        print('Freebase id: %s' % best_match[b]['mid'][3:])
        if 'notable' in best_match[b]:
            print('Notable for: %s' % best_match[b]['notable']['name'])
        else:
            print('No notable listing')
        # print('') emits the same blank line as a bare Python 2 `print`
        print('')

        # print top facilities
        cur_trials = sponsors[sponsors.agency == b]['nct_id']
        facs = facilities_nct.loc[cur_trials]['facility_name'].value_counts()[:3].index
        print('Top facilities: %s' % str(list(facs)))
        print('')

        # get type and location information
        type_data, loc_data = query_freebase(best_match[b]['mid'][3:], best_match[b]['name'])
        # query_freebase returns None for the types when the lookup yields nothing
        print('Freebase types: ' + ', '.join(type_data or []))
        print('')
        print('Location info: %s' % str(loc_data))

        # check if this is acceptable
        ok = raw_input('Enter to accept, "m" to see other possibilities, "d" to drop geocode, any other key to reject both\n')
        if not ok or ok.strip() == '':
            best_match[b]['types'] = type_data
            if loc_data:
                best_match[b]['geo'] = loc_data
        elif ok.lower()[0] == 'd':
            best_match[b]['types'] = type_data
        elif ok.lower()[0] == 'm':
            # sponsor_poss is looked up with the UTF-8 encoded sponsor name; reuse
            # the same list for the menu and for the pick so that sponsors with
            # non-ASCII names resolve to the entry that was displayed.
            candidates = sponsor_poss[b.encode('utf-8')]
            for i, p in enumerate(candidates):
                if len(p['name']) > 0:
                    print('%d. %s (%s) %s' % (i, p['name'], p['notable']['name'] if 'notable' in p else 'no notable', p['mid']))
            ok2 = raw_input('Any of these? ')
            try:
                new_s = int(ok2)
                if new_s < 0:
                    # negative numbers would silently index from the end of the list
                    raise IndexError(new_s)
                chosen = candidates[new_s]
                type_data, loc_data = query_freebase(chosen['mid'][3:], chosen['name'])
                print('Freebase types: ' + ', '.join(type_data or []))
                print('')
                print('Location info: %s' % str(loc_data))
                ok3 = raw_input('Enter to accept, any other key to reject ')
                if not ok3 or ok3.strip() == '':
                    best_match[b] = chosen
                    best_match[b]['types'] = type_data
                    if loc_data:
                        best_match[b]['geo'] = loc_data
                else:
                    to_remove.append(b)
            except Exception:
                # Unparseable or out-of-range choice, or a failed lookup: reject the
                # sponsor. Catching Exception (not a bare except) keeps Ctrl-C able
                # to stop the review.
                to_remove.append(b)
        else:
            to_remove.append(b)
        print('')
        print('')

In [177]:
# Number of matched sponsors that already carry a 'types' entry.
sum(1 for match in best_match.values() if 'types' in match)


Out[177]:
998

In [108]:
# Display the sponsors whose candidate match was rejected during the review loop.
to_remove


Out[108]:
[u'Lundbeck Foundation',
 u'Hospital for Special Surgery, New York',
 u'The Royal Norwegian Ministry of Health',
 u'Aesculap AG',
 u'Hvidovre University Hospital',
 u'Universit\xe0 degli Studi di Ferrara',
 u'Christiana Care Health Services',
 u'ORA, Inc.',
 u'Fresenius Medical Care North America',
 u'Dong-A Pharmaceutical Co., Ltd.',
 u'Seattle Institute for Biomedical and Clinical Research',
 u'Ministry of Health, France',
 u'Huashan Hospital',
 u'Baylor Research Institute',
 u'Alberta Heritage Foundation for Medical Research',
 u'Southeast University, China',
 u'Biotronik, Inc.',
 u'Healthpoint',
 u'Cardiovascular Institute & Fuwai Hospital',
 u'Mount Sinai Hospital, Canada',
 u'Sahlgrenska University Hospital, Sweden',
 u'ConvaTec Inc.',
 u'Academisch Medisch Centrum - Universiteit van Amsterdam (AMC-UvA)',
 u'Institute of Tropical Medicine, Belgium']

In [179]:
# Fold the rejected sponsors into the not-found list (de-duplicated through a set,
# so the resulting order is arbitrary) and drop them from the accepted matches.
not_found = list(set(not_found + to_remove))
for r in to_remove:
    best_match.pop(r, None)

# Attach each matched sponsor's count from sponsor_count, looked up by sponsor name.
for b in best_match.keys():
    best_match[b]['trials'] = sponsor_count[b]

# Persist the final state. The `with` blocks make sure each file is flushed and
# closed as soon as its dump completes.
with open('../data/facility_match_good.pkl', 'wb') as good_file:
    pickle.dump(best_match, good_file)
with open('../data/facility_match_bad.pkl', 'wb') as bad_file:
    pickle.dump(not_found, bad_file)

In [ ]: