In [41]:
import codecs, requests, nltk, json, string, cPickle as pickle, Levenshtein as L, random, collections
import pandas as pd
from string import punctuation
from connect import gkey
Facilities
In [162]:
# Column layout for the pipe-delimited facilities export. The names are
# supplied explicitly, so pandas reads the first line of the file as data.
column_names = [
    "facility_id", "nct_id", "status", "facility_name",
    "city", "state", "zipcode", "country",
]
facilities = pd.read_csv(
    '../data/facilities.txt',
    sep="|",
    names=column_names,
    encoding='utf-8',
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as literal text
)
# Same rows keyed by trial id, for label-based lookups by nct_id.
facilities_nct = facilities.set_index('nct_id')
Sponsors
In [151]:
# Column layout for the pipe-delimited sponsors export. The names are
# supplied explicitly, so pandas reads the first line of the file as data.
column_names = ["sponsor_id", "nct_id", "sponsor_type", "agency", "agency_class"]
sponsors = pd.read_csv(
    '../data/sponsors.txt',
    sep="|",
    names=column_names,
    encoding='utf-8',
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as literal text
)
# Same rows keyed by trial id.
sponsors_nct = sponsors.set_index('nct_id')
# Number of sponsor rows (with a non-null nct_id) per agency name.
sponsor_count = sponsors.groupby('agency')['nct_id'].count()
Freebase lookups
In [5]:
# Candidate Freebase matches per sponsor. The code below indexes this by the
# utf-8 encoded agency name and expects a list of candidate dicts per key.
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced by this project.
with open('../data/sponsor_guess.pickle', 'rb') as sponsor_guess_file:
    sponsor_poss = pickle.load(sponsor_guess_file)
List of acceptable Freebase categories
In [6]:
# Set of acceptable Freebase "notable" category names. In the matching loop a
# sponsor's top-ranked candidate is accepted without manual review only when
# its notable category name is a member of this set.
cat_ok = {"College/University",
"Business Operation",
"Hospital",
"Organization",
"Nonprofit organization",
"Pharmaceutical Preparations Business",
"Government Agency",
"Pharmaceutical industry Business",
"Biotechnology Business",
"Health care Business",
"Biological Product (except Diagnostic) Manufacturing Business",
"Employer",
"Biomedical research Organization",
"Venture Funded Company",
"Biomedical research",
"Clinical and Research Cancer Center",
"Surgical and Medical Instruments and Apparatus Business",
"Drug manufacturer",
"Educational Institution",
"Private university",
"Physician",
"Disease or medical condition",
"Internal medicine Hospital",
"Academic",
"Electromedical and Electrotherapeutic Apparatus Business",
"Government Office or Title",
"In Vitro and In Vivo Diagnostic Substances Business",
"Medical Equipment and Supplies Manufacturing Business",
"Drug",
"Medical specialty",
"Commercial Physical and Biological Research Business",
"Orthopedic, Prosthetic, and Surgical Appliances and Supplies Business",
"Anatomical structure",
"Medical school",
"Chemical industry Business",
"Public school",
"Hospital and Medical Service Plans Business",
"Analytical Laboratory Instrument Manufacturing Business",
"Medical Laboratories Business",
"Consumer product",
"Surgeon",
"Non-profit organization",
"Educational Institution Campus",
"Scientist",
"School district",
"Food Manufacturing Business",
"Manufacturing Business",
"Veterans' benefits Organization",
"Autism Organization",
"Diabetes mellitus Organization",
"School",
"Think tank",
"Venture Investor",
"Crude Petroleum and Natural Gas Extraction Business",
"Cancer Center Constituent",
"US State",
"Dentistry",
"Military post",
"General Medical and Surgical Hospitals Business",
"Drug brand",
"Financial Services Business",
"Consumer electronics Business",
"Household Audio and Video Equipment Business",
"Bottled and Canned Soft Drinks and Carbonated Waters Business",
"Alternative medicine Organization",
"Philanthropy Organization",
"Petroleum industry Business",
"Alzheimer's disease Organization",
"Conglomerate Business",
"Surgery Organization",
"Public health Organization",
"Drug policy Organization",
"Pharmacies and Drug Stores Business",
"Agricultural science Organization",
"Photographic Equipment and Supplies Business",
"National university",
"Conservation Organization",
"Medical trial sponsor",
"University system",
"Research Business",
"Information technology Business",
"Inventor",
"Stem Cell Research Business",
"Perfumes, Cosmetics, and Other Toilet Preparations Business",
"Defunct Organization",
"Computer hardware Business",
"Chemist",
"Physicist",
"International development Organization",
"Department",
"Foundation",
"Medicinal and Botanical Manufacturing Business",
"Psychologist",
"Cancer Center",
"Telecom Equipment Vendor Business",
"Land-grant university",
"Cooperative",
"Sports equipment Business",
"Neurologist",
"Drugs and Druggists' Sundries Merchant Wholesalers Business",
"Engineering and Construction Services Business",
"Aerospace Business",
"Mining Business",
"Pediatrics Hospital",
"Internet Publishing and Broadcasting and Web Search Portals Business",
"Medical Treatment",
"Blood donation Organization",
"Instruments for Measuring and Testing of Electricity and Electrical Signals Business",
"Cause Of Death",
"Women's rights Organization",
"Consumer company",
"Private equity Business",
"Plastics Material and Resin Manufacturing Business",
"Agricultural marketing Organization",
"Natural history Museum",
"Institute of technology",
"Toy Business",
"Anti-epileptic Agent Drug",
"Macrolide Antibiotic Drug",
"Vaccination Organization",
"Life Insurance Business",
"Physical medicine and rehabilitation Hospital",
"Palliative care Organization",
"Mental health Organization",
"Biotechnology Organization"
}
In [7]:
# Working state for the sponsor -> Freebase matching pass.
# Re-running this cell discards any matches made so far.
best_match = dict()  # agency name -> chosen Freebase candidate
not_found = list()   # agency names left without a candidate
In [ ]:
# First pass: pick a Freebase candidate for every sponsor, most frequent
# sponsors first. Sponsors already present in best_match / not_found are
# skipped, so the cell can be interrupted and re-run to resume.
for a in sponsors['agency'].value_counts().index:
    if a in best_match or a in not_found:
        continue

    # sponsor_poss is keyed by the utf-8 encoded agency name; an agency that
    # is absent from the lookup file is treated the same as "no candidates"
    candidates = sponsor_poss.get(a.encode('utf-8'), [])
    if len(candidates) == 0:
        not_found.append(a)
        continue

    top = candidates[0]
    if 'notable' in top and top['notable']['name'] in cat_ok:
        # top-ranked candidate has an acceptable category: take it as is
        best_match[a] = top
        continue

    # otherwise ask for a manual choice among (at most) the first five
    shortlist = candidates[:5]
    print(u'Sponsor: %s' % a)
    for i, s in enumerate(shortlist):
        print('%d. %s' % (i, str(s)))
    same = raw_input('Go with one of these? (enter number) ')
    # Accept only a number that was actually listed. Previously '0'-'4' were
    # accepted even with fewer than five candidates, raising IndexError.
    if same in [str(i) for i in range(len(shortlist))]:
        best_match[a] = shortlist[int(same)]
    else:
        not_found.append(a)
    print('')
    print('')
In [104]:
# Save the current matches and misses to disk. The 'with' blocks close each
# file (flushing it) even if pickling raises.
with open('../data/facility_match_good.pkl', 'wb') as good_file:
    pickle.dump(best_match, good_file)
with open('../data/facility_match_bad.pkl', 'wb') as bad_file:
    pickle.dump(not_found, bad_file)
In [43]:
# procedure get data
def query_url(url, param_data=None, retries=5):
    '''
    Request a URL and return its decoded JSON body.

    url: full URL from which to request data
    param_data: optional dict of query-string parameters
    retries: maximum number of attempts before moving on

    Returns the parsed JSON of the first HTTP 200 response, or None once
    every attempt has failed (request error or non-200 status).
    '''
    last_problem = 'No attempts made'
    while retries > 0:
        # count every attempt, including ones that raise, so a persistent
        # connection error cannot loop forever
        retries -= 1
        try:
            # requests treats params=None the same as omitting the argument
            r = requests.get(url, params=param_data)
        except requests.RequestException as e:
            # no response object exists on this path, so go straight to the
            # next attempt instead of inspecting r
            print(' Got error querying %s. Retrying.' % url)
            last_problem = 'Error: %s' % e
            continue
        if r.status_code == 200:
            return json.loads(r.text)
        last_problem = 'Status code %d' % r.status_code
    # if we've tried enough times, print problem and return nothing
    print(' Unable to query %s. %s.' % (url, last_problem))
    return None
In [143]:
# procedure to get type and location data
def query_freebase(mid, name):
    '''
    Look up an entity's Freebase types and a geocode for it.

    mid: string representing unique Freebase ID (it is appended to
         '.../topic/m/' to form the request URL)
    name: string of entity name, e.g. 'University of California, Berkeley'

    Returns a tuple (type_data, loc_data):
      type_data: list of type names, leaving out types whose id starts with
                 '/base', '/user' or '/comm'; None if the lookup gave nothing
      loc_data:  dict with 'lat', 'lng' and 'source' (plus 'address' when it
                 came from Google Maps); None if no location was found
    '''
    type_key = '/type/object/type'
    geo_key = '/location/location/geolocation'
    # The filter and API key are sent as query parameters, not baked into the
    # URL string: query_url prints the URL when a request fails, and the key
    # must not end up in notebook output.
    topic_url = 'https://www.googleapis.com/freebase/v1/topic/m/%s' % mid

    # type information
    type_data = None
    type_info = query_url(topic_url, param_data={'filter': type_key, 'key': gkey})
    if type_info and type_key in type_info.get('property', {}):
        type_data = [d['text'] for d in type_info['property'][type_key]['values']
                     if d['id'][:5] not in ('/base', '/user', '/comm')]

    # location information
    loc_data = None
    loc_info = query_url(topic_url, param_data={'filter': geo_key, 'key': gkey})
    if loc_info and geo_key in loc_info.get('property', {}):
        for v in loc_info['property'][geo_key]['values']:
            props = v.get('property', {})
            if type_key not in props or props[type_key]['values'][0]['text'] != 'Geocode':
                continue
            try:
                loc_data = {'lat': props['/location/geocode/latitude']['values'][0]['value'],
                            'lng': props['/location/geocode/longitude']['values'][0]['value'],
                            'source': 'Freebase'}
            except (KeyError, IndexError):
                # geocode entry without usable coordinates: ignore it
                pass

    # backfilling with gmaps
    if not loc_data:
        geo_info = query_url('https://maps.googleapis.com/maps/api/geocode/json',
                             param_data={'address': name, 'key': gkey})
        results = geo_info.get('results') if geo_info else None
        if results:
            loc_data = results[0]['geometry']['location']
            loc_data['address'] = results[0]['formatted_address']
            loc_data['source'] = 'Google Maps'

    return (type_data, loc_data)
In [64]:
# Agency names rejected during the manual type/geocode review.
to_remove = list()
In [ ]:
# Second pass: manually review the Freebase types and geocode of every matched
# sponsor that has no 'types' entry yet and has not already been rejected.
for b in sponsors['agency'].value_counts().index:
    if b not in best_match or b in to_remove or 'types' in best_match[b]:
        continue

    # print the basics
    print('Sponsor name: %s' % b)
    print('Freebase name: %s' % best_match[b]['name'])
    print('Freebase id: %s' % best_match[b]['mid'][3:])
    if 'notable' in best_match[b]:
        print('Notable for: %s' % best_match[b]['notable']['name'])
    else:
        print('No notable listing')
    print('')

    # print top facilities
    cur_trials = sponsors[sponsors.agency == b]['nct_id']
    facs = facilities_nct.loc[cur_trials]['facility_name'].value_counts()[:3].index
    print('Top facilities: %s' % str(list(facs)))
    print('')

    # get type and location information
    type_data, loc_data = query_freebase(best_match[b]['mid'][3:], best_match[b]['name'])
    # type_data is None when the type lookup returned nothing; joining None
    # would raise TypeError and abort the whole review loop
    print('Freebase types: ' + ', '.join(type_data or []))
    print('')
    print('Location info: %s' % str(loc_data))

    # check if this is acceptable
    ok = raw_input('Enter to accept, "m" to see other possibilities, "d" to drop geocode, any other key to reject both\n')
    if ok.strip() == '':
        # accept types and (if any) geocode
        best_match[b]['types'] = type_data
        if loc_data:
            best_match[b]['geo'] = loc_data
    elif ok.lower()[0] == 'd':
        # accept types only
        best_match[b]['types'] = type_data
    elif ok.lower()[0] == 'm':
        # sponsor_poss is keyed by the utf-8 encoded name. Use the encoded key
        # for every lookup; the unencoded key fails for non-ASCII names.
        options = sponsor_poss[b.encode('utf-8')]
        for i, p in enumerate(options):
            if len(p['name']) > 0:
                print('%d. %s (%s) %s' % (i, p['name'], p['notable']['name'] if 'notable' in p else 'no notable', p['mid']))
        ok2 = raw_input('Any of these? ')
        # Only a non-numeric or out-of-range answer counts as a rejection.
        # Other errors (and Ctrl-C) now propagate instead of being swallowed
        # by a bare except and silently dropping the sponsor.
        try:
            new_s = int(ok2)
        except ValueError:
            new_s = -1
        if 0 <= new_s < len(options):
            type_data, loc_data = query_freebase(options[new_s]['mid'][3:], options[new_s]['name'])
            print('Freebase types: ' + ', '.join(type_data or []))
            print('')
            print('Location info: %s' % str(loc_data))
            ok3 = raw_input('Enter to accept, any other key to reject ')
            if ok3.strip() == '':
                best_match[b] = options[new_s]
                best_match[b]['types'] = type_data
                if loc_data:
                    best_match[b]['geo'] = loc_data
            else:
                to_remove.append(b)
        else:
            to_remove.append(b)
    else:
        to_remove.append(b)
    print('')
    print('')
In [177]:
# Number of matched sponsors whose record already carries a 'types' entry.
sum(1 for match in best_match.values() if 'types' in match)
Out[177]:
In [108]:
# Display the agency names collected in to_remove.
to_remove
Out[108]:
In [179]:
# Fold the rejected sponsors into not_found (de-duplicated) and drop them
# from the matches.
not_found = list(set(not_found + to_remove))
for r in to_remove:
    best_match.pop(r, None)

# Attach to each match the number of sponsor rows recorded for that agency.
for b in best_match:
    best_match[b]['trials'] = sponsor_count[b]

# Save the final results. The 'with' blocks close each file (flushing it)
# even if pickling raises.
with open('../data/facility_match_good.pkl', 'wb') as good_file:
    pickle.dump(best_match, good_file)
with open('../data/facility_match_bad.pkl', 'wb') as bad_file:
    pickle.dump(not_found, bad_file)
In [ ]: