In [1]:
import cPickle as pickle, codecs, requests, json, random, re, datetime
from string import punctuation
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
# SQLAlchemy setup
from sqlalchemy import create_engine
from sqlalchemy.sql import func, select
from connect import mysqlusername, mysqlpassword, mysqlserver, mysqldbname
from db_tables import metadata, InstitutionDescription, InstitutionLookup, InstitutionSponsors, InstitutionFacilities
Initialize procedures and variables
In [3]:
# procedure to lowercase a string, strip punctuation, collapse whitespace,
# and drop a trailing parenthesized token (e.g. an abbreviation or site number)
def standardize(s):
    tokens = s.split()
    if tokens[-1][0] == "(":
        s = ' '.join(tokens[:-1])
    return re.sub(' +',' ',''.join(c for c in s.replace('/',' ').replace('-',' ') if c not in punctuation).lower())
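A quick sanity check of standardize on a few hypothetical name strings; the expected outputs in the comments are worked out by hand from the function above.
# hypothetical inputs, shown only to illustrate the normalization
print standardize("GlaxoSmithKline (GSK)")                   # glaxosmithkline
print standardize("Univ. of Texas - Austin")                 # univ of texas austin
print standardize("St. Jude Children's Research Hospital")   # st jude childrens research hospital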
# list of meaningless facility/site names
bad_names = [r'investigat[a-z]+ site',
             r'research site',
             r'recruiting site',
             r'^private (practice|clinic)$',
             r'research facility',
             r'local (institut|investigat)',
             r'study site',
             r'clinical site',
             r'call cent[re]+',
             r'site ref',
             r'site[ :]+#?[0-9]+',
             r'^#?[0-9\.]+$',
             r'for additional information',
             r'call for information',
             r'the study is ',
             r'[0-9\-]{10,}',
             r'administrative office',
             r'^research (center|centre|fac|site|stie|unit|med)',
             r'^site ',
             r'^hospital$',
             r'^test facility',
             r'^practice$'
             ]
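These patterns are applied later, when filling in leftover facility names, to drop placeholder labels. An equivalent, more explicit helper might look like the sketch below; looks_meaningless is not defined in the original notebook, which instead keeps a name only when min([not re.search(b, s) for b in bad_names]) is True, i.e. when this helper would return False.
# hypothetical helper: True when a standardized facility name matches any of the
# placeholder patterns above
def looks_meaningless(s):
    return any(re.search(b, s) for b in bad_names)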
# list of valid institution names that appear inside other valid institution names
ok_dupes = ['helsinki university',
            "queen s university",
            "drexel university",
            "new york university",
            "uppsala university",
            'aids clinical trials group',
            'mcgill university',
            'university of connecticut',
            'cornell university',
            'institute of child health',
            'university of texas',
            'university college, london',
            'university of south carolina',
            'university of ottawa',
            'indiana university',
            'medical research council',
            'state university of new york',
            'university of california',
            'university hospital',
            'hospital']
Sponsors
In [ ]:
sponsors = defaultdict(list)
sponsor_ids = defaultdict(list)
sponsors_sp = defaultdict(set)
for row in codecs.open('../data/sponsors.txt','r','utf-8').readlines():
    sponsor_id, nct_id, sponsor_type, sponsor_name, funding_type = row.strip().split('|')
    # add trial to sponsor dictionary
    sponsors[sponsor_name].append(nct_id)
    # add sponsor id to sponsor id dictionary
    sponsor_ids[sponsor_name].append(int(sponsor_id))
    # add name to standardized lookup dictionary
    sponsors_sp[standardize(sponsor_name)].add(sponsor_name)
sponsors_s = {s: Counter(sponsors_sp[s]) for s in sponsors_sp}
del sponsors_sp
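For reference, each line of sponsors.txt is pipe-delimited in the field order unpacked above. A hypothetical row (all values here are made up, not taken from the real file) and what the loop derives from it:
# hypothetical example row, made-up values
example = u'12345|NCT00000000|lead_sponsor|Example University (EU)|Other'
sponsor_id, nct_id, sponsor_type, sponsor_name, funding_type = example.strip().split('|')
print sponsor_name                 # Example University (EU)
print standardize(sponsor_name)    # example university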
Sponsors matched to Freebase
In [4]:
# see institutions/Sponsor_* and institutions/Institution_* for the origin of this dictionary
sponsors_good = pickle.load(open('../data/facility_match_good.pkl','rb'))
sponsors_good_s = {standardize(s): s for s in sponsors_good.keys()}
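The matched-sponsor pickle is treated as a dict keyed by the original sponsor name. Judging from how it is read in the institution loop below, each value has roughly the shape sketched here; the field contents are illustrative only and the real values come from the Freebase matching notebooks.
# assumed shape of one sponsors_good entry, inferred from the lookups below
example_entry = {
    'geo': {'loc': 'Austin, Texas', 'lat': 30.28, 'lng': -97.73},
    'image': 'http://example.com/logo.jpg',
    'summary': 'A short Freebase description of the institution.',
}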
Facilities
In [ ]:
all_facs = {}
all_facs_trials = {}
all_facs_r = defaultdict(list)
fac_names = defaultdict(list)
for row in codecs.open('../data/facilities.txt','r','utf-8').readlines():
    fac_id, nct_id, status, fac_name, city, state, zipcode, country = row.strip().split("|")
    id_tup = (fac_name, city, state, zipcode, country)
    # facility id -> identifying tuple, and facility id -> trial
    all_facs[int(fac_id)] = id_tup
    all_facs_trials[int(fac_id)] = nct_id
    # identifying tuple -> all facility ids that share it
    all_facs_r[id_tup].append(int(fac_id))
    # standardized facility name -> facility ids, skipping empty names
    if len(fac_name) > 0 and len(standardize(fac_name)) > 0:
        fac_names[standardize(fac_name)].append(int(fac_id))
fac_names_s = {s: Counter(all_facs[f][0] for f in fac_names[s]) for s in fac_names}
In [ ]:
# get deduplicated facilities
# see institutions/Facility_* for the origin of this file
facs = pickle.load(open('../data/facility_clusters.pkl','rb'))
In [ ]:
# get the most frequent facility name for each deduplicated facility cluster
facs_mode = facs.groupby(['cluster','facility_name']).facility_id.count().reset_index()
facs_mode['len_fac'] = facs_mode.facility_name.apply(lambda x: len(x))
# sort by cluster, then descending count, then ascending name length, and keep the
# first row per cluster: the most common name, with ties going to the shortest
clus_name = facs_mode.sort(['cluster', 0, 'len_fac'], ascending=[1,0,1]).drop_duplicates(['cluster'])[['cluster','facility_name']].set_index('cluster')
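The same selection rule in plain Python, for clarity: take a cluster's most frequent name and break ties in favor of the shorter string. This is illustrative only, with made-up names; it is not part of the pipeline.
# illustrative only: pick a cluster's most common name, shortest name winning ties
names = ['MD Anderson Cancer Center',
         'MD Anderson Cancer Center',
         'The University of Texas MD Anderson Cancer Center']
counts = Counter(names)
best = sorted(counts.items(), key=lambda nc: (-nc[1], len(nc[0])))[0][0]
print best   # MD Anderson Cancer Center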
In [ ]:
# create lookup of cluster to facility id
clus_lookup = facs[['cluster','facility_id']].set_index('cluster')
clus_names = defaultdict(set)
for c in clus_name[clus_name.facility_name.apply(lambda x: standardize(x) in sponsors_good_s)].index:
    c_name = standardize(clus_name.loc[c].values[0])
    # fix University of Texas issue
    if c_name == 'university of texas at austin': c_name = 'university of texas'
    for f in clus_lookup.loc[c].values:
        clus_names[c_name].add(f[0])
Add the canonical institutions (i.e., those that have been matched to Freebase)
In [ ]:
# lookup of (good) sponsor names that appear, as whole words, inside other (good) sponsor names
dupes = {i: [j for j in sponsors_good_s if re.search(r'\b%s\b' % i, j) and i != j]
         for i in sponsors_good_s
         if i not in ok_dupes and
         len([j for j in sponsors_good_s if re.search(r'\b%s\b' % i, j) and i != j]) > 0}
dupes['university of texas'] = ['university of texas at austin']
# the longer names: these get folded into the shorter name rather than added on their own
dupes_r = {i for k, v in dupes.items() for i in v}
# the same containment lookup, restricted to names explicitly allowed as duplicates
ok_dupes_lookup = {i: [j for j in sponsors_good_s if re.search(r'\b%s\b' % i, j) and i != j]
                   for i in sponsors_good_s
                   if i in ok_dupes and
                   len([j for j in sponsors_good_s if re.search(r'\b%s\b' % i, j) and i != j]) > 0}
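As a concrete illustration of how these lookups drive the merge below, using the University of Texas pair that is pinned by hand above (any other contents depend on the Freebase matches):
# 'university of texas at austin' contains 'university of texas', so it is listed under
# the shorter name and recorded in dupes_r; the loop below therefore skips it as a
# standalone institution and merges its sponsors and facilities under 'university of texas'
print dupes['university of texas']                   # ['university of texas at austin']
print 'university of texas at austin' in dupes_r     # True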
In [ ]:
print datetime.datetime.now()
print
inst_dict = {}
fac_inst = defaultdict(set)
found_in_fac = defaultdict(set)
# add all sponsors that have been matched to Freebase metadata
for s in sponsors_good_s.keys():
    if s not in dupes_r:
        # actual (original-case) sponsor key
        sk = sponsors_good_s[s]
        # generating institution object
        inst_dict[s] = {'name': sk,
                        'cert': 'gold',
                        'loc': sponsors_good[sk]['geo']['loc'] or '',
                        'img': sponsors_good[sk]['image'] or '',
                        'desc': sponsors_good[sk]['summary'] or '',
                        'lat': sponsors_good[sk]['geo']['lat'] if 'lat' in sponsors_good[sk]['geo'] else 999.0,
                        'lng': sponsors_good[sk]['geo']['lng'] if 'lng' in sponsors_good[sk]['geo'] else 999.0,
                        'sponsors': set(sponsors_s[s].keys())}
        # attach any deduplicated facility clusters that share the standardized name
        if s in clus_names:
            inst_dict[s]['facs'] = clus_names[s]
            for f in clus_names[s]:
                fac_inst[f].add(s)
        else:
            inst_dict[s]['facs'] = set()
        # fold in the longer names that contain this one
        if s in dupes:
            for d in dupes[s]:
                inst_dict[s]['sponsors'] |= set(sponsors_s[d].keys())
                if d in clus_names:
                    inst_dict[s]['facs'] |= clus_names[d]
                    for f in clus_names[d]:
                        fac_inst[f].add(s)
        # fill in facilities that include the certified sponsor name
        # (a few over-matching names are skipped)
        if s not in ['3m','rand','university of texas']:
            r = re.compile(r'\b%s\b' % s)
            poss_fac = [f for f in fac_names if s in f and s != f]
            for f in poss_fac:
                if r.search(f) and \
                   (s not in ok_dupes or s not in ok_dupes_lookup or
                    not max([True if re.search(r'\b%s\b' % d, f) else False
                             for d in ok_dupes_lookup[s]])) and \
                   (s != 'abbott' or not max([True if d in f else False
                                              for d in ['northwest','minneap',' md']])):
                    found_in_fac[s].add(f)
                    for fid in fac_names[f]:
                        inst_dict[s]['facs'].add(fid)
                        fac_inst[fid].add(s)
print datetime.datetime.now()
print
In [ ]:
# sanity checks: facility ids attached to institutions (with overlap between institutions),
# distinct facility ids, facilities linked to at least one institution, and total institutions
print len([f for i in inst_dict.values() for f in i['facs']])
print len(set([f for i in inst_dict.values() for f in i['facs']]))
print len(fac_inst)
print len(inst_dict)
Add other sponsors who haven't been matched to Freebase
In [ ]:
# add other sponsors who haven't been matched to Freebase
print datetime.datetime.now()
print
multi_match = defaultdict(set)
s_in_i = defaultdict(set)
i_in_s = defaultdict(set)
inst_re = {i: re.compile(r'\b%s\b' % i) for i in inst_dict}
done_sponsors = {standardize(s) for i in inst_dict for s in inst_dict[i]['sponsors']}
for s in sponsors_s.keys():
    if s not in done_sponsors and len(s.replace(' ','')) > 1:
        # flag to add new institution from sponsor name
        add_anyway = True
        # check to see if any current institution name is a subset of this sponsor's name
        poss_inst_match = [i for i in inst_re if inst_re[i].search(s)]
        for cur_inst in poss_inst_match:
            if cur_inst not in ok_dupes:
                add_anyway = False
                i_in_s[cur_inst].add(s)
                inst_dict[cur_inst]['sponsors'] |= set(sponsors_s[s].keys())
        # check to see if any facilities are part of an existing institution, and if so, add them to that
        if s in fac_names and add_anyway:
            these_facs = set(fac_names[s])
            these_inst = list(set(i for f in these_facs if f in fac_inst for i in fac_inst[f]))
            for cur_inst in these_inst:
                add_anyway = False
                inst_in_spon = cur_inst in s
                inst_in_dupe = cur_inst in ok_dupes
                spon_in_inst = s in cur_inst
                spon_in_dupe = s in ok_dupes
                if (inst_in_spon and not inst_in_dupe) or (spon_in_inst and not spon_in_dupe):
                    if inst_in_spon:
                        i_in_s[cur_inst].add(s)
                    if spon_in_inst:
                        s_in_i[cur_inst].add(s)
                    inst_dict[cur_inst]['facs'] |= these_facs
                    inst_dict[cur_inst]['sponsors'] |= set(sponsors_s[s].keys())
                    for f in these_facs:
                        fac_inst[f].add(cur_inst)
                else:
                    add_anyway = True
                    multi_match[s].add(cur_inst)
        # otherwise just add the sponsor and any matching facilities
        if add_anyway:
            inst_dict[s] = {'name': sponsors_s[s].most_common(1)[0][0],
                            'cert': 'sponsor',
                            'loc': '',
                            'img': '',
                            'desc': '',
                            'lat': 999.0,
                            'lng': 999.0,
                            'sponsors': set(sponsors_s[s].keys())}
            if s in fac_names:
                inst_dict[s]['facs'] = set(fac_names[s])
                for f in fac_names[s]:
                    fac_inst[f].add(s)
            else:
                inst_dict[s]['facs'] = set()
print datetime.datetime.now()
print
In [ ]:
print len([f for i in inst_dict.values() for f in i['facs']])
print len(set([f for i in inst_dict.values() for f in i['facs']]))
print len(fac_inst)
print len(inst_dict)
In [ ]:
# # resetting to just the canonical institutions
# for i in inst_dict.keys():
#     if not inst_dict[i]['cert']: del inst_dict[i]
# inst_set = set(inst_dict.keys())
# for f in fac_inst.keys():
#     fac_inst[f] &= inst_set
Fill in facilities that don't have meaningless names
In [ ]:
# filling in facilities whose ids weren't picked up by the other matches
to_fill = defaultdict(set)
for s in fac_names:
    if s not in multi_match and s not in inst_dict:
        these_facs = set(fac_names[s])
        these_inst = list(set(i for f in these_facs if f in fac_inst for i in fac_inst[f]))
        if len(these_inst) >= 1:
            for cur_inst in these_inst:
                if (cur_inst in s and cur_inst not in ok_dupes) or (s in cur_inst and s not in ok_dupes):
                    inst_dict[cur_inst]['facs'] |= these_facs
                    for f in these_facs:
                        fac_inst[f].add(cur_inst)
                else:
                    to_fill[s] |= set(fac_names[s])
        # no institution matched: keep the name unless it looks like a meaningless site label
        elif min([not re.search(b,s) for b in bad_names]):
            to_fill[s] |= set(fac_names[s])
In [ ]:
# inspect the largest groups of still-unmatched facility names
sorted([(t, len(to_fill[t])) for t in to_fill], key=lambda x: x[1], reverse=True)[:10]
In [ ]:
# add the remaining facility names as facility-certified institutions
for f in to_fill:
    inst_dict[f] = {'name': fac_names_s[f].most_common(1)[0][0],
                    'cert': 'facility',
                    'loc': '',
                    'img': '',
                    'desc': '',
                    'lat': 999.0,
                    'lng': 999.0,
                    'sponsors': set(),
                    'facs': to_fill[f]}
In [ ]:
print len([f for i in inst_dict.values() for f in i['facs']])
print len(set([f for i in inst_dict.values() for f in i['facs']]))
print len(fac_inst)
print len(inst_dict)
In [ ]:
# connect to MySQL (overriding the imported server setting to use localhost) and create the tables
mysqlserver = 'localhost'
engine = create_engine('mysql://%s:%s@%s/%s' % (mysqlusername, mysqlpassword, mysqlserver, mysqldbname))
conn = engine.connect()
metadata.create_all(engine)
Prepare data
In [ ]:
# assign a numeric id to each institution key
inst_ids = {i: e for e, i in enumerate(inst_dict.keys())}
In [ ]:
# all trials linked to an institution through either its sponsors or its facilities
inst_trials = {i: set(f for s in inst_dict[i]['sponsors'] for f in sponsors[s]) |
                  set(all_facs_trials[f] for f in inst_dict[i]['facs'])
               for i in inst_dict}
In [ ]:
# identifying tuples for every facility attached to an institution
inst_facs = {i: set(all_facs[f] for f in inst_dict[i]['facs']) for i in inst_dict}
In [ ]:
# rows for the InstitutionFacilities table; trial_count counts distinct trials across
# all facility ids that share the same identifying tuple
k_insert = [{'institution_id': inst_ids[i],
             'facility_name': t[0],
             'city': t[1],
             'state': t[2],
             'zip': t[3],
             'country': t[4],
             'trial_count': len(set(all_facs_trials[n] for n in all_facs_r[t]))}
            for i in inst_facs.keys()
            for t in inst_facs[i]]
Insert data
In [ ]:
# insert institution descriptions in batches of 10,000
for k in range(0,len(inst_dict),10000):
    print k
    conn.execute(InstitutionDescription.insert(), [{'institution_id': inst_ids[i],
                                                    'name': inst_dict[i]['name'],
                                                    'source': inst_dict[i]['cert'].upper(),
                                                    'location': inst_dict[i]['loc'],
                                                    'image_url': inst_dict[i]['img'],
                                                    'description': inst_dict[i]['desc'],
                                                    'latitude': inst_dict[i]['lat'],
                                                    'longitude': inst_dict[i]['lng'],
                                                    'trial_count': len(inst_trials[i])}
                                                   for i in inst_dict.keys()[k:k+10000]])
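The inserts here and below are chunked rather than executed in a single call, presumably to keep each executemany well under MySQL's packet limits. The same pattern, factored out as a sketch (batched is a hypothetical helper, not part of the original notebook):
# hypothetical helper equivalent to the range(0, len(x), n) slicing used above
def batched(items, n):
    for k in range(0, len(items), n):
        yield items[k:k+n]

# e.g. the facility insert below could then be written as:
# for chunk in batched(k_insert, 20000):
#     conn.execute(InstitutionFacilities.insert(), chunk)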
In [ ]:
# insert institution-to-sponsor rows in batches of 10,000 institutions
for k in range(0,len(inst_dict),10000):
    print k
    conn.execute(InstitutionSponsors.insert(), [{'institution_id': inst_ids[i],
                                                 'sponsor_name': s,
                                                 'trial_count': len(sponsors[s])}
                                                for i in inst_dict.keys()[k:k+10000]
                                                for s in inst_dict[i]['sponsors']])
In [ ]:
# insert facility rows in batches of 20,000
for k in range(0,len(k_insert),20000):
    print k
    conn.execute(InstitutionFacilities.insert(), k_insert[k:k+20000])
In [ ]:
# insert institution-to-trial lookups in batches of 10,000 institutions
for k in range(0,len(inst_trials),10000):
    print k
    conn.execute(InstitutionLookup.insert(), [{'institution_id': inst_ids[i],
                                               'nct_id': t}
                                              for i in inst_dict.keys()[k:k+10000]
                                              for t in inst_trials[i]])
In [ ]:
conn.close()
In [ ]: