In [1]:
import cPickle as pickle, codecs, requests, json, random, re, datetime
from string import punctuation
from bs4 import BeautifulSoup
from collections import Counter, defaultdict


# SQLAlchemy setup
from sqlalchemy import create_engine
from sqlalchemy.sql import func, select
from connect import mysqlusername, mysqlpassword, mysqlserver, mysqldbname
from db_tables import metadata, InstitutionDescription, InstitutionLookup, InstitutionSponsors, InstitutionFacilities

Initialize procedures and variables


In [3]:
# procedure to lowercase and remove all punctuation from a string
def standardize(s):
    tokens = s.split()
    if tokens[-1][0] == "(":
        s = ' '.join(tokens[:-1])
    return re.sub('  +',' ',''.join(c for c in s.replace('/',' ').replace('-',' ') if c not in punctuation).lower())

# list of meaningless facility/site names
bad_names = [r'investigat[a-z]+ site',
             r'research site',
             r'recruiting site',
             r'^private (practice|clinic)$',
             r'research facility',
             r'local (institut|investigat)',
             r'study site',
             r'clinical site',
             r'call cent[re]+',
             r'site ref',
             r'site[ :]+#?[0-9]+',
             r'^#?[0-9\.]+$',
             r'for additional information',
             r'call for information',
             r'the study is ',
             r'[0-9\-]{10,}',
             r'administrative office',
             r'^research (center|centre|fac|site|stie|unit|med)',
             r'^site ',
             r'^hospital$',
             r'^test facility',
             r'^practice$'
             ]

# list of valid institution names that appear inside other valid institution names
ok_dupes = ['helsinki university',
            "queen s university", 
            "drexel university", 
            "new york university", 
            "uppsala university",
            'aids clinical trials group', 
            'mcgill university', 
            'university of connecticut', 
            'cornell university',
            'institute of child health', 
            'university of texas', 
            'university college, london', 
            'university of south carolina',
            'university of ottawa', 
            'indiana university',
            'medical research council',
            'institute of child health',
            'state university of new york',
            'university of california',
            'university hospital',
            'hospital']

Load institution data

Sponsors


In [ ]:
sponsors = defaultdict(list)
sponsor_ids = defaultdict(list)
sponsors_sp = defaultdict(set)
for row in codecs.open('../data/sponsors.txt','r','utf-8').readlines():
    sponsor_id, nct_id, sponsor_type, sponsor_name, funding_type = row.strip().split('|')
    
    # add trial to sponsor dictionary
    sponsors[sponsor_name].append(nct_id)
    
    # add sponsor id to sponsor_dictionary
    sponsor_ids[sponsor_name].append(int(sponsor_id))
    
    # add name to standardized lookup dictionary
    sponsors_sp[standardize(sponsor_name)].add(sponsor_name)

sponsors_s = {s: Counter(sponsors_sp[s]) for s in sponsors_sp}
del sponsors_sp

Sponsors matched to Freebase


In [4]:
# see institutions/Sponsor_* and institutions/Institution_* for the origin of this dictionary
sponsors_good = pickle.load(open('../data/facility_match_good.pkl','rb'))
sponsors_good_s = {standardize(s): s for s in sponsors_good.keys()}

Facilities


In [ ]:
all_facs = {}
all_facs_trials = {}
all_facs_r = defaultdict(list)
fac_names = defaultdict(list)
for row in codecs.open('../data/facilities.txt','r','utf-8').readlines():
    
    fac_id, nct_id, status, fac_name, city, state, zipcode, country = row.strip().split("|")
    id_tup = (fac_name, city, state, zipcode, country)
    
    all_facs[int(fac_id)] = id_tup
    all_facs_trials[int(fac_id)] = nct_id
    all_facs_r[id_tup].append(int(fac_id))
    if len(fac_name) > 0 and len(standardize(fac_name)) > 0:
        fac_names[standardize(fac_name)].append(int(fac_id))

fac_names_s = {s: Counter(all_facs[f][0] for f in fac_names[s]) for s in fac_names}

In [ ]:
# get deduplicated facilities
# see institutions/Facility_* for the origin of this file
facs = pickle.load(open('../data/facility_clusters.pkl','rb'))

In [ ]:
# get most frequent facility name for deduped facility clusters
facs_mode = facs.groupby(['cluster','facility_name']).facility_id.count().reset_index()
facs_mode['len_fac'] = facs_mode.facility_name.apply(lambda x: len(x))
clus_name = facs_mode.sort(['cluster', 0, 'len_fac'], ascending=[1,0,1]).drop_duplicates(['cluster'])[['cluster','facility_name']].set_index('cluster')

In [ ]:
# create lookup of cluster to facility id
clus_lookup = facs[['cluster','facility_id']].set_index('cluster')
clus_names = defaultdict(set)
for c in clus_name[clus_name.facility_name.apply(lambda x: standardize(x) in sponsors_good_s)].index:
    c_name = standardize(clus_name.loc[c].values[0])
    # fix University of Texas issue
    if c_name == 'university of texas at austin': c_name = 'university of texas'
    for f in clus_lookup.loc[c].values:
        clus_names[c_name].add(f[0])

Summarize institution data

Add the canonical institutions (ie those that have been matched to Freebase)


In [ ]:
# get lookup of (good) sponsor names that appear in other (good) sponsor names
dupes = {i: [j for j in sponsors_good_s if re.search(r'\b%s\b' % i, j) and i != j]
         for i in sponsors_good_s
         if i not in ok_dupes and
         len([j for j in sponsors_good_s if re.search(r'\b%s\b' % i, j) and i != j]) > 0}
dupes['university of texas'] = ['university of texas at austin']
dupes_r = {i for k, v in dupes.items() for i in v}

ok_dupes_lookup = {i: [j for j in sponsors_good_s if re.search(r'\b%s\b' % i, j) and i != j]
                   for i in sponsors_good_s
                   if i in ok_dupes and
                   len([j for j in sponsors_good_s if re.search(r'\b%s\b' % i, j) and i != j]) > 0}

In [ ]:
print datetime.datetime.now()
print

inst_dict = {}
fac_inst = defaultdict(set)
found_in_fac = defaultdict(set)

# add all sponsors that have been matched to Freebase metadata
for s in sponsors_good_s.keys():
    
    if s not in dupes_r:

        # actual key
        sk = sponsors_good_s[s]

        # generating institution object
        inst_dict[s] = {'name': sk,
                        'cert': 'gold',
                        'loc': sponsors_good[sk]['geo']['loc'] or '',
                        'img': sponsors_good[sk]['image'] or '',
                        'desc': sponsors_good[sk]['summary'] or '',
                        'lat': sponsors_good[sk]['geo']['lat'] if 'lat' in sponsors_good[sk]['geo'] else 999.0,
                        'lng': sponsors_good[sk]['geo']['lng'] if 'lng' in sponsors_good[sk]['geo'] else 999.0,
                        'sponsors': set(sponsors_s[s].keys())}

        if s in clus_names: 
            inst_dict[s]['facs'] = clus_names[s]
            for f in clus_names[s]:
                fac_inst[f].add(s)
        else:
            inst_dict[s]['facs'] = set()
        
        if s in dupes:
            for d in dupes[s]:
                inst_dict[s]['sponsors'] |= set(sponsors_s[d].keys())
                if d in clus_names:
                    inst_dict[s]['facs'] |= clus_names[d]
                for f in clus_names[d]:
                    fac_inst[f].add(s)

        # fill in facilities that include the certified sponsor name
        if s not in ['3m','rand','university of texas']:
            r = re.compile(r'\b%s\b' % s)
            poss_fac = [f for f in fac_names if s in f and s != f]
            for f in poss_fac:
                if r.search(f) and \
                   (s not in ok_dupes or s not in ok_dupes_lookup \
                                      or not max([True if re.search(r'\b%s\b' % d, f) else False 
                                                  for d in ok_dupes_lookup[s]])) and \
                   (s != 'abbott' or not max([True if d in f else False 
                                              for d in ['northwest','minneap',' md']])):
                    found_in_fac[s].add(f)
                    for fid in fac_names[f]:
                        inst_dict[s]['facs'].add(fid)
                        fac_inst[fid].add(s)

print datetime.datetime.now()
print

In [ ]:
print len([f for i in inst_dict.values() for f in i['facs']])
print len(set([f for i in inst_dict.values() for f in i['facs']]))
print len(fac_inst)
print len(inst_dict)

Add other sponsors who haven't been match to Freebase


In [ ]:
# add other sponsors who haven't been matched to Freebase
print datetime.datetime.now()
print

multi_match = defaultdict(set)
s_in_i = defaultdict(set)
i_in_s = defaultdict(set)

inst_re = {i: re.compile(r'\b%s\b' % i) for i in inst_dict}

done_sponsors = {standardize(s) for i in inst_dict for s in inst_dict[i]['sponsors']}

for s in sponsors_s.keys():
    
    if s not in done_sponsors and len(s.replace(' ','')) > 1:
        
        # flag to add new institution from sponsor name
        add_anyway = True
        
        # check to see if any current institution name is a subset of this institution
        poss_inst_match = [i for i in inst_re if inst_re[i].search(s)]
        for cur_inst in poss_inst_match:
            if cur_inst not in ok_dupes:
                add_anyway = False
                i_in_s[cur_inst].add(s)
                inst_dict[cur_inst]['sponsors'] |= set(sponsors_s[s].keys())
        
        # check to see in any facilities are part of an existing institution, and if so, add them to that
        if s in fac_names and add_anyway:
            
            these_facs = set(fac_names[s])
            these_inst = list(set(i for f in these_facs if f in fac_inst for i in fac_inst[f]))
            for cur_inst in these_inst:
                
                add_anyway = False
                inst_in_spon = cur_inst in s
                inst_in_dupe = cur_inst in ok_dupes
                spon_in_inst = s in cur_inst
                spon_in_dupe = s in ok_dupes
                
                if (inst_in_spon and not inst_in_dupe) or (spon_in_inst and not spon_in_dupe):
                    if inst_in_spon:
                        i_in_s[cur_inst].add(s)
                    if spon_in_inst:
                        s_in_i[cur_inst].add(s)
                    inst_dict[cur_inst]['facs'] |= these_facs
                    inst_dict[cur_inst]['sponsors'] |= set(sponsors_s[s].keys())
                    for f in these_facs:
                        fac_inst[f].add(cur_inst)

                else:
                    add_anyway = True
                    multi_match[s].add(cur_inst)
        
        # otherwise just add the sponsor and any matching facilities
        if add_anyway:
            
            inst_dict[s] = {'name': sponsors_s[s].most_common(1)[0][0],
                            'cert': 'sponsor',
                            'loc': '',
                            'img': '',
                            'desc': '',
                            'lat': 999.0,
                            'lng': 999.0,
                            'sponsors': set(sponsors_s[s].keys())}

            if s in fac_names:
                inst_dict[s]['facs'] = set(fac_names[s])
                for f in fac_names[s]:
                    fac_inst[f].add(s)
            else:
                inst_dict[s]['facs'] = set()

print datetime.datetime.now()
print

In [ ]:
print len([f for i in inst_dict.values() for f in i['facs']])
print len(set([f for i in inst_dict.values() for f in i['facs']]))
print len(fac_inst)
print len(inst_dict)

In [ ]:
# # resetting to just the canonical institutions
# for i in inst_dict.keys():
#     if not inst_dict[i]['cert']: del inst_dict[i]

# inst_set = set(inst_dict.keys())
# for f in fac_inst.keys():
#     fac_inst[f] &= inst_set

Fill in facilities that don't have meaningless names


In [ ]:
# filling in facilities where some id's weren't picked up by the other matches
to_fill = defaultdict(set)
for s in fac_names:
    if s not in multi_match and s not in inst_dict:
        these_facs = set(fac_names[s])
        these_inst = list(set(i for f in these_facs if f in fac_inst for i in fac_inst[f]))
        if len(these_inst) >= 1:
            for cur_inst in these_inst:
                if (cur_inst in s and cur_inst not in ok_dupes) or (s in cur_inst and s not in ok_dupes):
                    inst_dict[cur_inst]['facs'] |= these_facs
                    for f in these_facs:
                        fac_inst[f].add(cur_inst)
                else:
                    to_fill[s] |= set(fac_names[s])
        elif min([not re.search(b,s) for b in bad_names]):
            to_fill[s] |= set(fac_names[s])

In [ ]:
sorted([(t, len(to_fill[t])) for t in to_fill], key=lambda x: x[1], reverse=True)[:10]

In [ ]:
for f in to_fill:
    inst_dict[f] = {'name': fac_names_s[f].most_common(1)[0][0],
                    'cert': 'facility',
                    'loc': '',
                    'img': '',
                    'desc': '',
                    'lat': 999.0,
                    'lng': 999.0,
                    'sponsors': set(),
                    'facs': to_fill[f]}

In [ ]:
print len([f for i in inst_dict.values() for f in i['facs']])
print len(set([f for i in inst_dict.values() for f in i['facs']]))
print len(fac_inst)
print len(inst_dict)

Set up MySQL


In [ ]:
mysqlserver = 'localhost'
engine = create_engine('mysql://%s:%s@%s/%s' % (mysqlusername, mysqlpassword, mysqlserver, mysqldbname))
conn = engine.connect()
metadata.create_all(engine)

Prepare data


In [ ]:
inst_ids = {i: e for e, i in enumerate(inst_dict.keys())}

In [ ]:
inst_trials = {i: set(f for s in inst_dict[i]['sponsors'] for f in sponsors[s]) | \
                  set(all_facs_trials[f] for f in inst_dict[i]['facs'])
               for i in inst_dict}

In [ ]:
inst_facs = {i: set(all_facs[f] for f in inst_dict[i]['facs']) for i in inst_dict}

In [ ]:
k_insert = [{'institution_id': inst_ids[i],
               'facility_name': t[0],
               'city': t[1],
               'state': t[2],
               'zip': t[3],
               'country': t[4],
               'trial_count': len(set(all_facs_trials[n] for n in all_facs_r[t]))}
              for i in inst_facs.keys()
              for t in inst_facs[i]]

Insert data


In [ ]:
for k in range(0,len(inst_dict),10000):
    print k
    conn.execute(InstitutionDescription.insert(), [{'institution_id': inst_ids[i],
                                                    'name': inst_dict[i]['name'],
                                                    'source': inst_dict[i]['cert'].upper(),
                                                    'location': inst_dict[i]['loc'],
                                                    'image_url': inst_dict[i]['img'],
                                                    'description': inst_dict[i]['desc'],
                                                    'latitude': inst_dict[i]['lat'],
                                                    'longitude': inst_dict[i]['lng'],
                                                    'trial_count': len(inst_trials[i])}
                                                   for i in inst_dict.keys()[k:k+10000]])

In [ ]:
for k in range(0,len(inst_dict),10000):
    print k
    conn.execute(InstitutionSponsors.insert(), [{'institution_id': inst_ids[i],
                                                 'sponsor_name': s,
                                                 'trial_count': len(sponsors[s])}
                                                for i in inst_dict.keys()[k:k+10000]
                                                for s in inst_dict[i]['sponsors']])

In [ ]:
for k in range(0,len(k_insert),20000):
    print k
    conn.execute(InstitutionFacilities.insert(), k_insert[k:k+20000])

In [ ]:
for k in range(0,len(inst_trials),10000):
    print k
    conn.execute(InstitutionLookup.insert(), [{'institution_id': inst_ids[i],
                                               'nct_id': t}
                                              for i in inst_dict.keys()[k:k+10000]
                                              for t in inst_trials[i]])

In [ ]:
conn.close()

In [ ]: