In [ ]:
import pymongo, cPickle as pickle, codecs, requests, json, random
from bs4 import BeautifulSoup
from collections import Counter
from scipy import stats
from connect import mongoip, gkey

Read in the raw data files (conditions, interventions, sponsors, trials, facilities, investigators)

Condition data


In [11]:
# ClinicalTrials.gov condition terms that don't match current MeSH headings,
# mapped to their canonical MeSH descriptor names.
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
               'Beta-Thalassemia': 'beta-Thalassemia',
               'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
               'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
               'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
               # bug fix: 'Felty''s Syndrome' was implicit string concatenation
               # ("Feltys Syndrome", no apostrophe) and could never match
               "Felty's Syndrome": 'Felty Syndrome',
               'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
               'Retrognathism': 'Retrognathia',
               'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
               'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
               'Von Willebrand Diseases': 'von Willebrand Diseases',
               'Pontine Glioma': 'Brain Stem Neoplasms',
               'Mental Retardation': 'Intellectual Disability',
               'Overdose': 'Drug Overdose',
               'Beta-Mannosidosis': 'beta-Mannosidosis',
               'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
               'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
               'Alpha-Thalassemia': 'alpha-Thalassemia',
               'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
               'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
               'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
               'Alpha-Mannosidosis': 'alpha-Mannosidosis',
               'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
               }

# cond:        trial id -> list of MeSH condition terms
# cond_trials: MeSH condition term -> list of trial ids (reverse mapping)
cond = {}
cond_trials = {}
with codecs.open('../data/condition_browse.txt', 'r', 'utf-8') as fh:
    for row in fh:
        row_id, trial_id, mesh_term = row.strip().split('|')

        # normalize the term to its canonical MeSH heading if necessary
        mesh_term = corrections.get(mesh_term, mesh_term)

        # add condition to trial dictionary
        cond.setdefault(trial_id, []).append(mesh_term)

        # add trial to condition dictionary
        cond_trials.setdefault(mesh_term, []).append(trial_id)

Intervention data


In [12]:
# inv: trial id -> list of MeSH intervention terms
inv = {}
with codecs.open('../data/intervention_browse.txt', 'r', 'utf-8') as fh:
    for row in fh:
        row_id, trial_id, mesh_term = row.strip().split('|')

        # add intervention to trial dictionary
        inv.setdefault(trial_id, []).append(mesh_term)

MedlinePlus health topics data (also used as a thesaurus of condition synonyms)


In [13]:
soup = BeautifulSoup(codecs.open('../data/mplus_topics_2014-11-04.xml','r','utf-8').read())

In [14]:
# synonyms for MeSH terms (and reverse), and topic descriptions
# mesh_syn:   MeSH term -> set of MedlinePlus names (topic title + "also-called")
# topic_desc: MedlinePlus topic title -> plain-text summary
mesh_syn = {}
topic_desc = {}
for t in soup.find_all("health-topic", language="English"):
    # topic summary, with layout whitespace stripped
    topic_desc[t.attrs["title"]] = t.find("full-summary").text.replace('\n', '').replace('\t', '')

    # record synonyms only for MeSH headings we actually have trials for
    cur_mesh = t.find("mesh-heading").descriptor.text
    if cur_mesh in cond_trials:
        if cur_mesh not in mesh_syn:
            mesh_syn[cur_mesh] = set()
        mesh_syn[cur_mesh] |= set([t.attrs["title"]] + [a.text for a in t.find_all("also-called")])

# cleanup: drop synonyms that only differ from the MeSH term by case, and
# single-character junk entries; remove terms left with no synonyms at all.
# Iterate over a list() snapshot so deleting keys is safe while iterating
# (required on Python 3, harmless on Python 2).
for m in list(mesh_syn.keys()):
    kept = set(s for s in mesh_syn[m]
               if m.lower() != s.lower() and len(s) != 1)
    if kept:
        mesh_syn[m] = kept
    else:
        del mesh_syn[m]

In [15]:
# collect every MedlinePlus synonym seen across all MeSH terms
common_terms = set()
for syns in mesh_syn.values():
    common_terms.update(syns)

# reverse lookup: common term -> list of MeSH terms that list it as a synonym
mesh_syn_r = {c: [m for m in mesh_syn if c in mesh_syn[m]] for c in common_terms}

In [16]:
# expand synonym dictionary to lookup for all terms, not just MeSH terms
# all_syn maps any name (MeSH heading or MedlinePlus synonym) to the set of
# its alternative names.
# NOTE(review): mesh_syn.copy() is a *shallow* copy, so the synonym sets are
# shared between mesh_syn and all_syn; when a synonym t is itself a MeSH key,
# the |= below mutates the shared set inside mesh_syn too. Later cells only
# read these dicts, but confirm before reordering or "fixing" this.
all_syn = mesh_syn.copy()
for m in mesh_syn.keys():
    for t in mesh_syn[m]:
        if t not in all_syn: all_syn[t] = set()
        # link t to the MeSH term and all of m's other synonyms, minus itself
        all_syn[t] |= (set([m]) | mesh_syn[m]) - set([t])

Sponsors


In [17]:
# sponsors:        sponsor name -> {'id': str, 'trials': [trial id, ...]}
# sponsor_ids:     sponsor id (str) -> sponsor name (reverse lookup)
# sponsors_trials: trial id -> {'pri': primary sponsor name,
#                               'coll': [collaborator names]}
sponsors = {}
sponsor_ids = {}
sponsors_trials = {}
id_cnt = 101  # sponsor ids are assigned sequentially starting at 101
with codecs.open('../data/sponsors.txt', 'r', 'utf-8') as fh:
    for row in fh:
        # note: no strip() here — funding_type (unused) keeps its trailing newline
        row_id, nct_id, sponsor_type, sponsor_name, funding_type = row.split('|')

        # register the sponsor on first sighting and assign the next id
        if sponsor_name not in sponsors:
            sponsors[sponsor_name] = {'id': str(id_cnt),
                                      'trials': []
                                      }
            sponsor_ids[str(id_cnt)] = sponsor_name
            id_cnt += 1
        sponsors[sponsor_name]['trials'].append(nct_id)

        # map the trial back to its sponsors; any non-Collaborator row becomes
        # the single primary sponsor (last one seen wins)
        trial_entry = sponsors_trials.setdefault(nct_id, {})
        if sponsor_type == 'Collaborator':
            trial_entry.setdefault('coll', []).append(sponsor_name)
        else:
            trial_entry['pri'] = sponsor_name

In [19]:
sponsors_good = pickle.load(open('../data/facility_match_good.pkl','rb'))

In [20]:
# sponsors_cond: MeSH condition -> Counter(sponsor id -> number of trials),
# counting only trials whose primary sponsor passed the manual vetting.
sponsors_cond = {}
for c in cond_trials:
    # membership test against the dict itself, not .keys(): on Python 2,
    # `in sponsors_good.keys()` builds a list and makes every lookup O(n),
    # turning this loop accidentally quadratic
    sponsors_cond[c] = Counter(sponsors[sponsors_trials[t]['pri']]['id']
                               for t in cond_trials[c]
                               if sponsors_trials[t]['pri'] in sponsors_good)

Trial data (status, title, sponsor, conditions, interventions)


In [21]:
# active_trials: trial id -> summary dict for trials that are still open.
# clinical_study.txt columns used: 0 = nct_id, 4 = title, 11 = overall status.
active_trials = {}
with codecs.open('../data/clinical_study.txt', 'r', 'utf-8') as fh:
    for row in fh:
        data = row.split('|')
        if data[11] in ('Enrolling by invitation', 'Active, not recruiting', 'Recruiting'):
            nct_id = data[0]
            # .get() returns None when the key is absent, matching the old
            # `x[k] if k in x else None` conditionals
            active_trials[nct_id] = {'status': data[11],
                                     'title': data[4],
                                     'sponsor_pri': sponsors_trials[nct_id]['pri'],
                                     'sponsor_coll': sponsors_trials[nct_id].get('coll'),
                                     'conds': cond.get(nct_id),
                                     'invs': inv.get(nct_id)}

Facilities


In [22]:
# all_facs:   facility id -> trial id it belongs to
# trial_facs: trial id -> list of facility ids
# NOTE(review): column semantics inferred from downstream use (clus_trials
# feeds trial-id sets) — confirm against facilities.txt.
all_facs = {}
trial_facs = {}
with codecs.open('../data/facilities.txt', 'r', 'utf-8') as fh:
    for row in fh:
        data = row.split("|")
        all_facs[data[0]] = data[1]
        trial_facs.setdefault(data[1], []).append(data[0])

In [23]:
# get deduplicated facilities
# DataFrame of facility-name clusters (columns incl. 'cluster', 'facility_id',
# 'facility_name' — see usage below); trusted local pickle only.
facs = pickle.load(open('../data/facility_clusters.pkl','rb'))

In [24]:
# get most frequent facility name for deduped facility clusters
# count how often each (cluster, facility_name) pairing occurs
facs_mode = facs.groupby(['cluster','facility_name']).facility_id.count().reset_index()
facs_mode['len_fac'] = facs_mode.facility_name.apply(lambda x: len(x))
# pick one representative name per cluster: highest count first, shortest
# name as the tiebreaker.
# NOTE(review): DataFrame.sort() is the pre-0.17 pandas API (now sort_values);
# the sort key `0` implies the count column came back unnamed under this
# pandas version — confirm against the installed version before porting.
clus_name = facs_mode.sort(['cluster', 0, 'len_fac'], ascending=[1,0,1]).drop_duplicates(['cluster'])[['cluster','facility_name']].set_index('cluster')

# create lookup of cluster to trial id
clus_lookup = facs[['cluster','facility_id']].set_index('cluster')
clus_trials = {}
# only keep clusters whose chosen name matches a vetted sponsor/facility
for c in clus_name[clus_name.facility_name.apply(lambda x: x in sponsors_good)].index:
    c_name = clus_name.loc[c].values[0]
    if c_name not in clus_trials: clus_trials[c_name] = []
    # each facility id in the cluster maps (via all_facs) to a trial id
    for f in clus_lookup.loc[c].values:
        clus_trials[c_name].append(all_facs[str(f[0])])

Investigator and publication data


In [25]:
# all_invest_trials: trial id (column 2) -> list of investigator row ids
# (column 0). NOTE(review): mapping inferred from the institutions cell,
# which indexes this by trial id and feeds the values to investigators_ids.
all_invest_trials = {}
with codecs.open('../data/investigators.txt', 'r', 'utf-8') as fh:
    for row in fh:
        data = row.split("|")
        all_invest_trials.setdefault(data[2], []).append(data[0])

In [26]:
# investigators_ids:    investigator row id (int) -> investigator name
# investigators_trials: investigator -> {trial id: [publication ids, ...]}
# publications:         publication id -> metadata dict ('title', 'year',
#                       'authors', 'other_ids')
# NOTE(review): structures inferred from how these are used below — confirm
# against the notebook that produced the pickles. Trusted local files only.
investigators_ids = pickle.load(open('../data/id_investigator_lookup.pkl','rb'))
investigators_trials = pickle.load(open('../data/trial_invest_pub_match_dict.pkl','rb'))
publications = pickle.load(open('../data/pub_lookup_dict.pkl','rb'))

In [27]:
# create trials to investigator list dictionary
# trial_pubs: trial id -> publications pooled across all of its investigators
trial_pubs = {}
for invest_trials in investigators_trials.values():
    for trial_id, pubs in invest_trials.items():
        trial_pubs.setdefault(trial_id, [])
        trial_pubs[trial_id] += pubs

Summarize condition data


In [28]:
# getting counts of trials for each condition (and synonym)
# cond_cnt: term (MeSH heading or synonym) -> total number of trials
cond_cnt = {}

def add_to_dict(word, subtotal):
    """Accumulate `subtotal` into cond_cnt under `word`."""
    cond_cnt[word] = cond_cnt.get(word, 0) + subtotal

for c, trials in cond_trials.items():
    n_trials = len(trials)
    add_to_dict(c, n_trials)
    # every synonym inherits the MeSH term's trial count
    for syn in mesh_syn.get(c, ()):
        add_to_dict(syn, n_trials)

In [29]:
cond_ids = {c: i+100 for i, c in enumerate(cond_cnt.keys())}

In [20]:
# build the final condition documents for MongoDB
cond_final = []
for c in cond_cnt.keys():
    # flag for is a MeSH term, also to be used later
    is_mesh = c in cond_trials
    if not is_mesh:
        # a synonym resolves to (arbitrarily) the first MeSH term listing it
        cur_mesh = mesh_syn_r[c][0]
    else:
        cur_mesh = c

    cond_dict = {'cond_name': c,
                 'cond_id': cond_ids[c],
                 'num_trials': cond_cnt[c],
                 'cond_name_mesh': cur_mesh,
                 'cond_summary': None,
                 'cond_synonyms': None,
                 'cond_trials_active': [],
                 'cond_inst_top': []
                 }

    # filling in summary if it exists for this term or any synonym
    # (when several synonyms have summaries, the last one examined wins)
    if c in topic_desc:
        cond_dict['cond_summary'] = topic_desc[c]
    elif c in all_syn:
        for s in all_syn[c]:
            if s in topic_desc:
                cond_dict['cond_summary'] = topic_desc[s]

    # filling in synonyms if they exist
    if c in all_syn:
        cond_dict['cond_synonyms'] = list(all_syn[c])

    # filling in active trials
    for t in cond_trials[cur_mesh]:
        if t in active_trials:
            # guard against a None conds list, mirroring the institutions
            # cell (trials reached via cond_trials should always have conds,
            # but a malformed record shouldn't crash the build)
            trial_conds = active_trials[t]['conds']
            trial_dict = {'trial_id': t,
                          'trial_title': active_trials[t]['title'],
                          'trial_sponsor': active_trials[t]['sponsor_pri'],
                          'trial_cond': [{'cond_name': tcn, 'cond_id': cond_ids[tcn]}
                                         for tcn in trial_conds] if trial_conds else None,
                          'trial_iv': active_trials[t]['invs']
                          }
            cond_dict['cond_trials_active'].append(trial_dict)

    # filling in top institutions: five vetted sponsors with the most trials
    for sid, cnt in sorted(sponsors_cond[cur_mesh].items(), key=lambda x: x[1], reverse=True)[:5]:
        sp = sponsors_good[sponsor_ids[sid]]  # look up vetted metadata once
        inst_dict = {'inst_name': sp['name'],
                     'inst_id': sid,
                     'inst_img': sp['image'],
                     'inst_loc': sp['geo']['loc']
                     }
        cond_dict['cond_inst_top'].append(inst_dict)

    cond_final.append(cond_dict)

Summarize institution data


In [30]:
# getting counts of trials for each sponsor (and facility)
# inst_trials: vetted institution name -> set of all its trial ids, merging
# trials it sponsored with trials run at its deduplicated facilities
inst_trials = {}
for s in sponsors_good.keys():
    inst_trials[s] = set(sponsors[s]['trials'])

for s in clus_trials:
    inst_trials[s] |= set(clus_trials[s])

In [31]:
# build the final institution documents for MongoDB
inst_final = []

for s in inst_trials.keys():
    inst_dict = {'inst_name': s,
                 # sponsors[] already stores the id assigned in the sponsors
                 # cell; avoids an O(n) reverse scan of sponsor_ids per
                 # institution (accidentally quadratic overall)
                 'inst_id': sponsors[s]['id'],
                 'num_trials': len(inst_trials[s]),
                 'inst_loc': sponsors_good[s]['geo']['loc'],
                 'inst_img': sponsors_good[s]['image'],
                 'inst_summary': sponsors_good[s]['summary'],
                 'inst_trials_active': [],
                 'inst_researchers': [],
                 # placeholder rating, randomly 3-5
                 'inst_rating': random.choice([3,4,5])
                 }

    # filling in active trials
    for t in inst_trials[s]:
        if t in active_trials:
            trial_conds = active_trials[t]['conds']
            trial_dict = {'trial_id': t,
                          'trial_title': active_trials[t]['title'],
                          'trial_sponsor': active_trials[t]['sponsor_pri'],
                          'trial_cond': [{'cond_name': tcn, 'cond_id': cond_ids[tcn]}
                                         for tcn in trial_conds] if trial_conds else None,
                          'trial_iv': active_trials[t]['invs']
                          }
            inst_dict['inst_trials_active'].append(trial_dict)

    # filling in researchers: per-investigator trial counts, keeping only
    # investigators attached to more than one of this institution's trials
    investigator_list = Counter([investigators_ids[int(p)]
                                 for t in inst_trials[s]
                                 if t in all_invest_trials
                                 for p in all_invest_trials[t]
                                 if int(p) in investigators_ids])
    for invname, ntrials in sorted(investigator_list.items(), key=lambda x: x[1], reverse=True):
        if ntrials > 1:
            inst_dict['inst_researchers'].append({'researcher_name': invname,
                                                  'researcher_count': ntrials})

    # filling in publication data (year >= 2010, newest first)
    unique_pubs = set([pub for t in inst_trials[s] if t in trial_pubs for pub in trial_pubs[t]])
    pub_temp = []
    for p in unique_pubs:
        # build the other-ids lookup once per publication instead of twice
        other_ids = dict(publications[p]['other_ids'])
        pub_temp.append((int(publications[p]['year']),
                         {'pubmed_id': p,
                          'pub_title': publications[p]['title'],
                          'pub_authors': [a[0] for a in publications[p]['authors']],
                          'pub_doi': other_ids.get('doi')}))
    inst_dict['inst_pubs'] = [p for y, p in sorted(pub_temp, key=lambda x: x[0], reverse=True) if y >= 2010]

    # filling in conditions data: top 20 conditions by trial count
    inst_cond_top = Counter([m for t in inst_trials[s] if t in cond for m in cond[t]])
    inst_dict['inst_cond_top'] = [{'cond_name': c,
                                   'cond_id': cond_ids[c],
                                   'trial_count': cnt}
                                  for c, cnt in sorted(inst_cond_top.items(), key=lambda x: x[1], reverse=True)[:20]]

    inst_final.append(inst_dict)

Connect to MongoDB instance and write objects


In [37]:
# connect to the MongoDB instance (address kept out of the notebook via
# connect.py) and select the clinical-trials database
c = pymongo.MongoClient(host=mongoip)
db = c.ctdb

In [38]:
# rebuild the conditions collection from scratch: drop, then bulk-insert.
# NOTE(review): Collection.insert() is the legacy pymongo API (insert_many
# in pymongo >= 3); here it returns the list of inserted ObjectIds.
db.conditions.drop()
cond_coll = db.conditions
oids = cond_coll.insert(cond_final)

In [44]:
pickle.dump(dict(zip(oids,cond_final)),open('../data/condition_json.pkl','wb'))

In [39]:
# rebuild the institutions collection from scratch: drop, then bulk-insert
# (legacy pymongo Collection.insert(), as in the conditions cell)
db.institutions.drop()
inst_coll = db.institutions
oids = inst_coll.insert(inst_final)

In [33]:
pickle.dump(inst_final,open('../data/institution_json.pkl','wb'))

In [2]:
inst_final = pickle.load(open('../data/institution_json.pkl','rb'))

Write new typeahead and locations json objects


In [40]:
# typeahead entries: all conditions first, then all institutions, each group
# ordered by descending trial count so popular items rank higher
cond_entries = [{'label': d['cond_name'],
                 'category': 'Condition',
                 'cond_id': str(d['cond_id'])}
                for d in sorted(cond_final, key=lambda x: x['num_trials'], reverse=True)]
inst_entries = [{'label': d['inst_name'],
                 'category': 'Institution',
                 'inst_id': str(d['inst_id'])}
                for d in sorted(inst_final, key=lambda x: x['num_trials'], reverse=True)]
typeahead = cond_entries + inst_entries

In [44]:
# json.dump writes text, so open in text mode: 'wb' happens to work on
# Python 2 but raises TypeError on Python 3
with open('../data/typeahead.json', 'w') as fp:
    json.dump(typeahead, fp)

In [55]:
# locations: [lat, lng, display name, sponsor id] for every vetted
# institution that has geocoding data
locations = []
for b in sponsors_good.keys():
    if 'geo' in sponsors_good[b] and 'lat' in sponsors_good[b]['geo']:
        locations.append([sponsors_good[b]['geo']['lat'],
                          sponsors_good[b]['geo']['lng'],
                          sponsors_good[b]['name'],
                          # sponsors[] stores the id directly; avoids an O(n)
                          # reverse scan of sponsor_ids per institution
                          sponsors[b]['id']])

In [57]:
# json.dump writes text, so open in text mode: 'wb' happens to work on
# Python 2 but raises TypeError on Python 3
with open('../data/locations.json', 'w') as fp:
    json.dump(locations, fp)

In [ ]: