In [ ]:
import pymongo, cPickle as pickle, codecs, requests, json, random
from bs4 import BeautifulSoup
from collections import Counter
from scipy import stats
from connect import mongoip, gkey
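connect is a local settings module; a minimal sketch of what it is assumed to contain (placeholder values only):
In [ ]:
# hypothetical connect.py -- actual values are deployment-specific
# mongoip = 'localhost'        # MongoDB host, used at the end of this notebook
# gkey = '<google-api-key>'    # Google API key (imported here but not used below)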
Condition data
In [11]:
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
'Beta-Thalassemia': 'beta-Thalassemia',
'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
"Felty's Syndrome": 'Felty Syndrome',
'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
'Retrognathism': 'Retrognathia',
'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
'Von Willebrand Diseases': 'von Willebrand Diseases',
'Pontine Glioma': 'Brain Stem Neoplasms',
'Mental Retardation': 'Intellectual Disability',
'Overdose': 'Drug Overdose',
'Beta-Mannosidosis': 'beta-Mannosidosis',
'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
'Alpha-Thalassemia': 'alpha-Thalassemia',
'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
'Alpha-Mannosidosis': 'alpha-Mannosidosis',
'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
}
cond = {}
cond_trials = {}
for row in codecs.open('../data/condition_browse.txt','r','utf-8').readlines():
    row_id, trial_id, mesh_term = row.strip().split('|')
    # fix mesh term if necessary
    if mesh_term in corrections: mesh_term = corrections[mesh_term]
    # add condition to trial dictionary
    if trial_id not in cond: cond[trial_id] = []
    cond[trial_id].append(mesh_term)
    # add trial to condition dictionary
    if mesh_term not in cond_trials: cond_trials[mesh_term] = []
    cond_trials[mesh_term].append(trial_id)
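Each row of condition_browse.txt is assumed to be pipe-delimited as row id, NCT id, MeSH term; a quick illustration of the parse and the two lookups built above, using a made-up row:
In [ ]:
# hypothetical input row (values are made up)
sample = u'12345|NCT00000102|Congenital Hyperinsulinism'
s_row, s_trial, s_term = sample.strip().split('|')
print('%s -> %s' % (s_trial, s_term))
# after the loop: cond[s_trial] lists the trial's MeSH terms,
# and cond_trials[s_term] lists the term's trial ids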
Intervention data
In [12]:
inv = {}
for row in codecs.open('../data/intervention_browse.txt','r','utf-8').readlines():
    row_id, trial_id, mesh_term = row.strip().split('|')
    # add intervention to trial dictionary
    if trial_id not in inv: inv[trial_id] = []
    inv[trial_id].append(mesh_term)
MedlinePlus topics data (also used as a thesaurus)
In [13]:
soup = BeautifulSoup(codecs.open('../data/mplus_topics_2014-11-04.xml','r','utf-8').read())
In [14]:
# synonyms for MeSH terms (and reverse), and topic descriptions
mesh_syn = {}
topic_desc = {}
for t in soup.find_all("health-topic", language="English"):
    # topic summary
    topic_desc[t.attrs["title"]] = t.find("full-summary").text.replace('\n','').replace('\t','')
    # MeSH synonyms
    cur_mesh = t.find("mesh-heading").descriptor.text
    if cur_mesh in cond_trials:
        if cur_mesh not in mesh_syn: mesh_syn[cur_mesh] = set()
        mesh_syn[cur_mesh] |= set([t.attrs["title"]] + [a.text for a in t.find_all("also-called")])
# clean up the synonym lookup: drop self-matches and single-character entries
# (in Python 2, .keys() returns a list, so deleting during iteration is safe)
for m in mesh_syn.keys():
    cur_set = mesh_syn[m].copy()
    for s in mesh_syn[m]:
        if m.lower() == s.lower() or len(s) == 1: cur_set -= set([s])
    if len(cur_set) == 0:
        del(mesh_syn[m])
    else:
        mesh_syn[m] = cur_set
In [15]:
# get list of common terms
common_terms = {t for m in mesh_syn for t in mesh_syn[m]}
# create reverse lookup dictionary of common to MeSH term
mesh_syn_r = {}
for c in common_terms:
    mesh_syn_r[c] = [m for m in mesh_syn if c in mesh_syn[m]]
In [16]:
# expand synonym dictionary to lookup for all terms, not just MeSH terms
all_syn = mesh_syn.copy()
for m in mesh_syn.keys():
    for t in mesh_syn[m]:
        if t not in all_syn: all_syn[t] = set()
        all_syn[t] |= (set([m]) | mesh_syn[m]) - set([t])
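all_syn is a symmetric closure: looking up any name, MeSH heading or common term, returns every other known name for the same topic. A toy run of the same construction (terms are made up, not from the files):
In [ ]:
# toy version of the closure built above (made-up terms)
toy_syn = {'Myocardial Infarction': set(['Heart Attack', 'MI'])}
toy_all = toy_syn.copy()
for m in toy_syn.keys():
    for t in toy_syn[m]:
        if t not in toy_all: toy_all[t] = set()
        toy_all[t] |= (set([m]) | toy_syn[m]) - set([t])
print(toy_all['Heart Attack'])  # set(['Myocardial Infarction', 'MI']), order may vary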
Sponsors
In [17]:
sponsors = {}
sponsor_ids = {}
sponsors_trials = {}
id_cnt = 101
for row in codecs.open('../data/sponsors.txt','r','utf-8').readlines():
    row_id, nct_id, sponsor_type, sponsor_name, funding_type = row.split('|')
    # add trial to sponsor dictionary
    if sponsor_name not in sponsors:
        sponsors[sponsor_name] = {'id': str(id_cnt),
                                  'trials': []
                                  }
        sponsor_ids[str(id_cnt)] = sponsor_name
        id_cnt += 1
    sponsors[sponsor_name]['trials'].append(nct_id)
    # add sponsor to trial dictionary
    if nct_id not in sponsors_trials: sponsors_trials[nct_id] = {}
    if sponsor_type == 'Collaborator':
        if 'coll' not in sponsors_trials[nct_id]: sponsors_trials[nct_id]['coll'] = []
        sponsors_trials[nct_id]['coll'].append(sponsor_name)
    else:
        sponsors_trials[nct_id]['pri'] = sponsor_name
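The three dictionaries give both a sponsor-centric and a trial-centric view of the same file; their shapes, with illustrative values:
In [ ]:
# sponsors        = {'Mayo Clinic': {'id': '101', 'trials': ['NCT00000102', ...]}, ...}
# sponsor_ids     = {'101': 'Mayo Clinic', ...}
# sponsors_trials = {'NCT00000102': {'pri': 'Mayo Clinic', 'coll': [...]}, ...}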
In [19]:
# sponsors matched to curated institution metadata (name, image, summary, geo)
sponsors_good = pickle.load(open('../data/facility_match_good.pkl','rb'))
In [20]:
# per condition, count trials by primary sponsor (matched sponsors only)
sponsors_cond = {}
for c in cond_trials:
    sponsors_cond[c] = Counter([sponsors[sponsors_trials[t]['pri']]['id']
                                for t in cond_trials[c]
                                if sponsors_trials[t]['pri'] in sponsors_good])
Trial data (status, title, sponsor, conditions, interventions)
In [21]:
active_trials = {}
for row in codecs.open('../data/clinical_study.txt','r','utf-8').readlines():
    data = row.split('|')
    # column 11 is the overall status; keep only trials still in progress
    if data[11] in ('Enrolling by invitation','Active, not recruiting','Recruiting'):
        nct_id = data[0]
        active_trials[nct_id] = {'status': data[11],
                                 'title': data[4],
                                 'sponsor_pri': sponsors_trials[nct_id]['pri'],
                                 'sponsor_coll': sponsors_trials[nct_id].get('coll'),
                                 'conds': cond.get(nct_id),
                                 'invs': inv.get(nct_id)}
Facilities
In [22]:
all_facs = {}
trial_facs = {}
for row in codecs.open('../data/facilities.txt','r','utf-8').readlines():
    data = row.split("|")
    # data[0] is the facility id, data[1] the trial NCT id
    all_facs[data[0]] = data[1]
    if data[1] not in trial_facs: trial_facs[data[1]] = []
    trial_facs[data[1]].append(data[0])
In [23]:
# get deduplicated facilities (a DataFrame with facility_id, facility_name, and cluster columns)
facs = pickle.load(open('../data/facility_clusters.pkl','rb'))
In [24]:
# get most frequent facility name for deduped facility clusters
facs_mode = facs.groupby(['cluster','facility_name']).facility_id.count().reset_index()
facs_mode['len_fac'] = facs_mode.facility_name.apply(lambda x: len(x))
# per cluster, keep the most common name (highest facility_id count), breaking ties by shortest name
clus_name = facs_mode.sort(['cluster', 'facility_id', 'len_fac'], ascending=[1,0,1]).drop_duplicates(['cluster'])[['cluster','facility_name']].set_index('cluster')
# create lookup of cluster to trial id
clus_lookup = facs[['cluster','facility_id']].set_index('cluster')
clus_trials = {}
for c in clus_name[clus_name.facility_name.apply(lambda x: x in sponsors_good)].index:
    c_name = clus_name.loc[c].values[0]
    if c_name not in clus_trials: clus_trials[c_name] = []
    for f in clus_lookup.loc[c].values:
        clus_trials[c_name].append(all_facs[str(f[0])])
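The sort above keeps, within each cluster, the most frequent facility name, breaking ties with the shortest string. A toy check of that selection (made-up rows; uses the modern sort_values in place of the deprecated sort):
In [ ]:
import pandas as pd
toy = pd.DataFrame({'cluster': [1, 1, 1],
                    'facility_name': ['Mayo Clinic', 'Mayo Clinic', 'Mayo Clinic Rochester'],
                    'facility_id': ['f1', 'f2', 'f3']})
toy_mode = toy.groupby(['cluster','facility_name']).facility_id.count().reset_index()
toy_mode['len_fac'] = toy_mode.facility_name.apply(len)
picked = toy_mode.sort_values(['cluster','facility_id','len_fac'],
                              ascending=[True, False, True]).drop_duplicates(['cluster'])
print(picked.facility_name.values)  # ['Mayo Clinic']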
Investigator and publication data
In [25]:
all_invest_trials = {}
for row in codecs.open('../data/investigators.txt','r','utf-8').readlines():
    data = row.split("|")
    # data[2] is the trial NCT id, data[0] the investigator row id
    if data[2] not in all_invest_trials: all_invest_trials[data[2]] = []
    all_invest_trials[data[2]].append(data[0])
In [26]:
investigators_ids = pickle.load(open('../data/id_investigator_lookup.pkl','rb'))          # id -> investigator name
investigators_trials = pickle.load(open('../data/trial_invest_pub_match_dict.pkl','rb'))  # investigator -> {trial: [pubmed ids]}
publications = pickle.load(open('../data/pub_lookup_dict.pkl','rb'))                      # pubmed id -> publication record
In [27]:
# create trial to publication list dictionary
trial_pubs = {}
for i in investigators_trials.keys():
    for t in investigators_trials[i].keys():
        if t not in trial_pubs: trial_pubs[t] = []
        trial_pubs[t] += investigators_trials[i][t]
In [28]:
# getting counts of trials for each condition (and synonym)
cond_cnt = {}
def add_to_dict(word, subtotal):
    if word not in cond_cnt: cond_cnt[word] = 0
    cond_cnt[word] += subtotal
for c, trials in cond_trials.items():
    add_to_dict(c, len(trials))
    if c in mesh_syn:
        for t in mesh_syn[c]:
            add_to_dict(t, len(trials))
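Each synonym inherits its MeSH term's trial count, and a synonym shared by several MeSH terms accumulates all of their counts. A quick spot check against the dictionaries built above:
In [ ]:
# spot check: a few MeSH terms alongside their synonyms' counts
for m in list(mesh_syn)[:3]:
    print('%s: %d %s' % (m, cond_cnt[m], [(s, cond_cnt[s]) for s in mesh_syn[m]]))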
In [29]:
cond_ids = {c: i+100 for i, c in enumerate(cond_cnt.keys())}
In [20]:
cond_final = []
for c in cond_cnt.keys():
    # flag for whether this is a MeSH term (also used below)
    is_mesh = c in cond_trials
    if not is_mesh:
        cur_mesh = mesh_syn_r[c][0]
    else:
        cur_mesh = c
    cond_dict = {'cond_name': c,
                 'cond_id': cond_ids[c],
                 'num_trials': cond_cnt[c],
                 'cond_name_mesh': cur_mesh,
                 'cond_summary': None,
                 'cond_synonyms': None,
                 'cond_trials_active': [],
                 'cond_inst_top': []
                 }
    # filling in summary if it exists for this term or any synonym
    # (if several synonyms have summaries, the last one wins)
    if c in topic_desc:
        cond_dict['cond_summary'] = topic_desc[c]
    elif c in all_syn:
        for s in all_syn[c]:
            if s in topic_desc:
                cond_dict['cond_summary'] = topic_desc[s]
    # filling in synonyms if they exist
    if c in all_syn:
        cond_dict['cond_synonyms'] = list(all_syn[c])
    # filling in active trials
    for t in cond_trials[cur_mesh]:
        if t in active_trials:
            trial_dict = {'trial_id': t,
                          'trial_title': active_trials[t]['title'],
                          'trial_sponsor': active_trials[t]['sponsor_pri'],
                          'trial_cond': [{'cond_name': tcn, 'cond_id': cond_ids[tcn]} for tcn in active_trials[t]['conds']],
                          'trial_iv': active_trials[t]['invs']
                          }
            cond_dict['cond_trials_active'].append(trial_dict)
    # filling in top institutions (up to five, by trial count as primary sponsor)
    for sid, cnt in sorted(sponsors_cond[cur_mesh].items(), key=lambda x: x[1], reverse=True)[:5]:
        inst_dict = {'inst_name': sponsors_good[sponsor_ids[sid]]['name'],
                     'inst_id': sid,
                     'inst_img': sponsors_good[sponsor_ids[sid]]['image'],
                     'inst_loc': sponsors_good[sponsor_ids[sid]]['geo']['loc']
                     }
        cond_dict['cond_inst_top'].append(inst_dict)
    cond_final.append(cond_dict)
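Each element of cond_final is one Mongo-ready condition document; its shape, with illustrative values:
In [ ]:
# {'cond_name': 'Heart Attack',
#  'cond_id': 512,
#  'num_trials': 1200,
#  'cond_name_mesh': 'Myocardial Infarction',
#  'cond_summary': '...',
#  'cond_synonyms': ['Myocardial Infarction', 'MI'],
#  'cond_trials_active': [{'trial_id': 'NCT00000102', 'trial_title': '...', ...}],
#  'cond_inst_top': [{'inst_name': '...', 'inst_id': '101', 'inst_img': '...', 'inst_loc': [...]}]}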
In [30]:
# getting counts of trials for each sponsor (and facility)
inst_trials = {s: set(sponsors[s]['trials']) for s in sponsors_good}
for s in clus_trials:
    inst_trials[s] |= set(clus_trials[s])
In [31]:
inst_final = []
for s in inst_trials.keys():
    inst_dict = {'inst_name': s,
                 'inst_id': [sid for sid, sname in sponsor_ids.items() if sname == s][0],
                 'num_trials': len(inst_trials[s]),
                 'inst_loc': sponsors_good[s]['geo']['loc'],
                 'inst_img': sponsors_good[s]['image'],
                 'inst_summary': sponsors_good[s]['summary'],
                 'inst_trials_active': [],
                 'inst_researchers': [],
                 'inst_rating': random.choice([3,4,5])  # placeholder rating, randomly generated
                 }
    # filling in active trials
    for t in inst_trials[s]:
        if t in active_trials:
            trial_dict = {'trial_id': t,
                          'trial_title': active_trials[t]['title'],
                          'trial_sponsor': active_trials[t]['sponsor_pri'],
                          'trial_cond': [{'cond_name': tcn, 'cond_id': cond_ids[tcn]} for tcn in active_trials[t]['conds']] if active_trials[t]['conds'] else None,
                          'trial_iv': active_trials[t]['invs']
                          }
            inst_dict['inst_trials_active'].append(trial_dict)
    # filling in researchers (keep only those on more than one trial)
    investigator_list = Counter([investigators_ids[int(p)]
                                 for t in inst_trials[s]
                                 if t in all_invest_trials
                                 for p in all_invest_trials[t]
                                 if int(p) in investigators_ids])
    for invname, ntrials in sorted(investigator_list.items(), key=lambda x: x[1], reverse=True):
        if ntrials > 1:
            inst_dict['inst_researchers'].append({'researcher_name': invname, 'researcher_count': ntrials})
    # filling in publication data (2010 or later, newest first)
    unique_pubs = set([pub for t in inst_trials[s] if t in trial_pubs for pub in trial_pubs[t]])
    pub_temp = []
    for p in unique_pubs:
        other_ids = dict(publications[p]['other_ids'])
        pub_temp.append((int(publications[p]['year']),
                         {'pubmed_id': p,
                          'pub_title': publications[p]['title'],
                          'pub_authors': [a[0] for a in publications[p]['authors']],
                          'pub_doi': other_ids.get('doi')}))
    inst_dict['inst_pubs'] = [p for y, p in sorted(pub_temp, key=lambda x: x[0], reverse=True) if y >= 2010]
    # filling in conditions data (top 20 by trial count)
    inst_cond_top = Counter([m for t in inst_trials[s] if t in cond for m in cond[t]])
    inst_dict['inst_cond_top'] = [{'cond_name': c,
                                   'cond_id': cond_ids[c],
                                   'trial_count': cnt}
                                  for c, cnt in sorted(inst_cond_top.items(), key=lambda x: x[1], reverse=True)[:20]]
    inst_final.append(inst_dict)
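A quick look at one assembled institution document before loading it (keys only, to keep the output short):
In [ ]:
print(sorted(inst_final[0].keys()))
print('%s: %d trials' % (inst_final[0]['inst_name'], inst_final[0]['num_trials']))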
In [37]:
c = pymongo.MongoClient(host=mongoip)
db = c.ctdb
In [38]:
db.conditions.drop()
cond_coll = db.conditions
oids = cond_coll.insert(cond_final)
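A quick round-trip sanity check against the new collection (assumes the insert above succeeded):
In [ ]:
print(cond_coll.count())  # should equal len(cond_final)
print(cond_coll.find_one({'cond_id': cond_final[0]['cond_id']})['cond_name'])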
In [44]:
pickle.dump(dict(zip(oids,cond_final)),open('../data/condition_json.pkl','wb'))
In [39]:
db.institutions.drop()
inst_coll = db.institutions
oids = inst_coll.insert(inst_final)
In [33]:
pickle.dump(inst_final,open('../data/institution_json.pkl','wb'))
In [2]:
inst_final = pickle.load(open('../data/institution_json.pkl','rb'))
In [40]:
typeahead = [{'label': d['cond_name'],
              'category': 'Condition',
              'cond_id': str(d['cond_id'])}
             for d in sorted(cond_final, key=lambda x: x['num_trials'], reverse=True)] + \
            [{'label': d['inst_name'],
              'category': 'Institution',
              'inst_id': str(d['inst_id'])}
             for d in sorted(inst_final, key=lambda x: x['num_trials'], reverse=True)]
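Entries are ordered by trial volume so the most common conditions and busiest institutions surface first in the autocomplete; each entry is a flat dict:
In [ ]:
print(typeahead[0])  # e.g. {'label': '...', 'category': 'Condition', 'cond_id': '...'}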
In [44]:
with open('../data/typeahead.json','wb') as fp:
    json.dump(typeahead, fp)
In [55]:
locations = []
for b in sponsors_good.keys():
    if 'geo' in sponsors_good[b] and 'lat' in sponsors_good[b]['geo']:
        locations.append([sponsors_good[b]['geo']['lat'],
                          sponsors_good[b]['geo']['lng'],
                          sponsors_good[b]['name'],
                          [sid for sid, sname in sponsor_ids.items() if sname == b][0]])
In [57]:
with open('../data/locations.json','wb') as fp:
    json.dump(locations, fp)
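Each location row is [lat, lng, display name, sponsor id], presumably consumed by a map view; a quick check:
In [ ]:
print(locations[0])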
In [ ]: