In [2]:
import codecs, pandas as pd, cPickle as pickle, nltk
from collections import Counter
%matplotlib inline
In [3]:
# Lookup table applied to condition MeSH terms while loading
# condition_browse.txt: key = term as it appears in the data file,
# value = term to store instead.
# NOTE(review): the values are presumably the headings used in
# mesh_thesaurus.txt -- confirm against that file.
corrections = {
    "Sarcoma, Ewing's": 'Sarcoma, Ewing',
    'Beta-Thalassemia': 'beta-Thalassemia',
    'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
    'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
    'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
    # Was written as 'Felty''s Syndrome', which Python parses as two adjacent
    # literals and concatenates to 'Feltys Syndrome' (no apostrophe), so the
    # real term never matched.
    "Felty's Syndrome": 'Felty Syndrome',
    'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
    'Retrognathism': 'Retrognathia',
    'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
    'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
    'Von Willebrand Diseases': 'von Willebrand Diseases',
    'Pontine Glioma': 'Brain Stem Neoplasms',
    'Mental Retardation': 'Intellectual Disability',
    'Overdose': 'Drug Overdose',
    'Beta-Mannosidosis': 'beta-Mannosidosis',
    'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
    'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
    'Alpha-Thalassemia': 'alpha-Thalassemia',
    'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
    'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
    'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
    'Alpha-Mannosidosis': 'alpha-Mannosidosis',
    'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms',
}
In [4]:
# cond: trial id -> list of condition MeSH terms attached to that trial,
# with each term passed through the `corrections` lookup first.
cond = {}
# `with` closes the file handle deterministically; the original
# codecs.open(...).readlines() never closed it.
with codecs.open('../data/condition_browse.txt', 'r', 'utf-8') as cond_file:
    for row in cond_file:
        # Each record must have exactly three '|'-separated fields; any other
        # field count raises ValueError on unpacking (same as before).
        row_id, trial_id, mesh_term = row.strip().split('|')
        # Store the corrected spelling when one is listed, otherwise the raw term.
        cond.setdefault(trial_id, []).append(corrections.get(mesh_term, mesh_term))
# interv: trial id -> list of intervention MeSH terms attached to that trial.
# Unlike the condition terms, these are stored exactly as read.
interv = {}
# `with` closes the file handle deterministically; the original
# codecs.open(...).readlines() never closed it.
with codecs.open('../data/intervention_browse.txt', 'r', 'utf-8') as interv_file:
    for row in interv_file:
        # Each record must have exactly three '|'-separated fields.
        row_id, trial_id, mesh_term = row.strip().split('|')
        interv.setdefault(trial_id, []).append(mesh_term)
# mesh_terms:  MeSH term -> list of every mesh_id recorded for it.
# mesh_lookup: mesh_id   -> MeSH term (a later row with the same id overwrites).
# NOTE(review): the analysis cells compare mesh_id[:3] with values such as
# 'C17', so mesh_id looks like a MeSH tree number -- confirm.
mesh_terms = {}
mesh_lookup = {}
# `with` closes the file handle deterministically; the original
# codecs.open(...).readlines() never closed it.
with codecs.open('../data/mesh_thesaurus.txt', 'r', 'utf-8') as thesaurus_file:
    for row in thesaurus_file:
        # Each record must have exactly three '|'-separated fields.
        row_id, mesh_id, mesh_term = row.strip().split('|')
        mesh_lookup[mesh_id] = mesh_term
        mesh_terms.setdefault(mesh_term, []).append(mesh_id)
In [4]:
# Distribution of how many condition terms each trial carries:
# sample = number of terms on a trial, count = number of trials with that many.
c = nltk.FreqDist(map(len, cond.values()))
In [5]:
# Plot the distribution `c` (number of condition terms per trial).
c.plot()
In [6]:
# For trials with exactly one condition term that exists in mesh_terms,
# tally the length of the shortest mesh_id string recorded for that term.
# NOTE(review): if mesh_ids are tree numbers, a shorter id presumably means a
# shallower position in the MeSH tree -- confirm.
# The original inner `if t[0] in mesh_terms` ran only after mesh_terms[t[0]]
# had already been indexed, so it guarded nothing and is dropped. Generator
# expressions are used so that no loop variable leaks into notebook scope
# (Python 2 list comprehensions leak theirs).
d = nltk.FreqDist(
    min(len(mesh_id) for mesh_id in mesh_terms[terms[0]])
    for terms in cond.values()
    if len(terms) == 1 and terms[0] in mesh_terms
)
In [7]:
# Plot the distribution `d` (shortest mesh_id length for single-term trials).
d.plot()
In [9]:
# Tally the condition terms (one sample per occurrence on a trial) whose
# mesh_ids consist of exactly one id with prefix 'C17' and one with prefix 'C04'.
# The original compared Counter(...).items() with a literal list, which only
# matches when the dict happens to iterate in that order (and never matches on
# Python 3, where items() is a view). Comparing Counters is order-independent.
f = nltk.FreqDist(
    term
    for terms in cond.values()
    for term in terms
    if term in mesh_terms
    and Counter(mesh_id[:3] for mesh_id in mesh_terms[term]) == Counter({u'C17': 1, u'C04': 1})
)
In [10]:
# Display the (term, count) pairs collected in `f`.
f.items()
Out[10]:
In [17]:
# For every trial, count how many mesh_ids of its recognised condition terms
# fall under each 3-character prefix, then tally how often each resulting
# prefix profile occurs across trials.
# The profile key is the *sorted* tuple of (prefix, count) pairs: an unsorted
# tuple(Counter.items()) follows dict iteration order, so equal profiles could
# be filed under different keys and have their counts split.
g = nltk.FreqDist(
    tuple(sorted(Counter(
        mesh_id[:3]
        for term in terms if term in mesh_terms
        for mesh_id in mesh_terms[term]
    ).items()))
    for terms in cond.values()
)
In [43]:
# Display the first 50 (profile, count) pairs of `g`.
# NOTE(review): slicing .items() requires it to return a list (Python 2);
# whether the pairs come out most-frequent-first depends on the nltk
# version -- confirm that is the intended ordering.
g.items()[:50]
Out[43]:
In [32]:
# Same per-trial prefix-profile tally as the `g` cell, written as a separate
# variable: for each trial, count the mesh_ids of its recognised condition
# terms by 3-character prefix and tally the resulting profiles.
# Changes from the original:
#   * the profile key is sorted so equal Counters always produce one key;
#   * generator expressions replace list comprehensions, whose loop variable
#     `c` overwrote the FreqDist `c` from the earlier cell on Python 2.
h = nltk.FreqDist(
    tuple(sorted(prefix_counts.items()))
    for prefix_counts in (
        Counter(
            mesh_id[:3]
            for term in terms if term in mesh_terms
            for mesh_id in mesh_terms[term]
        )
        for terms in cond.values()
    )
)
In [44]:
# Display the first 50 (profile, count) pairs of `h`.
# NOTE(review): slicing .items() requires it to return a list (Python 2);
# ordering depends on the nltk version -- confirm.
h.items()[:50]
Out[44]:
In [37]:
# Number of trials touching each 3-character mesh_id prefix: every distinct
# prefix found among a trial's recognised condition terms is counted once for
# that trial.
# Only the distinct prefixes matter here, so a set replaces the Counter whose
# keys the original iterated. Generator expressions keep the loop variables
# local; the original list comprehension's `c` overwrote the notebook-level
# FreqDist `c` on Python 2.
m = nltk.FreqDist(
    prefix
    for terms in cond.values()
    for prefix in set(
        mesh_id[:3]
        for term in terms if term in mesh_terms
        for mesh_id in mesh_terms[term]
    )
)
In [38]:
# Display the (prefix, trial count) pairs of `m`, sorted by prefix.
sorted(m.items())
Out[38]:
In [39]:
# Like the per-prefix trial count in `m`, but restricted to trials whose
# recognised condition terms all fall under a single 3-character prefix.
# The original list comprehensions used `m` and `c` as loop variables, which
# on Python 2 overwrote the notebook-level FreqDists `m` and `c`; generator
# expressions keep those names intact.
n = nltk.FreqDist(
    prefix
    for prefixes in (
        set(
            mesh_id[:3]
            for term in terms if term in mesh_terms
            for mesh_id in mesh_terms[term]
        )
        for terms in cond.values()
    )
    if len(prefixes) == 1
    for prefix in prefixes
)
In [40]:
# Display the (prefix, trial count) pairs of `n`, sorted by prefix.
sorted(n.items())
Out[40]:
In [41]:
# Number of distinct condition terms across all trials.
# A generator expression is used so the loop variables stay local; the
# original list comprehension rebound `c`, `k` and `v` at notebook scope on
# Python 2, overwriting the FreqDist `c`.
len(set(term for terms in cond.values() for term in terms))
Out[41]:
In [14]:
# Number of condition terms that occur at least 10 times across all trials
# (a term listed on several trials is counted once per listing).
# Generator expressions avoid both the throwaway list and the Python 2 leak
# of `c`, `k` and `v` into notebook scope.
sum(
    1
    for _term, occurrences in nltk.FreqDist(
        term for terms in cond.values() for term in terms
    ).items()
    if occurrences >= 10
)
Out[14]:
In [ ]: