In [1]:
# basic NLP
import nltk, codecs, string, random, HTMLParser, math
from collections import Counter
from nltk.corpus import brown, wordnet as wn
# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
import scipy.sparse as sps
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from __future__ import division
# shared module-level NLP resources (no functions are defined here,
# despite the original "creating functions" label)
h= HTMLParser.HTMLParser()  # Python 2 stdlib HTML entity un-escaper
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')  # Punkt sentence splitter
stopset = set(nltk.corpus.stopwords.words('english'))  # English stopwords, used by the TF-IDF vectorizer below
In [2]:
# Map MeSH terms as they appear in condition_browse.txt onto their
# canonical MeSH names (case fixes, renamed headings, superseded terms).
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
'Beta-Thalassemia': 'beta-Thalassemia',
'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
# BUG FIX: the original read 'Felty''s Syndrome', which Python parses as
# adjacent-string concatenation -> 'Feltys Syndrome', so the real term
# "Felty's Syndrome" was never corrected.
"Felty's Syndrome": 'Felty Syndrome',
'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
'Retrognathism': 'Retrognathia',
'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
'Von Willebrand Diseases': 'von Willebrand Diseases',
'Pontine Glioma': 'Brain Stem Neoplasms',
'Mental Retardation': 'Intellectual Disability',
'Overdose': 'Drug Overdose',
'Beta-Mannosidosis': 'beta-Mannosidosis',
'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
'Alpha-Thalassemia': 'alpha-Thalassemia',
'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
'Alpha-Mannosidosis': 'alpha-Mannosidosis',
'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
}
# cond: corrected MeSH term -> list of trial ids tagged with it.
# File format: row_id|trial_id|mesh_term (pipe-delimited, UTF-8).
# Use a context manager (original leaked the handle) and iterate the
# file directly instead of materializing readlines().
cond = {}
with codecs.open('../data/condition_browse.txt', 'r', 'utf-8') as browse_file:
    for row in browse_file:
        row_id, trial_id, mesh_term = row.strip().split('|')
        if mesh_term in corrections: mesh_term = corrections[mesh_term]
        cond.setdefault(mesh_term, []).append(trial_id)
# limiting to conditions that appear in ten or more trials
top_cond = {c for c in cond if len(cond[c]) >= 10}
trials = {t for c in top_cond for t in cond[c]}
In [3]:
# trial_desc: trial id (first pipe-delimited field) -> tuple of fields 9
# and 10 of clinical_study.txt (presumably the two free-text description
# columns -- TODO confirm against the data dictionary).
# Fixed: close the file via a context manager and stream rows instead of
# readlines() (original leaked the handle and loaded the file into memory).
trial_desc = {}
with codecs.open('../data/clinical_study.txt', 'r', 'utf-8') as study_file:
    for row in study_file:
        data = row.split('|')
        trial_desc[data[0]] = (data[9], data[10])
# trials not tagged with any frequent condition; these get classified below
to_classify = [t for t in trial_desc if t not in trials]
In [4]:
# One training "document" per frequent condition: the concatenated
# description fields of every trial tagged with that condition.
cond_text = {}
for term in top_cond:
    texts = [' '.join(trial_desc[t]) for t in cond[term]]
    cond_text[term] = ' '.join(texts)
In [5]:
# Fit TF-IDF on the per-condition corpora, then project the unlabeled
# trials' text into the same vector space.
tfidf = TfidfVectorizer(stop_words=stopset)
train_docs = cond_text.values()
train_mat = tfidf.fit_transform(train_docs)
unlabeled_docs = (' '.join(trial_desc[t]) for t in to_classify)
apply_mat = tfidf.transform(unlabeled_docs)
In [38]:
n = 109
print to_classify[n]
print trial_desc[to_classify[n]]
print
cosine_similarities = linear_kernel(apply_mat[n], train_mat).flatten()
for i, v in sorted(list(enumerate(cosine_similarities)),key=lambda x: x[1], reverse=True)[:10]:
print '%s :: %g' % (cond_text.keys()[i], v)
In [28]:
cond_text.keys()[553]
Out[28]:
In [24]:
trial_desc[to_classify[90]]
Out[24]:
In [ ]: