Load modules


In [1]:
# basic NLP
import nltk, codecs, string, random, HTMLParser, math
from collections import Counter
from nltk.corpus import brown, wordnet as wn

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
import scipy.sparse as sps
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

from __future__ import division

# creating functions
h= HTMLParser.HTMLParser()
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
stopset = set(nltk.corpus.stopwords.words('english'))

Load data


In [2]:
# Map outdated / variant condition names found in the raw data onto their
# current MeSH terms, so variant spellings collapse into a single key.
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
               'Beta-Thalassemia': 'beta-Thalassemia',
               'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
               'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
               'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
               # BUG FIX: original source read 'Felty''s Syndrome' -- implicit
               # string concatenation producing "Feltys Syndrome" (apostrophe
               # lost), which never matched the intended term in the data.
               "Felty's Syndrome": 'Felty Syndrome',
               'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
               'Retrognathism': 'Retrognathia',
               'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
               'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
               'Von Willebrand Diseases': 'von Willebrand Diseases',
               'Pontine Glioma': 'Brain Stem Neoplasms',
               'Mental Retardation': 'Intellectual Disability',
               'Overdose': 'Drug Overdose',
               'Beta-Mannosidosis': 'beta-Mannosidosis',
               'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
               'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
               'Alpha-Thalassemia': 'alpha-Thalassemia',
               'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
               'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
               'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
               'Alpha-Mannosidosis': 'alpha-Mannosidosis',
               'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
               }

# condition name -> list of trial ids, one append per row of the
# pipe-delimited condition_browse.txt dump (row_id|trial_id|mesh_term).
cond = {}
with codecs.open('../data/condition_browse.txt', 'r', 'utf-8') as f:  # close the handle (was leaked)
    for row in f:
        row_id, trial_id, mesh_term = row.strip().split('|')
        if mesh_term in corrections:
            mesh_term = corrections[mesh_term]
        cond.setdefault(mesh_term, []).append(trial_id)

# limiting to conditions that appear in ten or more trials
top_cond = {c for c in cond if len(cond[c]) >= 10}
trials = {t for c in top_cond for t in cond[c]}

In [3]:
# trial id -> (field 9, field 10) of the pipe-delimited clinical_study.txt
# dump. NOTE(review): fields 9/10 are presumably the two free-text
# description columns -- confirm against the data dictionary.
trial_desc = {}
with codecs.open('../data/clinical_study.txt', 'r', 'utf-8') as f:  # close the handle (was leaked)
    for row in f:  # iterate lazily instead of readlines() -- avoids holding the whole file twice
        data = row.split('|')
        trial_desc[data[0]] = (data[9], data[10])

# trials that carry no frequent condition label: these are the ones to classify
to_classify = [t for t in trial_desc if t not in trials]

Process text


In [4]:
# Build one pseudo-document per frequent condition by concatenating the
# two description fields of every trial labelled with that condition.
cond_text = {}
for condition in top_cond:
    trial_texts = [' '.join(trial_desc[t]) for t in cond[condition]]
    cond_text[condition] = ' '.join(trial_texts)

Build tf-idf matrix


In [5]:
# Fit tf-idf on the per-condition pseudo-documents, then project the
# unlabelled trials into the same vector space.
tfidf = TfidfVectorizer(stop_words=stopset)
train_mat = tfidf.fit_transform(cond_text.values())

unlabelled_docs = (' '.join(trial_desc[t]) for t in to_classify)
apply_mat = tfidf.transform(unlabelled_docs)

In [38]:
n = 109

print to_classify[n]
print trial_desc[to_classify[n]]
print

cosine_similarities = linear_kernel(apply_mat[n], train_mat).flatten()

for i, v in sorted(list(enumerate(cosine_similarities)),key=lambda x: x[1], reverse=True)[:10]:
    print '%s :: %g' % (cond_text.keys()[i], v)


NCT01221012
(u"Current protective clothing of any kind, especially the nuclear-biological-chemical (NBC) protective garments, amplify thermal stress because of the inherent properties of the clothing material. High insulation coefficient and low water vapor permeability of the protective cloth limit the ability of sweat to readily evaporate, which consequently result with a significant elevation of body temperature and with excessive body fluid loss (because of inefficient sweating) that might result with significant dehydration.  Combat soldiers require to perform intense physical activities under hostile environmental conditions, including in contaminated areas that need the use of protective garments. The ability to fulfill a mission derives from the soldier's professionalism and from his physiological limits. It is well established that wearing protective garments affect tolerance time (TT) and performance.  To operate in contaminated areas suitable protective garments should be worn. The inherent characteristics of the current protective garments limit, however, the effective working periods with such an ensemble, which is further aggravated in hot climate. Thus, in an attempt to increase TT and enhance work effectiveness, improved protective garments, which result with a lessened heat-stress, are under different stages of development.  The relative influence of air permeability properties, garment weight and garment construction on heat stress and physiological strain will be investigated.  The results will be quantified in terms of body temperatures, heart rate, fluid balance, subjective sensation and the maximum wear time (if necessary extrapolated).", u'')

Stress, Psychological :: 0.0752396
Dehydration :: 0.0740921
Body Weight :: 0.0628225
Kyphosis :: 0.0619349
Hyperhidrosis :: 0.0577191
Weight Loss :: 0.055042
Obesity :: 0.0516736
Hot Flashes :: 0.0511953
Hypothermia :: 0.0510834
Overweight :: 0.0483266

In [28]:
cond_text.keys()[553]


Out[28]:
u'Postoperative Complications'

In [24]:
trial_desc[to_classify[90]]


Out[24]:
(u'The purpose of this single centre study is to evaluate whether the use of Harmonic Synergy Dissecting Hook (Harmonic Scalpel) in breast reconstruction surgery is superior to conventional diathermy in terms of improving ease and speed of perforator dissection, reducing postoperative pain, days of hospital stay and time taken to return to daily activities.',
 u'')

In [ ]: