Load necessary packages and extensions


In [1]:
%load_ext sql
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from collections import Counter
from __future__ import division

Get conditions data from database


In [2]:
# sql connection parameters come from connect.py
have_connect = !ls connect.py 2>/dev/null
if len(have_connect) == 0:
    !mv ../../connect.py .
from connect import *

get_ipython().magic('sql mysql://' + mysqlusername + ':' + mysqlpassword + '@' + mysqlserver + ':3306/' + mysqldbname)
conditions = %sql select nct_id, mesh_term from condition_browse
mesh_lookup = %sql select mesh_id, mesh_term from mesh_thesaurus


309589 rows affected.
54935 rows affected.

Create dictionary of mesh_terms to mesh_id's


In [3]:
mesh_terms = {mesh_id: mesh_term for mesh_id, mesh_term in mesh_lookup}
mesh_ids = {}
for id, term in mesh_lookup:
    # create term-to-id lookup
    if term not in mesh_ids: mesh_ids[term] = set()
    mesh_ids[term].add(id)

Create dictionary of condition categories to list of relevant studies


In [4]:
condition_study = {}
for studyid, cond in conditions:
    mesh_cats = set([mesh_terms[m[:7]] for m in mesh_ids[cond]])
    for m in list(mesh_cats):
        if m not in condition_study: condition_study[m] = set()
        condition_study[m].add(studyid)

Remove categories with less than 40 studies


In [5]:
for c in condition_study.keys():
    if len(condition_study[c]) < 40:
        del condition_study[c]

Define Jaccard similarity function


In [6]:
def jaccard_similarity(set1, set2):
    return float(len(set1 & set2)) / len(set1 | set2)

Generate distance matrix


In [7]:
max_dist = 10
condition_list = condition_study.keys()
num_cond = len(condition_list)
df = pd.DataFrame(index=condition_list, columns=condition_list, dtype=np.float32)
for i in range(num_cond):
    for j in range(num_cond):
        cond1 = condition_list[i]
        cond2 = condition_list[j]
        if i == j:
            dist = 0
        elif j < i:
            dist = df[cond2][cond1]
        elif len(condition_study[cond1] & condition_study[cond2]) > 0:
            sim = jaccard_similarity(condition_study[cond1], condition_study[cond2])
            dist = max([np.log10(sim), (-1 * max_dist)]) * -1
        else:
            dist = max_dist
        df[cond1][cond2] = dist

Cluster using DBSCAN

Iteratively create clusters


In [8]:
clus_df = df.copy()
final_groups = []
groups = []
for s in range(10,1,-1):
    for i in range(40):
        db = DBSCAN(eps=(40 - i)/10.0, min_samples=s, metric='precomputed').fit(clus_df.as_matrix())
        cnt = Counter(db.labels_)
        if -1 in cnt and 0 in cnt and len(cnt) == 2 and cnt[-1] > cnt[0]:
            nonzero = {clus_df.index.values[i]: n for i, n in enumerate(db.labels_) if n >= 0}
        else:
            nonzero = {clus_df.index.values[i]: n for i, n in enumerate(db.labels_) if n > 0}
        for ax in [0,1]:
            clus_df.drop(nonzero.keys(), axis=ax, inplace=True)
        for c in set(nonzero.values()):
            groups.append([k for k, v in nonzero.items() if v == c])
    to_add = [g for g in groups if len(g) <= 15]
    print s
    print to_add
    final_groups += to_add
    # regenerate dataframe for next round
    clus_df = df.copy()
    to_drop = {m for g in final_groups for m in g}
    for ax in [0,1]:
        clus_df.drop(list(to_drop), axis=ax, inplace=True)
    groups = []

print
print 'Total classified: %d out of %d' % (len([f for g in final_groups for f in g]), len(condition_list))
print '%d clusters' % len(final_groups)


10
[['Substance Withdrawal Syndrome', 'Anxiety Disorders', 'Sociology', 'Marijuana Abuse', 'Substance-Related Disorders', 'Tobacco Use Disorder', 'Mood Disorders', 'Alcohol-Related Disorders', 'Cocaine-Related Disorders', 'Behavior', 'Opioid-Related Disorders'], ['Immunologic Deficiency Syndromes', 'Virus Diseases', 'Sexually Transmitted Diseases', 'DNA Virus Infections', 'Skin Diseases, Viral', 'Slow Virus Diseases', 'Hepatitis, Viral, Human', 'RNA Virus Infections']]
9
[]
8
[['Optic Nerve Diseases', 'Otorhinolaryngologic Neoplasms', 'Ocular Motility Disorders', 'Neurocutaneous Syndromes', 'Laryngeal Diseases', 'Cranial Nerve Diseases']]
7
[['Orbital Diseases', 'Eye Diseases, Hereditary', 'Lacrimal Apparatus Diseases', 'Eye Diseases', 'Corneal Diseases']]
6
[['Leg Injuries', 'Hip Injuries', 'Arm Injuries', 'Fractures, Bone', 'Spinal Injuries', 'Back Injuries']]
5
[]
4
[['Fetal Diseases', 'Rupture', 'Pregnancy Complications'], ['Genetic Variation', 'Cellular Structures', 'Ploidies', 'Genetic Structures'], ['Bone Diseases', 'Endocrine System Diseases', 'Dwarfism', 'Pituitary Diseases']]
3
[['Skin Diseases, Parasitic', 'Helminthiasis', 'Protozoan Infections'], ['Biological Processes', 'Connective Tissue', 'Burns'], ['Dissociative Disorders', 'Nervous System Diseases', 'Autonomic Nervous System Diseases', 'Somatoform Disorders'], ['Radiation Injuries', 'Environment', 'Public Health'], ['Uveal Diseases', 'Eye Neoplasms', 'Retinal Diseases'], ['Eye Infections, Viral', 'Conjunctival Diseases', 'Eye Infections'], ['Tumor Virus Infections', 'Mycoses', 'Neoplasms, Experimental'], ['Neoplasms, Multiple Primary', 'Neoplastic Syndromes, Hereditary', 'Nervous System Malformations'], ['Rheumatic Diseases', 'Joint Diseases', 'Encephalitis, Viral', 'Central Nervous System Viral Diseases', 'Connective Tissue Diseases', 'Poisoning', 'Autoimmune Diseases', 'Demyelinating Diseases', 'Autoimmune Diseases of the Nervous System', 'Arbovirus Infections', 'Neurotoxicity Syndromes'], ['Nutrition Disorders', 'Neurodegenerative Diseases', 'Anthropometry', 'Body Constitution', 'Physiological Processes', 'Delirium, Dementia, Amnestic, Cognitive Disorders', 'Neurologic Manifestations', 'Central Nervous System Diseases', 'Diagnostic Techniques and Procedures', 'Neurobehavioral Manifestations', 'Signs and Symptoms'], ['Vascular Diseases', 'Pathologic Processes', 'Heart Diseases'], ['Craniocerebral Trauma', 'Spinal Cord Injuries', 'Trauma, Nervous System'], ['Musculoskeletal Abnormalities', 'Temporomandibular Joint Disorders', 'Stomatognathic System Abnormalities', 'Jaw Diseases']]
2
[['Ear Diseases', 'Vision Disorders', 'Lens Diseases', 'Refractive Errors', 'Ocular Hypertension'], ['Foot Diseases', 'Fasciitis', 'Foot Deformities'], ['Wounds, Nonpenetrating', 'Wounds and Injuries'], ['Mental Disorders Diagnosed in Childhood', 'Schizophrenia and Disorders with Psychotic Features', 'Personality Disorders', 'Mental Disorders'], ['Immune System Processes', 'Purpura, Thrombocytopenic'], ['Nervous System Physiological Phenomena', 'Psychophysiology'], ['Pharyngeal Diseases', 'Mouth Diseases', 'Tooth Diseases'], ['Pathological Conditions, Anatomical', 'Biliary Tract Diseases'], ['Muscular Diseases', 'Tendon Injuries', 'Neuromuscular Diseases'], ['Diabetes Mellitus', 'Metabolic Diseases'], ['Infection', 'Bacterial Infections'], ['Congenital Abnormalities', 'Cardiovascular Abnormalities'], ['Neoplasms, Second Primary', 'Neoplastic Processes'], ['Respiratory Tract Infections', 'Nose Diseases'], ['Respiratory Tract Neoplasms', 'Bronchial Diseases', 'Lung Diseases', 'Hypersensitivity', 'Respiratory Hypersensitivity'], ['Digestive System Neoplasms', 'Pancreatic Diseases', 'Endocrine Gland Neoplasms', 'Infant, Newborn, Diseases', 'Gonadal Disorders', 'Lymphatic Diseases', 'Hematologic Diseases', 'Gastrointestinal Diseases', 'Immunoproliferative Disorders', 'Neoplasms by Histologic Type', 'Neoplasms by Site'], ['Urogenital Neoplasms', 'Genital Diseases, Male'], ['Female Urogenital Diseases', 'Urologic Diseases']]

Total classified: 160 out of 204
39 clusters

Add unclustered in as a group


In [9]:
unclustered = [c for c in condition_list if c not in {s for g in final_groups for s in g}]
print unclustered
final_groups.append(unclustered)


['Graft vs Host Disease', 'Thoracic Injuries', 'Cardiovascular Diseases', 'Pregnancy Complications, Neoplastic', 'Precancerous Conditions', 'Reproductive Physiological Phenomena', 'Respiration Disorders', 'Digestive System Diseases', 'Eyelid Diseases', 'Skin Diseases', 'Impulse Control Disorders', 'Digestive System Fistula', 'Eating Disorders', 'Urogenital Abnormalities', 'Cell Physiological Processes', 'Paraneoplastic Syndromes', 'Musculoskeletal Diseases', 'Parathyroid Diseases', 'Sprains and Strains', 'Neoplasms', 'Wound Infection', 'Cysts', 'Tennis Elbow', 'Respiratory Physiological Phenomena', 'Peritoneal Diseases', 'Opportunistic Infections', 'Neoplastic Cells, Circulating', 'Fatigue Syndrome, Chronic', 'Sleep Disorders', 'Chronobiology Disorders', 'Liver Diseases', 'Cartilage Diseases', 'Nervous System Neoplasms', 'Immune System Diseases', 'Dislocations', 'Adrenal Gland Diseases', 'Sexual and Gender Disorders', 'Genetic Diseases, Inborn', 'Pharmacological Phenomena', 'Thyroid Diseases', 'Respiratory Tract Diseases', 'Digestive System Abnormalities', 'Pleural Diseases', 'Lacerations']

Generate SQL


In [10]:
group_dict = {i: g for i, g in enumerate(final_groups)}
for i in group_dict.keys():
    if len(group_dict[i]) <= 15:
        mesh_cats = set([m[:7] for d in group_dict[i] for m in mesh_ids[d]])
        print "  max(case when substr(mesh_id,1,7) in ('%s') then 1 else 0 end) clus_%s," % ("','".join(list(mesh_cats)),str(i))


  max(case when substr(mesh_id,1,7) in ('F04.096','F03.080','C25','C25.100','C25.835','F03.600','C25.675','I01.880','C25.912','C25.300','C25.635','F03.900','F01.145') then 1 else 0 end) clus_0,
  max(case when substr(mesh_id,1,7) in ('C02.825','C02.782','C02.800','C13.351','C17.800','C02.440','C02.256','C20.673','C02','C06.552','C02.839','C01.539','C12.294') then 1 else 0 end) clus_1,
  max(case when substr(mesh_id,1,7) in ('C08.360','C16.320','C10.228','C11.640','C04.588','C17.800','C11.590','C09.647','C09.400','C10.562','C16.131','C10.292') then 1 else 0 end) clus_2,
  max(case when substr(mesh_id,1,7) in ('C11.675','C11.204','C11.496','C16.320','C11.270','C11') then 1 else 0 end) clus_3,
  max(case when substr(mesh_id,1,7) in ('C26.117','C26.531','C26.831','C26.088','C26.558','C26.404') then 1 else 0 end) clus_4,
  max(case when substr(mesh_id,1,7) in ('C16.300','C26.761','C13.703') then 1 else 0 end) clus_5,
  max(case when substr(mesh_id,1,7) in ('A11.284','G05.700','G05.360','G05.365') then 1 else 0 end) clus_6,
  max(case when substr(mesh_id,1,7) in ('C19.700','C10.228','C16.320','C19.297','C19','C05.116') then 1 else 0 end) clus_7,
  max(case when substr(mesh_id,1,7) in ('C03.335','C03.752','C03.858','C17.800') then 1 else 0 end) clus_8,
  max(case when substr(mesh_id,1,7) in ('A10.165','C26.200','G16.100') then 1 else 0 end) clus_9,
  max(case when substr(mesh_id,1,7) in ('F03.875','C10.177','C10','F03.300') then 1 else 0 end) clus_10,
  max(case when substr(mesh_id,1,7) in ('H02.403','N06.230','N01.400','N06.850','C26.733','G16.500') then 1 else 0 end) clus_11,
  max(case when substr(mesh_id,1,7) in ('C11.319','C04.588','C11.941','C11.768') then 1 else 0 end) clus_12,
  max(case when substr(mesh_id,1,7) in ('C02.325','C11.294','C01.539','C11.187') then 1 else 0 end) clus_13,
  max(case when substr(mesh_id,1,7) in ('E05.598','C04.619','C01.703','C04.925','C02.928') then 1 else 0 end) clus_14,
  max(case when substr(mesh_id,1,7) in ('C10.500','C04.700','C16.320','C16.131','C04.651') then 1 else 0 end) clus_15,
  max(case when substr(mesh_id,1,7) in ('C10.720','C10.114','C10.314','C05.550','C10.228','C17.300','C02.182','C05.799','C25.723','C02.081','C20.111','C02.290') then 1 else 0 end) clus_16,
  max(case when substr(mesh_id,1,7) in ('C18.654','F01.700','F03.087','E01.370','C10.228','G07.700','E05.041','C10.597','C10.574','N06.850','C23.888','G07.100') then 1 else 0 end) clus_17,
  max(case when substr(mesh_id,1,7) in ('C23.550','C14.280','C14.907') then 1 else 0 end) clus_18,
  max(case when substr(mesh_id,1,7) in ('C26.819','C26.915','C26.260','C10.900','C10.228') then 1 else 0 end) clus_19,
  max(case when substr(mesh_id,1,7) in ('C07.650','C07.678','C16.131','C05.660','C05.500','C05.651','C07.320','C05.550') then 1 else 0 end) clus_20,
  max(case when substr(mesh_id,1,7) in ('C09.218','C11.525','C11.744','C11.510','C11.966','C10.597','C23.888') then 1 else 0 end) clus_21,
  max(case when substr(mesh_id,1,7) in ('C05.321','C05.360','C17.800','C05.330') then 1 else 0 end) clus_22,
  max(case when substr(mesh_id,1,7) in ('C26','C26.974') then 1 else 0 end) clus_23,
  max(case when substr(mesh_id,1,7) in ('F03.550','F03.700','F03.675','F03') then 1 else 0 end) clus_24,
  max(case when substr(mesh_id,1,7) in ('C23.550','C15.378','G12.425','C20.841','C23.888') then 1 else 0 end) clus_25,
  max(case when substr(mesh_id,1,7) in ('F02.830','G11.561','H01.158','F04.096','E02.190') then 1 else 0 end) clus_26,
  max(case when substr(mesh_id,1,7) in ('C07.465','C07.550','C09.775','C07.793') then 1 else 0 end) clus_27,
  max(case when substr(mesh_id,1,7) in ('C06.130','C23.300') then 1 else 0 end) clus_28,
  max(case when substr(mesh_id,1,7) in ('C10.668','C26.874','C05.651') then 1 else 0 end) clus_29,
  max(case when substr(mesh_id,1,7) in ('C18.452','C19.246') then 1 else 0 end) clus_30,
  max(case when substr(mesh_id,1,7) in ('C01.539','C01.252') then 1 else 0 end) clus_31,
  max(case when substr(mesh_id,1,7) in ('C16.131','C14.240') then 1 else 0 end) clus_32,
  max(case when substr(mesh_id,1,7) in ('C04.692','C04.697','C23.550') then 1 else 0 end) clus_33,
  max(case when substr(mesh_id,1,7) in ('C08.730','C08.460','C01.539','C09.603') then 1 else 0 end) clus_34,
  max(case when substr(mesh_id,1,7) in ('C04.588','C08.381','C20.543','C08.785','C08.674','C08.127') then 1 else 0 end) clus_35,
  max(case when substr(mesh_id,1,7) in ('C15.378','C04.557','C15.604','C19.391','C04.588','C20.683','C06.689','C06.301','C19.344','C16.614','C06.405') then 1 else 0 end) clus_36,
  max(case when substr(mesh_id,1,7) in ('C04.588','C12.294','C13.351','C12.758') then 1 else 0 end) clus_37,
  max(case when substr(mesh_id,1,7) in ('C13.351','C12.777') then 1 else 0 end) clus_38,

In [11]:
for i, g in enumerate(final_groups):
    print i, ', '.join(g)


0 Substance Withdrawal Syndrome, Anxiety Disorders, Sociology, Marijuana Abuse, Substance-Related Disorders, Tobacco Use Disorder, Mood Disorders, Alcohol-Related Disorders, Cocaine-Related Disorders, Behavior, Opioid-Related Disorders
1 Immunologic Deficiency Syndromes, Virus Diseases, Sexually Transmitted Diseases, DNA Virus Infections, Skin Diseases, Viral, Slow Virus Diseases, Hepatitis, Viral, Human, RNA Virus Infections
2 Optic Nerve Diseases, Otorhinolaryngologic Neoplasms, Ocular Motility Disorders, Neurocutaneous Syndromes, Laryngeal Diseases, Cranial Nerve Diseases
3 Orbital Diseases, Eye Diseases, Hereditary, Lacrimal Apparatus Diseases, Eye Diseases, Corneal Diseases
4 Leg Injuries, Hip Injuries, Arm Injuries, Fractures, Bone, Spinal Injuries, Back Injuries
5 Fetal Diseases, Rupture, Pregnancy Complications
6 Genetic Variation, Cellular Structures, Ploidies, Genetic Structures
7 Bone Diseases, Endocrine System Diseases, Dwarfism, Pituitary Diseases
8 Skin Diseases, Parasitic, Helminthiasis, Protozoan Infections
9 Biological Processes, Connective Tissue, Burns
10 Dissociative Disorders, Nervous System Diseases, Autonomic Nervous System Diseases, Somatoform Disorders
11 Radiation Injuries, Environment, Public Health
12 Uveal Diseases, Eye Neoplasms, Retinal Diseases
13 Eye Infections, Viral, Conjunctival Diseases, Eye Infections
14 Tumor Virus Infections, Mycoses, Neoplasms, Experimental
15 Neoplasms, Multiple Primary, Neoplastic Syndromes, Hereditary, Nervous System Malformations
16 Rheumatic Diseases, Joint Diseases, Encephalitis, Viral, Central Nervous System Viral Diseases, Connective Tissue Diseases, Poisoning, Autoimmune Diseases, Demyelinating Diseases, Autoimmune Diseases of the Nervous System, Arbovirus Infections, Neurotoxicity Syndromes
17 Nutrition Disorders, Neurodegenerative Diseases, Anthropometry, Body Constitution, Physiological Processes, Delirium, Dementia, Amnestic, Cognitive Disorders, Neurologic Manifestations, Central Nervous System Diseases, Diagnostic Techniques and Procedures, Neurobehavioral Manifestations, Signs and Symptoms
18 Vascular Diseases, Pathologic Processes, Heart Diseases
19 Craniocerebral Trauma, Spinal Cord Injuries, Trauma, Nervous System
20 Musculoskeletal Abnormalities, Temporomandibular Joint Disorders, Stomatognathic System Abnormalities, Jaw Diseases
21 Ear Diseases, Vision Disorders, Lens Diseases, Refractive Errors, Ocular Hypertension
22 Foot Diseases, Fasciitis, Foot Deformities
23 Wounds, Nonpenetrating, Wounds and Injuries
24 Mental Disorders Diagnosed in Childhood, Schizophrenia and Disorders with Psychotic Features, Personality Disorders, Mental Disorders
25 Immune System Processes, Purpura, Thrombocytopenic
26 Nervous System Physiological Phenomena, Psychophysiology
27 Pharyngeal Diseases, Mouth Diseases, Tooth Diseases
28 Pathological Conditions, Anatomical, Biliary Tract Diseases
29 Muscular Diseases, Tendon Injuries, Neuromuscular Diseases
30 Diabetes Mellitus, Metabolic Diseases
31 Infection, Bacterial Infections
32 Congenital Abnormalities, Cardiovascular Abnormalities
33 Neoplasms, Second Primary, Neoplastic Processes
34 Respiratory Tract Infections, Nose Diseases
35 Respiratory Tract Neoplasms, Bronchial Diseases, Lung Diseases, Hypersensitivity, Respiratory Hypersensitivity
36 Digestive System Neoplasms, Pancreatic Diseases, Endocrine Gland Neoplasms, Infant, Newborn, Diseases, Gonadal Disorders, Lymphatic Diseases, Hematologic Diseases, Gastrointestinal Diseases, Immunoproliferative Disorders, Neoplasms by Histologic Type, Neoplasms by Site
37 Urogenital Neoplasms, Genital Diseases, Male
38 Female Urogenital Diseases, Urologic Diseases
39 Graft vs Host Disease, Thoracic Injuries, Cardiovascular Diseases, Pregnancy Complications, Neoplastic, Precancerous Conditions, Reproductive Physiological Phenomena, Respiration Disorders, Digestive System Diseases, Eyelid Diseases, Skin Diseases, Impulse Control Disorders, Digestive System Fistula, Eating Disorders, Urogenital Abnormalities, Cell Physiological Processes, Paraneoplastic Syndromes, Musculoskeletal Diseases, Parathyroid Diseases, Sprains and Strains, Neoplasms, Wound Infection, Cysts, Tennis Elbow, Respiratory Physiological Phenomena, Peritoneal Diseases, Opportunistic Infections, Neoplastic Cells, Circulating, Fatigue Syndrome, Chronic, Sleep Disorders, Chronobiology Disorders, Liver Diseases, Cartilage Diseases, Nervous System Neoplasms, Immune System Diseases, Dislocations, Adrenal Gland Diseases, Sexual and Gender Disorders, Genetic Diseases, Inborn, Pharmacological Phenomena, Thyroid Diseases, Respiratory Tract Diseases, Digestive System Abnormalities, Pleural Diseases, Lacerations

In [ ]: