In [1]:
%load_ext sql
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from collections import Counter
from __future__ import division
In [2]:
# sql connection parameters come from connect.py
# Shell capture: list of matching filenames (empty list when connect.py is absent).
have_connect = !ls connect.py 2>/dev/null
if len(have_connect) == 0:
# NOTE(review): indentation appears lost in this export — the !mv line
# presumably belongs inside the if-block above; confirm against the .ipynb.
!mv ../../connect.py .
# Star import supplies mysqlusername, mysqlpassword, mysqlserver, mysqldbname.
from connect import *
# Open the %sql connection; credentials are interpolated into the DSN string.
get_ipython().magic('sql mysql://' + mysqlusername + ':' + mysqlpassword + '@' + mysqlserver + ':3306/' + mysqldbname)
# (nct_id, mesh_term) rows: one per study/condition pairing
conditions = %sql select nct_id, mesh_term from condition_browse
# (mesh_id, mesh_term) rows for the whole MeSH thesaurus
mesh_lookup = %sql select mesh_id, mesh_term from mesh_thesaurus
In [3]:
# Forward lookup: MeSH id -> term (on duplicate ids the last row wins).
mesh_terms = {mid: term for mid, term in mesh_lookup}

# Reverse lookup: term -> set of every MeSH id carrying that term.
# (renamed loop variable: the original shadowed the builtin `id`)
mesh_ids = {}
for mid, term in mesh_lookup:
    mesh_ids.setdefault(term, set()).add(mid)
In [4]:
# Map each broader MeSH category term to the set of study ids filed under it.
condition_study = {}
for study_id, cond_term in conditions:
    # mid[:7] presumably selects the top-level ancestor in the MeSH id
    # hierarchy — assumption, confirm against the mesh_thesaurus id format
    categories = {mesh_terms[mid[:7]] for mid in mesh_ids[cond_term]}
    for cat in categories:
        condition_study.setdefault(cat, set()).add(study_id)
Remove categories with fewer than 40 studies
In [5]:
# Drop condition categories backed by fewer than 40 studies.
# list(...) snapshots the keys first: deleting from a dict while iterating
# its live key view raises RuntimeError on Python 3 (Python 2's .keys()
# returned a copy, which is why the original happened to work).
for c in list(condition_study.keys()):
    if len(condition_study[c]) < 40:
        del condition_study[c]
In [6]:
def jaccard_similarity(set1, set2):
    """Jaccard similarity of two sets: |A & B| / |A | B|.

    Returns a float in [0, 1].  Defined as 1.0 when both sets are empty
    (two identical sets), which avoids the ZeroDivisionError the naive
    formula raises on an empty union.
    """
    union = set1 | set2
    if not union:
        return 1.0
    return float(len(set1 & set2)) / len(union)
In [7]:
# Pairwise distance matrix between condition categories:
# dist(a, b) = -log10(jaccard(a, b)) clipped at max_dist, 0 on the diagonal,
# and max_dist for pairs that share no studies at all.
max_dist = 10
# list() so the keys are positionally indexable on Python 3 as well
condition_list = list(condition_study.keys())
num_cond = len(condition_list)
df = pd.DataFrame(index=condition_list, columns=condition_list, dtype=np.float32)
for i in range(num_cond):
    for j in range(num_cond):
        cond1 = condition_list[i]
        cond2 = condition_list[j]
        if i == j:
            dist = 0
        elif j < i:
            # the matrix is symmetric: reuse the mirrored entry computed earlier
            dist = df.at[cond1, cond2]
        elif len(condition_study[cond1] & condition_study[cond2]) > 0:
            sim = jaccard_similarity(condition_study[cond1], condition_study[cond2])
            dist = max([np.log10(sim), (-1 * max_dist)]) * -1
        else:
            dist = max_dist
        # .at replaces the original chained-indexing write (df[cond1][cond2] = ...),
        # which depends on df[cond1] being a view and fails silently under
        # pandas copy-on-write; .at[row, col] is a guaranteed in-place scalar set.
        df.at[cond2, cond1] = dist
Iteratively create clusters
In [8]:
# Iterative DBSCAN over the precomputed distance matrix.  Sweep min_samples
# from 10 down to 2; within each sweep tighten eps from 4.0 down to 0.1,
# peeling clusters out of the working matrix as they form, then keep only
# the groups with <= 15 terms before restarting from the full matrix.
clus_df = df.copy()
final_groups = []
groups = []
for min_samp in range(10, 1, -1):
    for step in range(40):
        # .values replaces .as_matrix(), which was removed in pandas 1.0
        db = DBSCAN(eps=(40 - step) / 10.0, min_samples=min_samp, metric='precomputed').fit(clus_df.values)
        cnt = Counter(db.labels_)
        # When the labels are exactly {noise, one big cluster 0} with noise in
        # the majority, peel off cluster 0 too; otherwise only clusters > 0.
        if -1 in cnt and 0 in cnt and len(cnt) == 2 and cnt[-1] > cnt[0]:
            nonzero = {clus_df.index.values[k]: n for k, n in enumerate(db.labels_) if n >= 0}
        else:
            nonzero = {clus_df.index.values[k]: n for k, n in enumerate(db.labels_) if n > 0}
        # Remove the peeled-off terms from both rows and columns;
        # list() because drop() on a dict_keys view is fragile on Python 3.
        for ax in [0, 1]:
            clus_df.drop(list(nonzero.keys()), axis=ax, inplace=True)
        for c in set(nonzero.values()):
            groups.append([k for k, v in nonzero.items() if v == c])
    to_add = [g for g in groups if len(g) <= 15]
    # parenthesized prints work identically on Python 2 and 3 for one argument
    print(min_samp)
    print(to_add)
    final_groups += to_add
    # regenerate dataframe for next round
    clus_df = df.copy()
    to_drop = {m for g in final_groups for m in g}
    for ax in [0, 1]:
        clus_df.drop(list(to_drop), axis=ax, inplace=True)
    groups = []
print('')
print('Total classified: %d out of %d' % (len([f for g in final_groups for f in g]), len(condition_list)))
print('%d clusters' % len(final_groups))
Add the remaining unclustered conditions as one final catch-all group
In [9]:
# Collect every term already placed in a cluster, then gather the leftovers
# into one final group.  (Hoisting the set also avoids rebuilding it once
# per condition inside the comprehension.)
clustered_terms = {member for group in final_groups for member in group}
unclustered = [c for c in condition_list if c not in clustered_terms]
print(unclustered)
final_groups.append(unclustered)
Generate SQL
In [10]:
# Emit one SQL aggregate column per small cluster (<= 15 condition terms).
group_dict = {i: g for i, g in enumerate(final_groups)}
# sorted() over the cluster ids and over the id-prefix set makes the emitted
# SQL deterministic between runs; raw set/dict iteration order is arbitrary.
for group_id in sorted(group_dict.keys()):
    if len(group_dict[group_id]) <= 15:
        # every 7-char MeSH id prefix covered by this cluster's terms
        mesh_cats = set([m[:7] for d in group_dict[group_id] for m in mesh_ids[d]])
        print(" max(case when substr(mesh_id,1,7) in ('%s') then 1 else 0 end) clus_%s," % ("','".join(sorted(mesh_cats)), str(group_id)))
In [11]:
# Final report: one line per cluster — index, then its comma-joined members.
# ('%d %s' reproduces the single-space separator of the original print)
for group_idx, group in enumerate(final_groups):
    print('%d %s' % (group_idx, ', '.join(group)))
In [ ]: