In [1]:
import numpy
import pandas

import rwwr

In [2]:
# Restart probability for the random walk; passed as the first argument to
# rwwr.walk in the per-disease loop.
# NOTE(review): presumably the per-step chance of jumping back to the seed
# distribution -- confirm against the rwwr.walk implementation.
r = 0.2

In [3]:
# Load the disease-by-disease genetic overlap table (tab-separated); the first
# column of the file becomes the row index.
dice_df = pandas.read_csv('data/disease-similarity.tsv', sep='\t', index_col=0)
# Number of rows (diseases) loaded
dice_df.shape[0]


Out[3]:
90

In [4]:
# Keep only diseases whose column has more than one positive similarity value.
# NOTE(review): presumably one positive entry per column is the disease's
# similarity with itself, so "> 1" means "similar to at least one other
# disease" -- confirm against the input table.
doid_ids = dice_df.columns[dice_df.gt(0).sum().gt(1)]
# Restrict both rows and columns to the retained diseases
dice_df = dice_df.loc[doid_ids, doid_ids]
# Number of diseases remaining after filtering
dice_df.shape[0]


Out[4]:
82

In [5]:
# Run a random walk for each disease, using that disease as the seed.
# Each iteration contributes one (source_id, target_id, proximity) record per
# remaining disease; the records are collected in a list and turned into a
# DataFrame once at the end.
rows = list()

for doid_id in dice_df.columns:
    # DataFrame.drop returns a new frame, so dice_df itself is never modified
    df = dice_df.drop(doid_id, axis=0) # drop the seed disease's row
    # Seed vector: the seed disease's similarity to every other disease
    seed = df[doid_id]
    df = df.drop(doid_id, axis=1) # drop the seed disease's column
    # .values is used instead of DataFrame.as_matrix(), which was removed in
    # pandas 1.0; both give the same ndarray.
    mat = df.values
    # NOTE(review): probs is zipped against df.columns below, so it is
    # presumably one proximity per remaining disease in column order; steps is
    # not used in this cell -- confirm both against rwwr.walk.
    probs, steps = rwwr.walk(r, seed, mat)
    rows.extend(zip([doid_id] * len(df), df.columns, probs))

rw_df = pandas.DataFrame(rows, columns=['source_id', 'target_id', 'proximity'])

In [6]:
# Add Disease Ontology names for both the source and the target disease
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/term-names.tsv'
# Keep only rows whose type is 'name', then discard the now-constant 'type'
# column. axis is passed by keyword because pandas 2.0 no longer accepts it
# positionally in DataFrame.drop.
doid_df = pandas.read_table(url).query("type == 'name'").drop('type', axis=1)
s_df = doid_df.rename(columns={'doid': 'source_id', 'name': 'source_name'})
t_df = doid_df.rename(columns={'doid': 'target_id', 'name': 'target_name'})
# Default (inner) merges on the shared columns: first attach target names,
# then source names. Pairs whose ID has no name row are dropped. The join keys
# are left implicit on purpose so that re-running this cell on an
# already-annotated rw_df joins on the name columns too instead of
# duplicating them.
rw_df = s_df.merge(t_df.merge(rw_df))

In [7]:
# Sort by source_name (ascending), then by proximity (descending) within each
# source disease. DataFrame.sort was removed in pandas 0.20; sort_values is
# its replacement and takes the same by/ascending arguments.
rw_df = rw_df.sort_values(['source_name', 'proximity'], ascending=[True, False])
# Save as a tsv without the index, with floats written to six decimal places
rw_df.to_csv('data/proximities.tsv', sep='\t', index=False, float_format='%.6f')

In [8]:
# Show the first 15 rows for multiple sclerosis (MS) as the source disease;
# rw_df is already sorted by descending proximity within each source, so these
# are the 15 highest-proximity targets.
rw_df.query("source_name == 'multiple sclerosis'").iloc[:15]


Out[8]:
source_id source_name target_id target_name proximity
1411 DOID:2377 multiple sclerosis DOID:10608 celiac disease 0.080378
1385 DOID:2377 multiple sclerosis DOID:8778 Crohn's disease 0.061036
1407 DOID:2377 multiple sclerosis DOID:12236 primary biliary cirrhosis 0.052170
1457 DOID:2377 multiple sclerosis DOID:9744 type 1 diabetes mellitus 0.049153
1404 DOID:2377 multiple sclerosis DOID:8577 ulcerative colitis 0.045905
1395 DOID:2377 multiple sclerosis DOID:7148 rheumatoid arthritis 0.040804
1454 DOID:2377 multiple sclerosis DOID:2841 asthma 0.040312
1423 DOID:2377 multiple sclerosis DOID:8893 psoriasis 0.037900
1394 DOID:2377 multiple sclerosis DOID:7147 ankylosing spondylitis 0.035559
1409 DOID:2377 multiple sclerosis DOID:4481 allergic rhinitis 0.035437
1431 DOID:2377 multiple sclerosis DOID:8567 Hodgkin's lymphoma 0.031840
1452 DOID:2377 multiple sclerosis DOID:9074 systemic lupus erythematosus 0.029657
1402 DOID:2377 multiple sclerosis DOID:12306 vitiligo 0.029418
1448 DOID:2377 multiple sclerosis DOID:5082 liver cirrhosis 0.029288
1410 DOID:2377 multiple sclerosis DOID:332 amyotrophic lateral sclerosis 0.026867