In [1]:
import numpy
import pandas
import rwwr
In [2]:
# Restart probability for the random walk; passed as the first argument
# to rwwr.walk when the walk is run for each disease.
r = 0.2
In [3]:
# Load the genetic overlap between diseases from a tab-separated file,
# using the first column as the row index, then show the number of rows.
dice_df = pandas.read_csv('data/disease-similarity.tsv', sep='\t', index_col=0)
dice_df.shape[0]
Out[3]:
In [4]:
# Keep only the diseases whose column holds more than one positive entry,
# i.e. drop diseases without any similarity.
# NOTE(review): the "> 1" threshold presumably discounts a positive
# self-similarity on the diagonal -- confirm against the input file.
doid_ids = dice_df.columns[dice_df.gt(0).sum().gt(1)]
# Apply the same selection to rows and columns.
dice_df = dice_df.loc[doid_ids, doid_ids]
dice_df.shape[0]
Out[4]:
In [5]:
# Run a random walk with restart once per disease, using that disease as the seed.
# Each walk contributes one (source_id, target_id, proximity) row per remaining disease.
rows = list()
for doid_id in dice_df.columns:
    # Drop the seed disease's row; DataFrame.drop returns a new frame, so
    # dice_df itself is left untouched and no explicit copy is needed.
    df = dice_df.drop(doid_id, axis=0)
    # The seed disease's column (minus its own row) is the seed vector for the walk.
    seed = df[doid_id]
    # Drop the seed disease's column so only the other diseases remain on both axes.
    df = df.drop(doid_id, axis=1)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same ndarray.
    mat = df.values
    # NOTE(review): rwwr.walk is assumed to return one probability per column
    # of `mat`, in column order -- confirm against the rwwr module.
    probs, steps = rwwr.walk(r, seed, mat)
    rows.extend(zip([doid_id] * len(df), df.columns, probs))
rw_df = pandas.DataFrame(rows, columns=['source_id', 'target_id', 'proximity'])
In [6]:
# Add Disease Ontology names for both the source and the target disease.
# The term-names table is pinned to a specific commit so the names are reproducible.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/term-names.tsv'
# Keep only the rows of type 'name', then drop the now-constant 'type' column.
# `axis` is passed by keyword: the positional form drop('type', 1) is rejected by pandas >= 2.0.
doid_df = pandas.read_table(url).query("type == 'name'").drop('type', axis=1)
s_df = doid_df.rename(columns={'doid': 'source_id', 'name': 'source_name'})
t_df = doid_df.rename(columns={'doid': 'target_id', 'name': 'target_name'})
# Both merges use pandas' defaults (inner join on the shared column names),
# so pairs whose source or target has no matching name row are dropped.
# NOTE(review): presumably the only shared columns are target_id and then
# source_id -- confirm the term-names table has no other overlapping columns.
rw_df = s_df.merge(t_df.merge(rw_df))
In [7]:
# Sort by source_name (ascending), then by proximity (descending) within each source.
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
rw_df = rw_df.sort_values(['source_name', 'proximity'], ascending=[True, False])
# Save as a tsv without the index, writing floats with six decimal places
rw_df.to_csv('data/proximities.tsv', sep='\t', index=False, float_format='%.6f')
In [8]:
# Show the first 15 rows whose source disease is multiple sclerosis
rw_df[rw_df['source_name'] == 'multiple sclerosis'].head(15)
Out[8]: