Create and visualize disease similarity and proximity networks


In [1]:
import pandas
import numpy
import networkx
import matplotlib.pyplot as plt

%matplotlib inline

Merge similarity and proximity scores into a single dataframe


In [2]:
# Read the wide similarity table (first column becomes the index) and unpivot
# it into long form: one (source_id, target_id, similarity) row per cell.
dice_df = pandas.read_table('data/disease-similarity.tsv', index_col=0)
similarity_df = pandas.melt(
    dice_df.reset_index(),
    id_vars='doid_id',
    var_name='target_id',
    value_name='similarity',
).rename(columns={'doid_id': 'source_id'})

# Join the proximity rows with the similarity rows on the columns they share.
prox_df = pandas.read_table('data/proximities.tsv')
network_df = pandas.merge(prox_df, similarity_df)

In [3]:
def consolidate(df):
    """
    Collapse a group of rows into a single record.

    Every field is taken from the first row of `df`, except `proximity`
    and `similarity`, which are replaced by their mean over all rows of
    `df`. The input frame is left unmodified.

    Returns a pandas Series holding the collapsed record.
    """
    record = df.iloc[0].copy()
    record['proximity'] = df['proximity'].mean()
    record['similarity'] = df['similarity'].mean()
    return record

# Key every row by the unordered {source, target} pair so that both
# orientations of a disease pair fall into the same group, then collapse
# each group to one row with consolidate().
by_map = network_df.apply(
    lambda row: frozenset((row['source_id'], row['target_id'])),
    axis=1,
)
network_df = network_df.groupby(by_map).apply(consolidate)

In [4]:
# Write the consolidated table as tab-separated text (six decimals, index
# dropped), then preview its first five rows.
network_df.to_csv(
    'data/network.tsv',
    sep='\t',
    float_format='%.6f',
    index=False,
)
network_df.head(n=5)


Out[4]:
source_id source_name target_id target_name proximity similarity
(DOID:10652, DOID:14221) DOID:10652 Alzheimer's disease DOID:14221 metabolic syndrome X 0.150427 0.054377
(DOID:1936, DOID:10652) DOID:10652 Alzheimer's disease DOID:1936 atherosclerosis 0.141439 0.032741
(DOID:10652, DOID:1107) DOID:10652 Alzheimer's disease DOID:1107 esophageal carcinoma 0.069386 0.033051
(DOID:10286, DOID:10652) DOID:10652 Alzheimer's disease DOID:10286 prostate carcinoma 0.050564 0.016684
(DOID:10652, DOID:10871) DOID:10652 Alzheimer's disease DOID:10871 age related macular degeneration 0.044876 0.000000

In [5]:
# Create the HTML table browser by rendering tables.Rmd with R's rmarkdown
! R --quiet -e "rmarkdown::render('tables.Rmd')"


> rmarkdown::render('tables.Rmd')


processing file: tables.Rmd
  |...........                                                      |  17%
  ordinary text without R code

  |......................                                           |  33%
label: unnamed-chunk-1
  |................................                                 |  50%
  ordinary text without R code

  |...........................................                      |  67%
label: unnamed-chunk-2
  |......................................................           |  83%
  ordinary text without R code

  |.................................................................| 100%
label: unnamed-chunk-3

output file: tables.knit.md

/usr/bin/pandoc tables.utf8.md --to html --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash-implicit_figures --output tables.html --smart --email-obfuscation none --self-contained --standalone --section-divs --template /home/dhimmels/R/x86_64-pc-linux-gnu-library/3.2/rmarkdown/rmd/h/default.html --variable 'theme:cosmo' --include-in-header /tmp/RtmpxdseJc/rmarkdown-str2501731b37b.html --mathjax --variable 'mathjax-url:https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --highlight-style pygments 

Output created: tables.html
> 
> 

Construct and plot networkx Graphs


In [6]:
# Divide each metric by its own mean so that both average 1; the rescaled
# values are used as edge weights by the weighted layouts.
for metric in ('proximity', 'similarity'):
    metric_mean = network_df[metric].mean()
    network_df[metric] = network_df[metric] / metric_mean

In [7]:
# Build one undirected, weighted graph per metric from the edge table.
# Pairs whose weight is exactly zero are skipped, so a disease only becomes
# a node of a graph once it has a non-zero edge in that graph.
g_sim = networkx.Graph(name='similarity')
g_prox = networkx.Graph(name='proximity')
graph_by_metric = (('similarity', g_sim), ('proximity', g_prox))

for _, row in network_df.iterrows():
    for column, graph in graph_by_metric:
        weight = row[column]
        if weight == 0:
            continue
        graph.add_edge(row.source_id, row.target_id, weight=weight)
        # add_node() on a node that already exists only updates its
        # attributes. It replaces the `graph.node[...]` mapping, which was
        # removed in networkx 2.4, and behaves the same on networkx 1.x.
        graph.add_node(row.source_id, name=row.source_name)
        graph.add_node(row.target_id, name=row.target_name)

# Print a short summary (name, type, node/edge counts, average degree)
for graph in g_sim, g_prox:
    print(networkx.info(graph), '\n')


Name: similarity
Type: Graph
Number of nodes: 82
Number of edges: 433
Average degree:  10.5610 

Name: proximity
Type: Graph
Number of nodes: 82
Number of edges: 3321
Average degree:  81.0000 


In [8]:
# Stop the notebook here unless the similarity network is a single
# connected component.
similarity_connected = networkx.is_connected(g_sim)
assert similarity_connected

In [9]:
# Manually chosen node colour per disease category. The NaN entry supplies
# the colour for rows whose category is missing.
category_colors = {
    'Autoimmune': '#998EC3',
    'Solid Cancer': '#F1A340',
    'Hematologic Cancer': '#F7F7F7',
    numpy.nan: '#D8D8D8'
}
color_df = pandas.DataFrame(
    list(category_colors.items()),
    columns=['category', 'color'],
)

# Read the manual disease categorizations, attach a colour to each row and
# keep only the diseases that are nodes of the similarity graph.
category_df = pandas.read_table('input/doid-categories.tsv')
category_df = category_df.merge(color_df)
in_similarity_graph = category_df.doid_code.isin(g_sim.nodes())
category_df = category_df[in_similarity_graph]

# Multiple sclerosis is highlighted with its own colour
is_ms = category_df.doid_name == 'multiple sclerosis'
category_df.loc[is_ms, 'color'] = '#423869'

# Nodes to plot, in the same row order as the colour column
nodelist = list(category_df.doid_code)

# Node -> abbreviation label dictionaries, split by the label colour that
# the plots use for each group of categories.
white_rows = category_df[category_df['category'].isin(['Autoimmune', 'Solid Cancer'])]
black_rows = category_df[category_df['category'] == 'Hematologic Cancer']
label_dict_white = dict(zip(white_rows['doid_code'], white_rows['abbreviation']))
label_dict_black = dict(zip(black_rows['doid_code'], black_rows['abbreviation']))

In [10]:
def initially_position(nodes, seed=None):
    """
    Give every node a random starting coordinate.

    Seeds numpy's global random generator with `seed` and then draws one
    (x, y) pair per node. Passing a fixed seed makes the returned
    positions repeatable, which in turn allows reproducible layouts.

    Returns a dict mapping each node to its (x, y) tuple.
    """
    numpy.random.seed(seed)
    coordinates = numpy.random.ranf((len(nodes), 2))
    positions = {}
    for node, xy in zip(nodes, coordinates):
        positions[node] = tuple(xy)
    return positions

Plot the similarity network


In [11]:
# Similarity network figure: seeded spring layout, category-coloured nodes,
# faint edges whose width follows the edge weight, and black labels.
plt.figure(figsize=(5, 5))
plt.axis('off')

# Seeded starting coordinates, refined by a weighted spring layout
start = initially_position(nodelist, 1)
positions = networkx.spring_layout(
    g_sim, pos=start, weight='weight', iterations=500,
    k=1.8 * len(g_sim) ** -0.5)

networkx.draw_networkx_nodes(
    g_sim, pos=positions, nodelist=nodelist,
    node_color=category_df.color, linewidths=0.5)

# Edges are drawn one call at a time so each gets its own line width
for source, target, attrs in g_sim.edges(data=True):
    networkx.draw_networkx_edges(
        g_sim, pos=positions, edgelist=[(source, target)],
        width=attrs['weight'] / 2, alpha=0.1)

networkx.draw_networkx_labels(g_sim, pos=positions, labels=label_dict_black, font_size=8);

# Fixed axis limits with a small margin around the unit square
plt.xlim((-0.035, 1.035))
plt.ylim((-0.035, 1.035))

plt.savefig('figure/similarity-network.svg', bbox_inches='tight')
plt.savefig('figure/similarity-network.png', dpi=450, bbox_inches='tight')


/home/dhimmels/anaconda3/lib/python3.4/site-packages/networkx/drawing/layout.py:269: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if pos==None:

Plot the proximity network


In [12]:
# Proximity network figure: seeded spring layout, category-coloured nodes,
# faint edges whose width follows the edge weight, and abbreviation labels
# in black or white depending on the node's category.
#
# Every drawing call and the layout constant `k` refer to g_prox. The
# earlier version passed g_sim to the edge/label calls and to len(), a
# copy-paste slip from the similarity figure that was only harmless while
# both graphs contained the same nodes.
plt.figure(figsize=(5, 5))
plt.axis('off')

# Seeded starting coordinates, refined by a weighted spring layout
positions = initially_position(nodelist, 1)
positions = networkx.spring_layout(
    g_prox, pos=positions, weight='weight', iterations=500,
    k=3 * len(g_prox) ** -0.5)

# nodelist and category_df.color come from the same frame, so node i is
# drawn with colour i
networkx.draw_networkx_nodes(
    g_prox, pos=positions, nodelist=nodelist,
    node_color=category_df.color, linewidths=0.5)

# Edges are drawn one call at a time so each gets its own line width
for u, v, data in g_prox.edges(data=True):
    networkx.draw_networkx_edges(
        g_prox, pos=positions, alpha=0.1, edgelist=[(u, v)],
        width=data['weight'] / 2)

networkx.draw_networkx_labels(g_prox, pos=positions, labels=label_dict_black, font_size=8, font_color='black');
networkx.draw_networkx_labels(g_prox, pos=positions, labels=label_dict_white, font_size=7, font_color='white');

# Fixed axis limits with a small margin around the unit square
plt.xlim((-0.035, 1.035))
plt.ylim((-0.035, 1.035))
plt.savefig('figure/proximity-network.svg', bbox_inches='tight')
plt.savefig('figure/proximity-network.png', dpi=450, bbox_inches='tight')


/home/dhimmels/anaconda3/lib/python3.4/site-packages/networkx/drawing/layout.py:269: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if pos==None: