In [1]:
import math
import itertools

import numpy
import pandas

In [2]:
# Disease Ontology terms deliberately excluded from the analysis.
doid_exclusions = {
    # inflammatory bowel disease: redundant with ulcerative colitis and Crohn's
    'DOID:0050589',
    # immune system disease: too non-specific
    'DOID:2914',
}

In [3]:
def overlaps(locus_0, locus_1):
    """
    Return whether two genomic regions (loci) overlap.

    Each locus is a (chromosome, lower_coordinate, upper_coordinate) tuple.
    Coordinates are treated as inclusive, so loci that merely touch at an
    endpoint count as overlapping. Loci on different chromosomes never overlap.
    """
    chrom_a, low_a, high_a = locus_0
    chrom_b, low_b, high_b = locus_1
    same_chromosome = chrom_a == chrom_b
    # Two intervals intersect when the larger lower bound does not exceed
    # the smaller upper bound.
    return same_chromosome and max(low_a, low_b) <= min(high_a, high_b)

In [4]:
# Read associations from the GWAS catalog, pinned to a specific commit for
# reproducibility.
url = 'https://raw.githubusercontent.com/dhimmel/gwas-catalog/a5aa4910708a3995501ebe4136d8b9d601463fa1/data/snp-associations.tsv'
# read_table is deprecated; read_csv with an explicit tab separator is the
# supported equivalent.
snp_df = pandas.read_csv(url, sep='\t')
# Invert the boolean mask with ~; negating a boolean Series with unary minus
# is deprecated and raises in modern pandas.
snp_df = snp_df[~snp_df.doid_code.isin(doid_exclusions)]

# Check that coordinates are well-formed (lower bound never exceeds upper)
assert all(snp_df.lower_coord <= snp_df.upper_coord)

In [5]:
# Restrict to high-confidence associations: genome-wide significance
# (p-value <= 5e-8) and a study sample size of at least 1000.
genome_wide_significant = snp_df.mlog_pvalue >= -math.log10(5e-8)
snp_df = snp_df[genome_wide_significant]
adequately_powered = snp_df.samples >= 1000
snp_df = snp_df[adequately_powered]

In [6]:
# Keep a single association per (disease, locus) pair and save the loci.
snp_df = snp_df.drop_duplicates(['doid_code', 'locus'])
locus_columns = ['doid_code', 'doid_name', 'lead_chrom', 'lower_coord', 'upper_coord']
snp_df = snp_df[locus_columns]
snp_df.to_csv('data/loci.tsv', sep='\t', index=False)

In [7]:
# Calculate loci per disease. groupby().size() is the idiomatic (and much
# faster) replacement for apply(lambda df: len(df)).
count_df = (snp_df
    .groupby(['doid_code', 'doid_name'])
    .size()
    .reset_index(name='count')
)
# DataFrame.sort() was removed in pandas 0.20; sort_values is the replacement.
count_df = count_df.sort_values('count', ascending=False)
count_df.to_csv('data/loci-counts.tsv', sep='\t', index=False)

# Filter diseases without at least 3 associations
doid_ids = sorted(count_df.doid_code[count_df['count'] >= 3])
snp_df = snp_df[snp_df.doid_code.isin(doid_ids)]

In [8]:
# Count the number of overlapping loci per locus. Every locus overlaps
# itself, so each count is at least 1.
coordinate_columns = ['lead_chrom', 'lower_coord', 'upper_coord']
snp_df['locus_tuple'] = [tuple(row) for _, row in snp_df[coordinate_columns].iterrows()]
locus_to_count = {
    locus: sum(overlaps(locus, other) for other in snp_df.locus_tuple)
    for locus in set(snp_df.locus_tuple)
}

In [9]:
# Compute disease-disease similarities as an overlap-weighted Dice coefficient.
dice_df = pandas.DataFrame(index=doid_ids, columns=doid_ids)
dice_df.index.name = 'doid_id'

def locus_weight(locus):
    """Down-weight loci that overlap many other loci (inverse-sqrt of count)."""
    return locus_to_count[locus] ** -0.5

# Iterate over all ordered disease pairs. The matrix is not guaranteed to be
# symmetric: `shared` counts loci of disease 0 that hit disease 1, and one
# locus may overlap several on the other side.
disease_groups = snp_df.groupby(['doid_code', 'doid_name'])
for group_0, group_1 in itertools.product(disease_groups, repeat=2):
    (doid_id_0, _), df_0 = group_0
    (doid_id_1, _), df_1 = group_1
    # Loci of disease 0 that overlap at least one locus of disease 1
    shared = [locus for locus in df_0.locus_tuple
              if any(overlaps(locus, other) for other in df_1.locus_tuple)]
    total = list(df_0.locus_tuple) + list(df_1.locus_tuple)
    # Weighted Dice: 2 * weight(shared) / weight(all loci of both diseases)
    dice_weighted = 2.0 * sum(map(locus_weight, shared)) / sum(map(locus_weight, total))
    dice_df.loc[doid_id_0, doid_id_1] = round(dice_weighted, 6)

In [10]:
# Persist the similarity matrix as a TSV with doid_id as the first column.
similarity_out_df = dice_df.reset_index()
similarity_out_df.to_csv('data/disease-similarity.tsv', index=False, sep='\t', float_format='%.5f')

In [11]:
# Preview the first five rows of the disease-disease similarity matrix
dice_df.head()


Out[11]:
DOID:0050156 DOID:0050425 DOID:0050741 DOID:1024 DOID:10286 DOID:1040 DOID:10608 DOID:10652 DOID:1067 DOID:10763 ... DOID:9074 DOID:9206 DOID:9296 DOID:9352 DOID:9538 DOID:9744 DOID:9835 DOID:986 DOID:9952 DOID:9970
doid_id
DOID:0050156 1 0 0 0 0.014564 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
DOID:0050425 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
DOID:0050741 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
DOID:1024 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
DOID:10286 0.015954 0 0 0 1 0.01497 0 0.016684 0 0 ... 0 0 0.014018 0.0176 0 0.017189 0 0 0 0

5 rows × 90 columns