In [1]:
import math
import itertools
import numpy
import pandas
In [2]:
# Disease Ontology terms to drop from the analysis: redundant or overly broad.
doid_exclusions = {
    # inflammatory bowel disease: redundant with its subtypes (UC, Crohn's)
    'DOID:0050589',
    # immune system disease: too non-specific to be informative
    'DOID:2914',
}
In [3]:
def overlaps(locus_0, locus_1):
    """
    Return whether two genomic regions (loci) overlap.

    Each locus is a (chromosome, lower_coordinate, upper_coordinate) tuple.
    Loci on different chromosomes never overlap; on the same chromosome,
    coordinate ranges are treated as inclusive at both endpoints.
    """
    chrom_0, lower_0, upper_0 = locus_0
    chrom_1, lower_1, upper_1 = locus_1
    # Two inclusive intervals intersect unless one ends before the other starts.
    return chrom_0 == chrom_1 and not (upper_0 < lower_1 or upper_1 < lower_0)
In [4]:
# Read associations from the GWAS catalog, pinned to a specific commit for
# reproducibility.
url = 'https://raw.githubusercontent.com/dhimmel/gwas-catalog/a5aa4910708a3995501ebe4136d8b9d601463fa1/data/snp-associations.tsv'
# read_table is deprecated; read_csv with an explicit tab separator is the
# supported equivalent.
snp_df = pandas.read_csv(url, sep='\t')
# Drop excluded diseases. Boolean negation of a Series requires `~`; the
# original unary `-` is no longer supported by pandas and raises TypeError.
snp_df = snp_df[~snp_df.doid_code.isin(doid_exclusions)]
# Check that coordinates are well-formed (lower bound never exceeds upper)
assert (snp_df.lower_coord <= snp_df.upper_coord).all()
In [5]:
# Restrict to high-confidence associations: genome-wide significant
# (p <= 5e-8, expressed on the -log10 scale) from studies with >= 1000 samples.
snp_df = snp_df[(snp_df.mlog_pvalue >= -math.log10(5e-8)) & (snp_df.samples >= 1000)]
In [6]:
# Collapse to a single association per (disease, locus) pair, keep only the
# columns needed downstream, and persist the loci table.
snp_df = snp_df.drop_duplicates(subset=['doid_code', 'locus'])
kept_columns = ['doid_code', 'doid_name', 'lead_chrom', 'lower_coord', 'upper_coord']
snp_df = snp_df[kept_columns]
snp_df.to_csv('data/loci.tsv', sep='\t', index=False)
In [7]:
# Calculate the number of loci per disease.
# groupby().size() replaces the slower apply(lambda ...) construction.
count_df = (snp_df
    .groupby(['doid_code', 'doid_name'])
    .size()
    .rename('count')
    .reset_index()
)
# DataFrame.sort() was removed in pandas 0.20; sort_values is its replacement.
count_df = count_df.sort_values('count', ascending=False)
count_df.to_csv('data/loci-counts.tsv', sep='\t', index=False)
# Filter diseases without at least 3 associations
doid_ids = sorted(count_df.doid_code[count_df['count'] >= 3])
snp_df = snp_df[snp_df.doid_code.isin(doid_ids)]
In [8]:
# Count, for each distinct locus, how many loci in the dataset overlap it.
# Every locus overlaps itself, so each count is at least 1.
snp_df['locus_tuple'] = list(zip(snp_df.lead_chrom, snp_df.lower_coord, snp_df.upper_coord))
all_loci = list(snp_df.locus_tuple)
locus_to_count = {
    locus: sum(overlaps(locus, other) for other in all_loci)
    for locus in set(all_loci)
}
In [9]:
# Compute pairwise disease-disease similarities as a weighted Dice coefficient
# over overlapping loci. Each locus is down-weighted by the inverse square
# root of its overlap count, so promiscuous loci contribute less.
dice_df = pandas.DataFrame(index=doid_ids, columns=doid_ids)
dice_df.index.name = 'doid_id'

def locus_weight(locus):
    """Weight of a locus: inverse square root of its overlap count."""
    return locus_to_count[locus] ** -0.5

for group_0, group_1 in itertools.product(snp_df.groupby(['doid_code', 'doid_name']), repeat=2):
    (doid_id_0, doid_name_0), df_0 = group_0
    (doid_id_1, doid_name_1), df_1 = group_1
    # Loci of disease 0 that overlap at least one locus of disease 1
    shared = [locus for locus in df_0.locus_tuple
              if any(overlaps(locus, l) for l in df_1.locus_tuple)]
    total = list(df_0.locus_tuple) + list(df_1.locus_tuple)
    # Weighted Dice coefficient: 2 * weight(shared) / weight(total).
    # (The unweighted dice = 2 * len(shared) / len(total) was previously
    # computed here but never used, so it has been removed.)
    dice_weighted = 2.0 * sum(map(locus_weight, shared)) / sum(map(locus_weight, total))
    dice_df.loc[doid_id_0, doid_id_1] = round(dice_weighted, 6)
In [10]:
# Persist the disease similarity matrix as TSV, with the doid_id index
# promoted to an ordinary column.
similarity_path = 'data/disease-similarity.tsv'
dice_df.reset_index().to_csv(similarity_path, index=False, sep='\t', float_format='%.5f')
In [11]:
# Preview the first rows of the disease similarity matrix
dice_df.head()
Out[11]: