Combine SNP Abundances with Network Degrees


In [1]:
import gzip

import pandas
import numpy
import matplotlib.pyplot
import seaborn

%matplotlib inline

In [2]:
# Load the per-gene degree table and the network summary, then prefix each
# degree row with the abbreviation of its metaedge.
abbrev_df = pandas.read_csv('download/network-summary.tsv', sep='\t')
degree_df = pandas.read_csv('data/gene-degrees.tsv.gz', sep='\t')
# No explicit key is given, so the join uses whichever column names the two
# frames have in common.
degree_df = pandas.merge(abbrev_df.loc[:, ['metaedge', 'abbreviation']], degree_df)
degree_df.head(2)


Out[2]:
metaedge abbreviation entrez_gene_id symbol degree
0 gene - participation - pathway GpPW 1 A1BG 1
1 gene - participation - pathway GpPW 29974 A1CF 1

In [3]:
# Load SNPs-per-gene counts for the genotyping platforms, keeping only the
# columns from position 3 onward.
# NOTE(review): the three leading columns discarded here are not visible in
# this notebook -- confirm against data/platforms/combined.tsv.
snp_count_df = pandas.read_csv('data/platforms/combined.tsv', sep='\t').iloc[:, 3:]
snp_count_df.head(2)


Out[3]:
entrez_gene_id snps_hh550 snps_ho1 snps_affy500 snps_exac snps_kg3
0 79501 0 0 0 4 10
1 148398 0 13 0 9 159

In [4]:
# Attach the platform SNP counts to every (gene, metaedge) degree row. The join
# key is spelled out so the merge cannot silently start matching on additional
# columns if either input table later gains a column with a shared name.
long_df = degree_df.merge(snp_count_df, on='entrez_gene_id')

# Each gene must appear at most once per metaedge after the join. Series.any()
# is evaluated by pandas rather than by iterating the Series element-wise with
# the builtin any().
assert not long_df.duplicated(['entrez_gene_id', 'metaedge']).any(), \
    'duplicate (entrez_gene_id, metaedge) rows after merging degrees with SNP counts'

# Add a log10(1 + x) companion column for the degree and for each platform's
# SNP count; the +1 keeps zero counts finite.
for column in 'degree', 'snps_hh550', 'snps_ho1', 'snps_affy500', 'snps_exac', 'snps_kg3':
    long_df['{}_log'.format(column)] = numpy.log10(1 + long_df[column])

# Persist the combined table as a gzipped TSV; floats are written with five
# significant digits.
with gzip.open('data/combined.tsv.gz', 'wt') as write_file:
    long_df.to_csv(write_file, index=False, sep='\t', float_format='%.5g')
long_df.head(2)


Out[4]:
metaedge abbreviation entrez_gene_id symbol degree snps_hh550 snps_ho1 snps_affy500 snps_exac snps_kg3 degree_log snps_hh550_log snps_ho1_log snps_affy500_log snps_exac_log snps_kg3_log
0 gene - participation - pathway GpPW 1 A1BG 1 4 12 1 6 38 0.30103 0.69897 1.113943 0.30103 0.845098 1.591065
1 gene - downregulation - compound GdC 1 A1BG 0 4 12 1 6 38 0.00000 0.69897 1.113943 0.30103 0.845098 1.591065

Visualization


In [5]:
# Reload the combined table from disk so the plotting cells below can run
# without re-executing the merge; values carry the precision written to the file.
long_df = pandas.read_csv('data/combined.tsv.gz', sep='\t')

In [6]:
# Hexbin joint plot of the snps_hh550_log and snps_ho1_log columns. The 'white'
# axes style is applied only inside this block.
with seaborn.axes_style('white'):
    # `height` is the current name of the figure-size argument: seaborn 0.9
    # renamed `size` to `height` and deprecated the old spelling.
    grid = seaborn.jointplot(
        x='snps_hh550_log',
        y='snps_ho1_log',
        data=long_df,
        kind='hex',
        gridsize=25,
        height=4,
    )



In [7]:
# Pairwise comparison of the log-transformed SNP counts across platforms.
seaborn.set_style('white')

# Drop incomplete rows, then keep a single row per gene so that each gene
# contributes once to every panel.
plot_df = long_df.dropna().drop_duplicates(subset='entrez_gene_id')

snp_log_columns = [
    'snps_hh550_log',
    'snps_ho1_log',
    'snps_affy500_log',
    'snps_exac_log',
    'snps_kg3_log',
]
grid = seaborn.PairGrid(plot_df, vars=snp_log_columns)

# Diagonal: histogram of each platform (no KDE overlay).
# NOTE(review): seaborn.distplot is deprecated as of seaborn 0.11 in favour of
# histplot -- revisit when the pinned seaborn version is upgraded.
grid.map_diag(seaborn.distplot, kde=False)

# Lower triangle: hexbin density for each pair of platforms.
grid.map_lower(matplotlib.pyplot.hexbin, cmap='Blues', gridsize=25, linewidths=0)

# Remove all four spines; top and right are removed by despine's defaults.
seaborn.despine(left=True, bottom=True)