Visualizing correlates of 'SNPs in gene'


In [1]:
library(dplyr, warn=FALSE)
library(readr)
library(tidyr)
library(ggplot2)

In [2]:
platforms = c(
  'snps_affy500_log'='Affymetrix 500K Set',
  'snps_hh550_log'='Illumina HumanHap550',
  'snps_ho1_log'='Illum. HumanOmni1',
  'snps_exac_log'='ExAC',
  'snps_kg3_log'='1000 Genomes Phase 3'
)
platform_df = dplyr::data_frame(
  platform = names(platforms),
  platform_name = as.character(platforms)
)
platform_df$platform_name = factor(platform_df$platform_name, levels = platform_df$platform_name)
platform_df


Out[2]:
platformplatform_name
1snps_affy500_logAffymetrix 500K Set
2snps_hh550_logIllumina HumanHap550
3snps_ho1_logIllum. HumanOmni1
4snps_exac_logExAC
5snps_kg3_log1000 Genomes Phase 3

In [3]:
combined_df = readr::read_tsv('data/combined.tsv.gz')
head(combined_df, 2)


Out[3]:
metaedgeabbreviationentrez_gene_idsymboldegreesnps_hh550snps_ho1snps_affy500snps_exacsnps_kg3degree_logsnps_hh550_logsnps_ho1_logsnps_affy500_logsnps_exac_logsnps_kg3_log
1gene - participation - pathwayGpPW1A1BG141216380.301030.698971.11390.301030.84511.5911
2gene - downregulation - compoundGdC1A1BG0412163800.698971.11390.301030.84511.5911

In [4]:
gathered_df = combined_df %>%
  dplyr::select(-(snps_hh550:snps_kg3)) %>%
  tidyr::gather(platform, snps_log, snps_hh550_log:snps_kg3_log) %>%
  dplyr::inner_join(platform_df)

head(gathered_df, 2)


Joining by: "platform"
Warning message:
In inner_join_impl(x, y, by$x, by$y): joining factor and character vector, coercing into character vector
Out[4]:
metaedgeabbreviationentrez_gene_idsymboldegreedegree_logplatformsnps_logplatform_name
1gene - participation - pathwayGpPW1A1BG10.30103snps_hh550_log0.69897Illumina HumanHap550
2gene - downregulation - compoundGdC1A1BG00snps_hh550_log0.69897Illumina HumanHap550

Remove top percentile of data points

Since, the SNP abudance distribution has tails containing few points, accurate estimates of degree cannot be estimated. Ideally, we could just crop the x-axis to exclude the extremes. However, ggplot2 does not currently support this operation for facetted plots. Therefore, we identify the 2nd and 98th percentile and exclude points beyond this boundary. Note that if at least 2% of SNP abundances are 0, then no data is excluded from the left-tail of SNP abundances.


In [5]:
percentile_df = gathered_df %>%
  dplyr::group_by(platform) %>%
  dplyr::summarize(
    percentile_02 = quantile(snps_log, 0.02),
    percentile_98 = quantile(snps_log, 0.98)
  )
percentile_df


Out[5]:
platformpercentile_02percentile_98
1snps_affy500_log01.9956
2snps_exac_log01.3802
3snps_hh550_log02.0645
4snps_ho1_log0.778152.2304
5snps_kg3_log1.27883.0734

In [6]:
trimmed_df = gathered_df %>%
  dplyr::inner_join(percentile_df) %>%
  dplyr::filter(snps_log >= percentile_02,
                snps_log <= percentile_98)


Joining by: "platform"

Visualization helpers


In [7]:
metaedge_subset = c(
  'gene - association - disease',
  'gene - participation - biological process',
  'gene - participation - molecular function',
  'gene - participation - cellular component',
  'gene - participation - pathway',
  'gene - expression - anatomy',
  'gene - interaction - gene',
  'gene - regulation - perturbation'
)

In [8]:
log10p = function(x) {log10(1 + x)}
snp_labels = round(10 ** seq(0, max(percentile_df$percentile_98), 0.4) - 1)

In [9]:
gg_base = function(gg) {
    gg = gg + 
      ggplot2::theme_bw() + 
      ggplot2::theme(strip.background=ggplot2::element_rect(fill='#FEF2E2')) +
      ggplot2::theme(plot.margin = grid::unit(c(2, 2, 2, 2), 'points'))
    return(gg)
}

In [10]:
saveall <- function(gg, name, dpi=300, w=7, h=7) {
  # Save a ggplot2 object as svg, pdf, and png
  path = file.path('figure', name)
  ggplot2::ggsave(paste0(path, '.svg'), gg, width=w, height=h, limitsize=FALSE)
  ggplot2::ggsave(paste0(path, '.pdf'), gg, width=w, height=h, limitsize=FALSE)
  ggplot2::ggsave(paste0(path, '.png'), gg, dpi=dpi, width=w, height=h, limitsize=FALSE)
}

Network degree versus SNPs in gene


In [11]:
w = 8; h = 7
degree_labels = round(10 ** seq(0, 3, 0.32) - 1)
options(repr.plot.width=w, repr.plot.height = h)
gg_0 = trimmed_df %>%
  dplyr::filter(metaedge %in% metaedge_subset) %>%
  ggplot2::ggplot(aes(x=snps_log, y=degree_log, fill=metaedge)) %>% gg_base() +
  ggplot2::facet_grid(. ~ platform_name, space = 'free_x', scales='free_x') +
  ggplot2::geom_smooth(method='gam', formula=formula("y ~ s(x, bs='cs')"), linetype=0, n=1000, sp=50) +
  ggplot2::theme(legend.position='top', legend.direction='vertical',
                 legend.key=ggplot2::element_rect(color='white'),
                 legend.key.height = grid::unit(0.4, 'cm')) +
  ggplot2::scale_x_continuous(breaks=log10p(snp_labels), labels=snp_labels, name='SNPs in Gene') +
  ggplot2::scale_y_continuous(breaks=log10p(degree_labels), labels=degree_labels, name='Network Degree') +
  ggplot2::guides(fill = ggplot2::guide_legend(ncol = 2, title = NULL))

saveall(gg_0, 'degree-v-snps', w=w, h=h)
gg_0


Network degree (as a percent of mean) versus SNPs in gene


In [12]:
w = 8; h = 7
options(repr.plot.width=w, repr.plot.height = h)
gg_1 = trimmed_df %>%
  dplyr::group_by(metaedge) %>%
  dplyr::mutate(degree_percent = 100 * degree_log / mean(degree_log)) %>%
  dplyr::ungroup() %>%
  dplyr::filter(metaedge %in% metaedge_subset) %>%
  ggplot2::ggplot(aes(x=snps_log, y=degree_percent, fill=metaedge)) %>% gg_base() +
  ggplot2::facet_grid(. ~ platform_name, space = 'free_x',  scales='free_x') +
  ggplot2::geom_smooth(method='gam', formula=formula("y ~ s(x, bs='cs')"), linetype=0, n=1000, sp=50) +
  ggplot2::coord_cartesian(ylim=c(60, 140)) +
  ggplot2::theme(legend.position='top', legend.direction='vertical',
                 legend.key=ggplot2::element_rect(color='white'),
                 legend.key.height = grid::unit(0.4, 'cm')) +
  ggplot2::scale_x_continuous(breaks=log10p(snp_labels), labels=snp_labels, name='SNPs in Gene') +
  ggplot2::ylab('Percent of Mean Degree') +
  ggplot2::guides(fill = ggplot2::guide_legend(ncol = 2, title = NULL))

saveall(gg_1, 'adj-degree-v-snps', w=w, h=h)
gg_1


Network degree versus SNPs in Gene for all metaedges


In [13]:
metaedge_df = readr::read_tsv('download/network-summary.tsv') %>%
  dplyr::filter(edges >= 1000)

In [14]:
snp_labels = round(10 ** seq(0, max(gathered_df$snps_log), 0.6) - 1)

In [15]:
w = 8.25; h = 60
degree_labeler = function(x) {format(round(10 ** x - 1, 2), nsmall=2)}

degree_labels = round(10 ** seq(0, 3, 0.01) - 1, )
options(repr.plot.width=w, repr.plot.height=h)
gg_2 = gathered_df %>%
  dplyr::filter(metaedge %in% metaedge_df$metaedge) %>%
  ggplot2::ggplot(aes(x=snps_log, y=degree_log)) %>% gg_base() +
  ggplot2::facet_grid(abbreviation ~ platform_name, scales='free', space='free') +
  ggplot2::geom_smooth(method='gam', formula=formula("y ~ s(x, bs='cs')"), n=200, linetype=0, fill='#00693E', sp=15) +
  ggplot2::scale_x_continuous(breaks=log10p(snp_labels), labels=snp_labels, name='SNPs in Gene') +
  ggplot2::scale_y_continuous(labels=degree_labeler, name='Network Degree')

saveall(gg_2, 'degree-v-snps-all', w=w, h=h)
gg_2



In [16]:
# See metaedge abbreviations here
metaedge_df


Out[16]:
metaedgeabbreviationinvertededgessource_nodestarget_nodesunbiased
1gene - participation - pathwayGpPW079101951116150
2gene - downregulation - compoundGdC12031292374820312
3gene - variation - diseaseGvD11284926911284
4gene - expression - anatomyGeA098786218041194914193
5gene - participation - biological processGpBP053400414533110140
6gene < knockdown downregulation < geneG11409429574207140942
7gene - upregulation - compoundGuC11726195372917261
8gene < knockdown upregulation < geneG178752970406878752
9gene - participation - molecular functionGpMF0938571289028260
10gene > overexpression downregulation > geneGod>G014911166083014911
11gene > knockdown downregulation > geneGkd>G01409424207957140942
12gene > overexpression upregulation > geneGup>G015292186290915292
13gene - participation - cellular componentGpCC0653771026913320
14gene - evolution - geneGeG01233821245312453123382
15gene - target - compoundGtC14603113311820
16gene < overexpression downregulation < geneG114911830166014911
17gene - upregulation - anatomyGuA097848159293697848
18gene - regulation - perturbationGrPB03665021824533950
19gene < overexpression upregulation < geneG115292909186215292
20gene - binding - compoundGbC125025725540
21gene - association - diseaseGaD11182450671340
22gene > knockdown upregulation > geneGku>G078752406897078752
23gene - interaction - geneGiG0297253152111521131575
24gene - downregulation - anatomyGdA01022401509736102240