Visualizing correlates of 'SNPs in gene'


In [1]:
library(dplyr, warn=FALSE)
library(readr)
library(tidyr)
library(ggplot2)

In [2]:
platforms = c(
  'snps_affy500_log'='Affymetrix 500K Set',
  'snps_hh550_log'='Illumina HumanHap550',
  'snps_ho1_log'='Illum. HumanOmni1',
  'snps_exac_log'='ExAC',
  'snps_kg3_log'='1000 Genomes Phase 3'
)
platform_df = dplyr::data_frame(
  platform = names(platforms),
  platform_name = as.character(platforms)
)
platform_df$platform_name = factor(platform_df$platform_name, levels = platform_df$platform_name)
platform_df


Out[2]:
platformplatform_name
1snps_affy500_logAffymetrix 500K Set
2snps_hh550_logIllumina HumanHap550
3snps_ho1_logIllum. HumanOmni1
4snps_exac_logExAC
5snps_kg3_log1000 Genomes Phase 3

In [3]:
combined_df = readr::read_tsv('data/combined.tsv.gz')
head(combined_df, 2)


Out[3]:
metaedgeabbreviationentrez_gene_idsymboldegreesnps_hh550snps_ho1snps_affy500snps_exacsnps_kg3degree_logsnps_hh550_logsnps_ho1_logsnps_affy500_logsnps_exac_logsnps_kg3_log
1gene - participation - pathwayGpPW1A1BG141216380.301030.698971.11390.301030.84511.5911
2gene - downregulation - compoundGdC1A1BG0412163800.698971.11390.301030.84511.5911

In [4]:
gathered_df = combined_df %>%
  dplyr::select(-(snps_hh550:snps_kg3)) %>%
  tidyr::gather(platform, snps_log, snps_hh550_log:snps_kg3_log) %>%
  dplyr::inner_join(platform_df)

head(gathered_df, 2)


Joining by: "platform"
Warning message:
In inner_join_impl(x, y, by$x, by$y): joining factor and character vector, coercing into character vector
Out[4]:
metaedgeabbreviationentrez_gene_idsymboldegreedegree_logplatformsnps_logplatform_name
1gene - participation - pathwayGpPW1A1BG10.30103snps_hh550_log0.69897Illumina HumanHap550
2gene - downregulation - compoundGdC1A1BG00snps_hh550_log0.69897Illumina HumanHap550

Remove top percentile of data points

Since, the SNP abudance distribution has tails containing few points, accurate estimates of degree cannot be estimated. Ideally, we could just crop the x-axis to exclude the extremes. However, ggplot2 does not currently support this operation for facetted plots. Therefore, we identify the 2nd and 98th percentile and exclude points beyond this boundary. Note that if at least 2% of SNP abundances are 0, then no data is excluded from the left-tail of SNP abundances.


In [5]:
percentile_df = gathered_df %>%
  dplyr::group_by(platform) %>%
  dplyr::summarize(
    percentile_02 = quantile(snps_log, 0.02),
    percentile_98 = quantile(snps_log, 0.98)
  )
percentile_df


Out[5]:
platformpercentile_02percentile_98
1snps_affy500_log01.9956
2snps_exac_log01.3802
3snps_hh550_log02.0645
4snps_ho1_log0.778152.2304
5snps_kg3_log1.27883.0734

In [6]:
trimmed_df = gathered_df %>%
  dplyr::inner_join(percentile_df) %>%
  dplyr::filter(snps_log >= percentile_02,
                snps_log <= percentile_98)


Joining by: "platform"

Visualization helpers


In [7]:
metaedge_subset = c(
  'gene - association - disease',
  'gene - participation - biological process',
  'gene - participation - molecular function',
  'gene - participation - cellular component',
  'gene - participation - pathway',
  'gene - expression - anatomy',
  'gene - interaction - gene',
  'gene - regulation - perturbation'
)

In [8]:
log10p = function(x) {log10(1 + x)}
snp_labels = round(10 ** seq(0, max(percentile_df$percentile_98), 0.4) - 1)

In [9]:
gg_base = function(gg) {
    gg = gg + 
      ggplot2::theme_bw() + 
      ggplot2::theme(strip.background=ggplot2::element_rect(fill='#FEF2E2')) +
      ggplot2::theme(plot.margin = grid::unit(c(2, 2, 2, 2), 'points'))
    return(gg)
}

In [10]:
saveall <- function(gg, name, dpi=300, w=7, h=7) {
  # Save a ggplot2 object as svg, pdf, and png
  path = file.path('figure', name)
  ggplot2::ggsave(paste0(path, '.svg'), gg, width=w, height=h, limitsize=FALSE)
  ggplot2::ggsave(paste0(path, '.pdf'), gg, width=w, height=h, limitsize=FALSE)
  ggplot2::ggsave(paste0(path, '.png'), gg, dpi=dpi, width=w, height=h, limitsize=FALSE)
}

Network degree versus SNPs in gene


In [11]:
w = 8; h = 7
degree_labels = round(10 ** seq(0, 3, 0.32) - 1)
options(repr.plot.width=w, repr.plot.height = h)
gg_0 = trimmed_df %>%
  dplyr::filter(metaedge %in% metaedge_subset) %>%
  ggplot2::ggplot(aes(x=snps_log, y=degree_log, fill=metaedge)) %>% gg_base() +
  ggplot2::facet_grid(. ~ platform_name, space = 'free_x', scales='free_x') +
  ggplot2::geom_smooth(method='gam', formula=formula("y ~ s(x, bs='cs')"), linetype=0, n=1000, sp=50) +
  ggplot2::theme(legend.position='top', legend.direction='vertical',
                 legend.key=ggplot2::element_rect(color='white'),
                 legend.key.height = grid::unit(0.4, 'cm')) +
  ggplot2::scale_x_continuous(breaks=log10p(snp_labels), labels=snp_labels, name='SNPs in Gene') +
  ggplot2::scale_y_continuous(breaks=log10p(degree_labels), labels=degree_labels, name='Network Degree') +
  ggplot2::guides(fill = ggplot2::guide_legend(ncol = 2, title = NULL))

saveall(gg_0, 'degree-v-snps', w=w, h=h)
gg_0