In [1]:
library(dplyr, warn=FALSE)
library(readr)
library(tidyr)
library(ggplot2)
In [2]:
platforms = c(
'snps_affy500_log'='Affymetrix 500K Set',
'snps_hh550_log'='Illumina HumanHap550',
'snps_ho1_log'='Illum. HumanOmni1',
'snps_exac_log'='ExAC',
'snps_kg3_log'='1000 Genomes Phase 3'
)
platform_df = dplyr::data_frame(
platform = names(platforms),
platform_name = as.character(platforms)
)
platform_df$platform_name = factor(platform_df$platform_name, levels = platform_df$platform_name)
platform_df
Out[2]:
In [3]:
combined_df = readr::read_tsv('data/combined.tsv.gz')
head(combined_df, 2)
Out[3]:
In [4]:
gathered_df = combined_df %>%
dplyr::select(-(snps_hh550:snps_kg3)) %>%
tidyr::gather(platform, snps_log, snps_hh550_log:snps_kg3_log) %>%
dplyr::inner_join(platform_df)
head(gathered_df, 2)
Out[4]:
Since, the SNP abudance distribution has tails containing few points, accurate estimates of degree cannot be estimated. Ideally, we could just crop the x-axis to exclude the extremes. However, ggplot2 does not currently support this operation for facetted plots. Therefore, we identify the 2nd and 98th percentile and exclude points beyond this boundary. Note that if at least 2% of SNP abundances are 0, then no data is excluded from the left-tail of SNP abundances.
In [5]:
percentile_df = gathered_df %>%
dplyr::group_by(platform) %>%
dplyr::summarize(
percentile_02 = quantile(snps_log, 0.02),
percentile_98 = quantile(snps_log, 0.98)
)
percentile_df
Out[5]:
In [6]:
trimmed_df = gathered_df %>%
dplyr::inner_join(percentile_df) %>%
dplyr::filter(snps_log >= percentile_02,
snps_log <= percentile_98)
In [7]:
metaedge_subset = c(
'gene - association - disease',
'gene - participation - biological process',
'gene - participation - molecular function',
'gene - participation - cellular component',
'gene - participation - pathway',
'gene - expression - anatomy',
'gene - interaction - gene',
'gene - regulation - perturbation'
)
In [8]:
log10p = function(x) {log10(1 + x)}
snp_labels = round(10 ** seq(0, max(percentile_df$percentile_98), 0.4) - 1)
In [9]:
gg_base = function(gg) {
gg = gg +
ggplot2::theme_bw() +
ggplot2::theme(strip.background=ggplot2::element_rect(fill='#FEF2E2')) +
ggplot2::theme(plot.margin = grid::unit(c(2, 2, 2, 2), 'points'))
return(gg)
}
In [10]:
saveall <- function(gg, name, dpi=300, w=7, h=7) {
# Save a ggplot2 object as svg, pdf, and png
path = file.path('figure', name)
ggplot2::ggsave(paste0(path, '.svg'), gg, width=w, height=h, limitsize=FALSE)
ggplot2::ggsave(paste0(path, '.pdf'), gg, width=w, height=h, limitsize=FALSE)
ggplot2::ggsave(paste0(path, '.png'), gg, dpi=dpi, width=w, height=h, limitsize=FALSE)
}
In [11]:
w = 8; h = 7
degree_labels = round(10 ** seq(0, 3, 0.32) - 1)
options(repr.plot.width=w, repr.plot.height = h)
gg_0 = trimmed_df %>%
dplyr::filter(metaedge %in% metaedge_subset) %>%
ggplot2::ggplot(aes(x=snps_log, y=degree_log, fill=metaedge)) %>% gg_base() +
ggplot2::facet_grid(. ~ platform_name, space = 'free_x', scales='free_x') +
ggplot2::geom_smooth(method='gam', formula=formula("y ~ s(x, bs='cs')"), linetype=0, n=1000, sp=50) +
ggplot2::theme(legend.position='top', legend.direction='vertical',
legend.key=ggplot2::element_rect(color='white'),
legend.key.height = grid::unit(0.4, 'cm')) +
ggplot2::scale_x_continuous(breaks=log10p(snp_labels), labels=snp_labels, name='SNPs in Gene') +
ggplot2::scale_y_continuous(breaks=log10p(degree_labels), labels=degree_labels, name='Network Degree') +
ggplot2::guides(fill = ggplot2::guide_legend(ncol = 2, title = NULL))
saveall(gg_0, 'degree-v-snps', w=w, h=h)
gg_0