In this notebook we further explore the sample-level QC results generated by Sample-Level-QC.Rmd and specifically take a closer look at the outliers.
RMarkdown was used to run a particular set of analyses and generate a canned report. If you have questions about the details of the analyses, see the canned reports for details. This notebook reads in those previously computed results, and re-renders 'small multiples' of the results.
With this notebook, specifically you can:
It is currently configured to read in the QC results for the Simons Genome Diversity Project data. You can modify it to instead read in results for DeepVariant Platinum Genomes, 1000 Genomes, or results from your own cohort.
In [1]:
lapply(c('devtools', 'tidyverse', 'bigrquery', 'scales', 'skimr'),
function(pkg_name) { if(! pkg_name %in% installed.packages()) { install.packages(pkg_name)} } )
In [2]:
library(bigrquery)
library(scales)
library(skimr)
library(tidyverse)
In [3]:
#-----------[ CHANGE THIS] ------------------
BILLING_PROJECT_ID <- 'YOUR-PROJECT-ID'
In [4]:
AUTOSOMES <- c(paste('chr', c(as.character(seq(1, 22))), sep = ''),
as.character(seq(1, 22)))
ALLOSOMES <- c('chrX', 'X', 'chrY', 'Y')
MITOCHONDRIA <- c('chrM', 'M', 'MT')
In [5]:
DEFAULT_PLOT_HEIGHT <- 7.5
options(repr.plot.width = 16, repr.plot.height = DEFAULT_PLOT_HEIGHT)
theme_set(theme_bw(base_size = 16))
update_geom_defaults('point', list(size = 2))
#' Returns a data frame with a y position and a label, for use annotating ggplot boxplots.
#'
#' @param d A data frame.
#' @return A data frame with column y as max and column label as length.
#'
get_boxplot_fun_data <- function(d) {
return(data.frame(y = max(d), label = stringr::str_c('N = ', length(d))))
}
In [6]:
raw_sample_info <- bq_table_download(bq_project_query(BILLING_PROJECT_ID, '
SELECT
id_from_vcf,
sex,
region
FROM
`bigquery-public-data.human_genome_variants.simons_genome_diversity_project_sample_attributes`
'))
print(skim(raw_sample_info))
Wrangle this into the columns we expect for the rest of the notebook.
Note that some cohorts have samples from several different sequencing platforms such as Illumina HiSeq2000 or Illumina HiSeqX or Complete Genomics, so many plots are faceted by sequencing platform.
In [7]:
sample_info <- raw_sample_info %>%
mutate(sequencing_platform = 'Illumina') %>%
select(name = id_from_vcf, sequencing_platform, sex, ancestry = region)
dim(sample_info)
In [8]:
genome_results <- read_csv(pipe(
'gsutil cat gs://genomics-public-data/simons-genome-diversity-project/reports/Simons_Genome_Diversity_Project_sample_results.csv'))
genome_results <- genome_results[,!grepl('X1', colnames(genome_results))] # Drop row name column, if present.
print(skim(genome_results))
In [9]:
chrom_results <- read_csv(pipe(
'gsutil cat gs://genomics-public-data/simons-genome-diversity-project/reports/Simons_Genome_Diversity_Project_sample_reference_results.csv'))
chrom_results <- chrom_results[,!grepl('X1', colnames(chrom_results))] # Drop row name column, if present.
print(skim(chrom_results))
In [10]:
joined_genome_results <- inner_join(genome_results, sample_info)
dim(joined_genome_results)
In [11]:
joined_chrom_results <- inner_join(chrom_results, sample_info)
dim(joined_chrom_results)
In [12]:
sample_info %>%
group_by(ancestry, sex) %>%
summarize(
count = n(),
proportion = n() / nrow(.)
) %>%
ggplot(aes(x = ancestry, y = count, fill = ancestry)) +
geom_bar(stat = 'identity', position = 'dodge') +
geom_text(aes(label = str_glue('N = {count} ({percent(proportion)})'))) +
facet_wrap( ~ sex, ncol = 1) +
theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
ggtitle('Sample count and proportion by ancestry and sex')
In [13]:
sample_info %>%
group_by(sequencing_platform, sex) %>%
summarize(
count = n(),
proportion = n() / nrow(.)
) %>%
ggplot(aes(x = sequencing_platform, y = count, fill = sequencing_platform)) +
geom_bar(stat = 'identity', position = 'dodge') +
geom_text(aes(label = str_glue('N = {count} ({percent(proportion)})'))) +
coord_cartesian(clip = 'off') +
facet_wrap(. ~ sex, ncol = 1) +
theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
ggtitle('Sample count and proportion by sequencing platform and sex')
In [14]:
joined_chrom_results %>%
filter(reference_name %in% c(AUTOSOMES, ALLOSOMES, MITOCHONDRIA)) %>%
mutate(reference_name = parse_factor(reference_name, levels = c(AUTOSOMES, ALLOSOMES, MITOCHONDRIA))) %>%
ggplot(aes(y = number_of_calls, x = reference_name)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
facet_wrap(. ~ sequencing_platform, ncol = 1, scales = 'free_y') +
ylab('Number of Variant Calls') +
xlab('Chromosome') +
ggtitle('Box plot: Count of variant calls per genome by chromosome') +
theme(axis.text.x = element_text(angle = 50, hjust = 1))
Compute per chromosome mean and sd call count
In [15]:
chrom_calls_summary <- joined_chrom_results %>%
group_by(reference_name, sequencing_platform) %>%
summarize(
number_of_calls_mean = mean(number_of_calls),
number_of_calls_sd = sd(number_of_calls),
)
head(chrom_calls_summary)
In [16]:
low_chrom_calls_sd_multiplier <- 3
(chrom_calls_outliers <- joined_chrom_results %>%
inner_join(chrom_calls_summary) %>%
filter(number_of_calls < number_of_calls_mean - (low_chrom_calls_sd_multiplier * number_of_calls_sd)) %>%
arrange(reference_name, number_of_calls))
In [17]:
problems <- chrom_calls_outliers %>%
group_by(name) %>%
summarize(
fail = str_glue('per chrom variant count is at least {low_chrom_calls_sd_multiplier}',
' standard deviations below the platform mean for ',
str_c(reference_name, collapse = ',')))
dim(problems)
head(problems)
In [18]:
genome_call_rate_summary <- joined_chrom_results %>%
group_by(name, sex, ancestry) %>%
summarize(n = sum(number_of_calls))
p <- genome_call_rate_summary %>%
ggplot() +
geom_point(aes(x = name, y = n, color = ancestry, shape = sex)) +
scale_x_discrete(expand = c(0.05, 1)) +
scale_y_continuous(labels = comma) +
xlab('Sample') +
ylab('Number of Calls') +
ggtitle('Scatter Plot: Count of Calls Per Genome') +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank())
if (nrow(genome_call_rate_summary) <= 20) {
p + theme(axis.text.x = element_text(angle = 50, hjust = 1))
} else {
p + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank())
}
In [19]:
genome_call_rate_summary <- joined_chrom_results %>%
group_by(name, sex, sequencing_platform) %>%
summarize(n = sum(number_of_calls))
p <- genome_call_rate_summary %>%
ggplot() +
geom_point(aes(x = name, y = n, color = sequencing_platform, shape = sex)) +
scale_x_discrete(expand = c(0.05, 1)) +
scale_y_continuous(labels = comma) +
xlab('Sample') +
ylab('Number of Calls') +
ggtitle('Scatter Plot: Count of Calls Per Genome') +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank())
if (nrow(genome_call_rate_summary) <= 20) {
p + theme(axis.text.x = element_text(angle = 50, hjust = 1))
} else {
p + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank())
}
In [20]:
genome_calls_summary <- genome_call_rate_summary %>%
group_by(sequencing_platform) %>%
summarize(
genome_calls_mean = mean(n),
genome_calls_sd = sd(n),
)
head(genome_calls_summary)
In [21]:
low_genome_calls_sd_multiplier <- 3
(genome_calls_outliers <- genome_call_rate_summary %>%
inner_join(genome_calls_summary) %>%
filter(n < genome_calls_mean - (low_genome_calls_sd_multiplier * genome_calls_sd)) %>%
arrange(n))
In [22]:
problems <- rbind(
tibble(name = unique(genome_calls_outliers$name),
fail = str_glue('per genome variant count is at least {low_genome_calls_sd_multiplier}',
' standard deviations below the platform mean')),
problems)
dim(problems)
head(problems)
In [23]:
joined_genome_results %>%
ggplot(aes(y = missingness_rate, x = ancestry, fill = ancestry)) +
geom_boxplot() +
stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 3,
position = position_dodge(width = 0.9), vjust = -0.5) +
scale_y_continuous(labels = percent_format()) +
facet_wrap(. ~ sequencing_platform, ncol = 1, scales = 'free_y') +
ylab('Missingness Rate') +
xlab('Ancestry') +
ggtitle('Genome-Specific Missingness') +
theme(axis.text.x = element_text(angle = 50, hjust = 1))
In [24]:
p <- joined_genome_results %>%
ggplot(aes(x = name, y = missingness_rate, color = sex)) +
geom_point() +
scale_x_discrete(expand = c(0.05, 1)) +
scale_y_continuous(limits = c(0, NA), labels = percent_format()) +
facet_wrap(. ~ sequencing_platform, ncol = 1, scales = 'free') +
xlab('Sample') +
ylab('Missingness Rate') +
ggtitle('Scatter Plot: Genome-Specific Missingness')
if (nrow(joined_genome_results) <= 20) {
p + theme(axis.text.x = element_text(angle = 50, hjust = 1))
} else {
p + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank())
}
In [25]:
(missingness_summary <- joined_genome_results %>%
group_by(sequencing_platform) %>%
summarize(
missingness_rate_mean = mean(missingness_rate),
missingness_rate_sd = sd(missingness_rate),
))
In [26]:
low_missingness_sd_multiplier <- 3
(low_missingness_outliers <- joined_genome_results %>%
inner_join(missingness_summary) %>%
filter(
missingness_rate < missingness_rate_mean - (low_missingness_sd_multiplier * missingness_rate_sd)
) %>%
arrange(missingness_rate))
In [27]:
high_missingness_sd_multiplier <- 3
(high_missingness_outliers <- joined_genome_results %>%
inner_join(missingness_summary) %>%
filter(missingness_rate > missingness_rate_mean + (high_missingness_sd_multiplier * missingness_rate_sd)) %>%
arrange(desc(missingness_rate)))
In [28]:
problems <- rbind(
tibble(name = unique(high_missingness_outliers$name),
fail = str_glue('per genome missingness rate is at least {high_missingness_sd_multiplier}',
' standard deviations above the platform mean')),
tibble(name = unique(low_missingness_outliers$name),
fail = str_glue('per genome missingness rate is at least {low_missingness_sd_multiplier}',
' standard deviations below the platform mean')),
problems)
dim(problems)
head(problems)
In [29]:
joined_genome_results %>%
ggplot(aes(y = private_variant_count, x = ancestry, fill = ancestry)) +
geom_boxplot() +
stat_summary(fun.data = get_boxplot_fun_data, geom = 'text',
position = position_dodge(width = 0.9), vjust = -0.5) +
facet_wrap(. ~ sequencing_platform, ncol = 1, scales = 'free_y') +
scale_y_continuous(labels = comma, expand = c(0.3, 0)) +
ylab('Number of Singletons') +
xlab('Ancestry') +
ggtitle('Box plot: Count of singletons per genome by ancestry') +
theme(axis.text.x = element_text(angle = 50, hjust = 1))
In [30]:
p <- joined_genome_results %>%
ggplot(aes(x = name, y = private_variant_count, color = sex)) +
geom_point() +
scale_x_discrete(expand = c(0.05, 1)) +
scale_y_log10(labels = comma) +
facet_wrap(. ~ sequencing_platform, ncol = 1, scales = 'free') +
xlab('Sample') +
ylab('Number of Singletons (log scale)') +
ggtitle('Scatter Plot: Count of Singletons Per Genome')
if (nrow(joined_genome_results) <= 20) {
p + theme(axis.text.x = element_text(angle = 50, hjust = 1))
} else {
p + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank())
}
In [31]:
(singleton_summary <- joined_genome_results %>%
group_by(sequencing_platform, ancestry) %>%
summarize(
singleton_count_mean = mean(private_variant_count),
singleton_count_sd = sd(private_variant_count),
))
In [32]:
high_singleton_sd_multiplier <- 3
(high_singleton_outliers <- joined_genome_results %>%
inner_join(singleton_summary) %>%
filter(private_variant_count > singleton_count_mean + (high_singleton_sd_multiplier * singleton_count_sd)) %>%
arrange(desc(private_variant_count)))
In [33]:
problems <- rbind(
tibble(name = high_singleton_outliers$name,
fail = str_glue('singleton count at least {high_singleton_sd_multiplier}',
' standard deviations above the ancestry and platform mean')),
problems)
dim(problems)
head(problems)
In [34]:
ggplot(joined_genome_results,
aes(y = heterozygous_variant_count, x = ancestry, fill = ancestry)) +
geom_boxplot() +
stat_summary(fun.data = get_boxplot_fun_data, geom = 'text',
position = position_dodge(width = 0.9), vjust = -0.5) +
scale_y_continuous(labels = comma, expand = c(0.3, 0)) +
facet_wrap(. ~ sequencing_platform, ncol = 1, scales = 'free_y') +
ylab('Number of Heterozyous Variants') +
xlab('Ancestry') +
ggtitle('Box plot: Count of heterozygous variants per genome by ancestry') +
theme(axis.text.x = element_text(angle = 50, hjust = 1))
In [35]:
p <- ggplot(joined_genome_results) +
geom_point(aes(x = name, y = heterozygous_variant_count, color = sex)) +
scale_x_discrete(expand = c(0.05, 1)) +
scale_y_continuous(labels = comma) +
facet_wrap(. ~ sequencing_platform, ncol = 1, scales = 'free') +
xlab('Sample') +
ylab('Number of Heterozygous Variants') +
ggtitle('Scatter Plot: Count of Heterozygous Variants Per Genome')
if (nrow(joined_genome_results) <= 20) {
p + theme(axis.text.x = element_text(angle = 50, hjust = 1))
} else {
p + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank())
}
In [36]:
(heterozygous_variant_count_summary <- joined_genome_results %>%
group_by(sequencing_platform, ancestry) %>%
summarize(
heterozygous_variant_count_mean = mean(heterozygous_variant_count),
heterozygous_variant_count_sd = sd(heterozygous_variant_count),
))
In [37]:
low_het_count_sd_multiplier <- 3
(low_heterozygous_variant_count_outliers <- joined_genome_results %>%
inner_join(heterozygous_variant_count_summary) %>%
filter(heterozygous_variant_count
< heterozygous_variant_count_mean - (low_het_count_sd_multiplier * heterozygous_variant_count_sd)
) %>%
arrange(heterozygous_variant_count))
In [38]:
high_het_count_sd_multiplier <- 3
(high_heterozygous_variant_count_outliers <- joined_genome_results %>%
inner_join(heterozygous_variant_count_summary) %>%
filter(heterozygous_variant_count
> heterozygous_variant_count_mean + (high_het_count_sd_multiplier * heterozygous_variant_count_sd)
) %>%
arrange(desc(heterozygous_variant_count)))
In [39]:
problems <- rbind(
tibble(name = low_heterozygous_variant_count_outliers$name,
fail = str_glue('heterozygous variant count at least {low_het_count_sd_multiplier}',
' standard deviations below the ancestry and platform mean')),
tibble(name = low_heterozygous_variant_count_outliers$name,
fail = str_glue('heterozygous variant count at least {high_het_count_sd_multiplier}',
' standard deviations above the ancestry and platform mean')),
problems)
dim(problems)
head(problems)
In [40]:
joined_chrom_results %>%
filter(reference_name %in% c(AUTOSOMES, ALLOSOMES, MITOCHONDRIA)) %>%
mutate(reference_name = parse_factor(reference_name, levels = c(AUTOSOMES, ALLOSOMES, MITOCHONDRIA))) %>%
ggplot(aes(y = F, x = reference_name, color = sex)) +
geom_boxplot() +
facet_grid(sex ~ .) +
ylab('Fraction of Homozygous Variants') +
xlab('Reference Name') +
ggtitle('Fraction of Homozygous Variants Per Genome') +
theme(axis.text.x = element_text(angle = 50, hjust = 1))
In [41]:
head(fraction_homozygous_summary <- joined_chrom_results %>%
group_by(reference_name) %>%
summarize(
F_mean = mean(F),
F_sd = sd(F),
))
In [42]:
low_fraction_homozygous_sd_multiplier <- 4
(low_hom_outliers <- joined_chrom_results %>%
inner_join(fraction_homozygous_summary) %>%
filter(F < F_mean - (low_fraction_homozygous_sd_multiplier * F_sd)) %>%
arrange(sex, reference_name, F))
In [43]:
high_fraction_homozygous_sd_multiplier <- 4
(high_hom_outliers <- joined_chrom_results %>%
inner_join(fraction_homozygous_summary) %>%
filter(F > F_mean + (high_fraction_homozygous_sd_multiplier * F_sd)) %>%
arrange(sex, reference_name, desc(F)))
In [44]:
problems <- rbind(
low_hom_outliers %>%
group_by(name) %>%
summarize(fail = str_glue('per chrom homozygosity is at least {low_fraction_homozygous_sd_multiplier}',
' standard deviations below the mean ',
str_c(reference_name, collapse = ','))),
high_hom_outliers %>%
group_by(name) %>%
summarize(fail = str_glue('per chrom homozygosity is at least {high_fraction_homozygous_sd_multiplier}',
' standard deviations above the mean ',
str_c(reference_name, collapse = ','))),
problems)
dim(problems)
head(problems)
In [45]:
joined_chrom_results %>%
filter(reference_name %in% c(AUTOSOMES)) %>%
mutate(reference_name = parse_factor(reference_name, levels = c(AUTOSOMES))) %>%
ggplot(aes(y = titv, x = reference_name, color = sex)) +
geom_boxplot() +
ylab('Ti/Tv ratio') +
xlab('Chromosome') +
ggtitle('Ti/Tv ratio per genome') +
theme(axis.text.x = element_text(angle = 50, hjust = 1))
In [46]:
titv_summary <- joined_chrom_results %>%
group_by(reference_name) %>%
summarize(
titv_mean = mean(titv),
titv_sd = sd(titv),
)
head(titv_summary)
In [47]:
low_titv_sd_multiplier <- 4
(low_titv_outliers <- joined_chrom_results %>%
inner_join(titv_summary) %>%
filter(titv < titv_mean - (low_titv_sd_multiplier * titv_sd)) %>%
arrange(reference_name, titv))
In [48]:
high_titv_sd_multiplier <- 4
(high_titv_outliers <- joined_chrom_results %>%
inner_join(titv_summary) %>%
filter(titv > titv_mean + (high_titv_sd_multiplier * titv_sd)) %>%
arrange(reference_name, desc(titv)))
In [49]:
problems <- rbind(
low_titv_outliers %>%
group_by(name) %>%
summarize(fail = str_glue('per chrom ti/tv is at least {low_titv_sd_multiplier} standard deviations below the mean for ',
str_c(reference_name, collapse = ','))),
high_titv_outliers %>%
group_by(name) %>%
summarize(fail = str_glue('per chrom ti/tv is at least {high_titv_sd_multiplier} standard deviations above the mean for ',
str_c(reference_name, collapse = ','))),
problems)
dim(problems)
head(problems)
In [50]:
ggplot(joined_genome_results, aes(x = sex, y = perct_het_alt_in_snvs, fill = sex)) +
geom_boxplot() +
stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', position = position_dodge(width = 0.9), vjust = -0.5) +
scale_y_continuous(labels = percent_format()) +
xlab('Sex') +
ylab('Heterozygosity Rate ') +
ggtitle('Box Plot: Heterozygosity Rate on the X Chromosome')
In [51]:
p <- ggplot(joined_genome_results) +
geom_point(aes(x = name, y = perct_het_alt_in_snvs, color = sex)) +
scale_x_discrete(expand = c(0.05, 1)) +
scale_y_continuous(labels = percent_format()) +
facet_wrap(. ~ sequencing_platform, ncol = 1, scales = 'free') +
xlab('Sample') +
ylab('Heterozygosity Rate ') +
ggtitle('Scatter Plot: Heterozygosity Rate on the X Chromosome')
if (nrow(joined_genome_results) <= 20) {
p + theme(axis.text.x = element_text(angle = 50, hjust = 1))
} else {
p + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank())
}
In [52]:
(m_outliers <- joined_genome_results %>%
filter(!sex %in% c('F', 'FEMALE', 'female') & perct_het_alt_in_snvs > 0.4) %>%
arrange(desc(perct_het_alt_in_snvs))
)
In [53]:
(f_outliers <- joined_genome_results %>%
filter(! sex %in% c('M', 'MALE', 'male') & perct_het_alt_in_snvs < 0.4) %>%
arrange(perct_het_alt_in_snvs))
In [54]:
problems <- rbind(
tibble(name = m_outliers$name, fail = str_glue('sample info indicates {m_outliers$sex}, genome indicates female')),
tibble(name = f_outliers$name, fail = str_glue('sample info indicates {f_outliers$sex}, genome indicates male')),
problems)
dim(problems)
head(problems)
In [55]:
(problem_summary <- problems %>%
group_by(name) %>%
summarize(
issues = str_c(fail, collapse = ';\n '),
issue_count = n()
) %>%
arrange(desc(issue_count), name))
In [56]:
write_csv(problem_summary, path = 'problem_summary.csv')
In [57]:
devtools::session_info()
Copyright 2019 Verily Life Sciences LLC. All rights reserved.
Licensed under the Apache License, Version 2.0 (the 'License'); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.