In [ ]:
# Notebook setup: Hail plus the Bokeh helpers used by the plotting cells below.
import hail as hl
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot

# Start Hail, then direct Bokeh output to the notebook.
hl.init()
output_notebook()
In [ ]:
# Make the 1KG example data available under data/ and open the matrix table.
hl.utils.get_1kg('data/')
mt = hl.read_matrix_table('data/1kg.mt')

# Load the sample annotations and key them by the 'Sample' column so they
# can be looked up with the matrix table's sample id (mt.s).
table = hl.import_table('data/1kg_annotations.txt', impute=True)
table = table.key_by('Sample')

# Attach every annotation field as a column field, then add sample QC metrics.
mt = hl.sample_qc(mt.annotate_cols(**table[mt.s]))
mt.describe()
In [ ]:
# Aggregate DP over all entries into a histogram (start 0, end 30, 30 bins),
# then plot the precomputed result. `hl.agg` is Hail's alias for the
# `hl.expr.aggregators` module.
dp_hist = mt.aggregate_entries(hl.agg.hist(mt.DP, 0, 30, 30))
p = hl.plot.histogram(
    dp_hist,
    title='DP Histogram',
    legend='DP',
)
show(p)
This method, like all Hail plotting methods, also allows us to pass in fields of our data set directly. Choosing not to specify the `range` and `bins` arguments would result in a range being computed based on the largest and smallest values in the dataset and a default `bins` value of 50.
In [ ]:
# Plot DP straight from the entry field, with an explicit range and bin count.
p = hl.plot.histogram(
    mt.DP,
    bins=30,
    range=(0, 30),
)
show(p)
In [ ]:
# Cumulative version of the DP histogram over the same range and bins.
p = hl.plot.cumulative_histogram(
    mt.DP,
    bins=30,
    range=(0, 30),
)
show(p)
In [ ]:
# Per-sample scatter: mean DP on the x-axis against call rate on the y-axis,
# both taken from the sample_qc column field.
p = hl.plot.scatter(
    mt.sample_qc.dp_stats.mean,
    mt.sample_qc.call_rate,
    ylabel='Call Rate',
    xlabel='Mean DP',
)
show(p)
We can also pass in a Hail field as a `label` argument, which determines how to color the data points.
In [ ]:
# Sample filter: keep columns with mean DP >= 4 and call rate >= 0.97.
depth_ok = mt.sample_qc.dp_stats.mean >= 4
call_rate_ok = mt.sample_qc.call_rate >= 0.97
mt = mt.filter_cols(depth_ok & call_rate_ok)

# AD[1] as a share of the summed AD values; built on the filtered table so
# the expression belongs to the `mt` it is used with below.
ab = mt.AD[1] / hl.sum(mt.AD)

# Entry filter: keep a call only when `ab` falls in the band expected for
# its genotype class.
hom_ref_ok = mt.GT.is_hom_ref() & (ab <= 0.1)
het_ok = mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)
hom_var_ok = mt.GT.is_hom_var() & (ab >= 0.9)
filter_condition_ab = hom_ref_ok | het_ok | hom_var_ok
mt = mt.filter_entries(filter_condition_ab)

# Variant QC on the cleaned data, cached because it is reused below.
mt = hl.variant_qc(mt).cache()

# Restrict to rows whose AF[1] exceeds 0.01 for the association test and PCA.
common_mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

# Regress CaffeineConsumption on alt-allele count with an intercept-only
# covariate list.
gwas = hl.linear_regression_rows(
    y=common_mt.CaffeineConsumption,
    x=common_mt.GT.n_alt_alleles(),
    covariates=[1.0],
)

# Third return value is not used in this notebook.
pca_eigenvalues, pca_scores, _ = hl.hwe_normalized_pca(common_mt.GT)
In [ ]:
# PC1 vs PC2 for every sample, coloured by the SuperPopulation column field
# looked up through the sample key of the PCA scores table.
# `p` is reused by the following cell, so the name is kept.
p = hl.plot.scatter(
    pca_scores.scores[0],
    pca_scores.scores[1],
    label=common_mt.cols()[pca_scores.s].SuperPopulation,
    xlabel='PC1',
    ylabel='PC2',
    title='PCA',
    collect_all=True,
)
show(p)
Hail's downsample aggregator is incorporated into the `scatter()`, `qq()`, and `manhattan()` functions. The `collect_all` parameter tells the plot function whether to collect all values or downsample. Choosing not to set this parameter results in downsampling.
In [ ]:
# Same PCA scatter, but downsampled (collect_all=False) with n_divisions=50.
p2 = hl.plot.scatter(
    pca_scores.scores[0],
    pca_scores.scores[1],
    label=common_mt.cols()[pca_scores.s].SuperPopulation,
    xlabel='PC1',
    ylabel='PC2',
    title='PCA (downsampled)',
    n_divisions=50,
    collect_all=False,
)

# Show the full plot (`p`, built in the previous cell) next to the
# downsampled one.
# NOTE(review): newer Bokeh releases spell these keywords `width`/`height`;
# confirm against the Bokeh version pinned by the installed Hail.
side_by_side = gridplot([p, p2], ncols=2, plot_width=400, plot_height=400)
show(side_by_side)
In [ ]:
# 2-D histogram of the first two PCA score components.
pc1 = pca_scores.scores[0]
pc2 = pca_scores.scores[1]
p = hl.plot.histogram2d(pc1, pc2)
show(p)
In [ ]:
# QQ plot of the GWAS p-values twice: once with every point collected and
# once downsampled with n_divisions=75, shown side by side.
p = hl.plot.qq(gwas.p_value, collect_all=True)
p2 = hl.plot.qq(gwas.p_value, n_divisions=75)

# NOTE(review): newer Bokeh releases spell these keywords `width`/`height`;
# confirm against the Bokeh version pinned by the installed Hail.
qq_grid = gridplot([p, p2], ncols=2, plot_width=400, plot_height=400)
show(qq_grid)
In [ ]:
# Manhattan plot of the GWAS p-values with the plot function's defaults.
gwas_p_values = gwas.p_value
p = hl.plot.manhattan(gwas_p_values)
show(p)
We can also pass in a dictionary of fields that we would like to show up as we hover over a data point, and choose not to downsample if the dataset is relatively small.
In [ ]:
# Extra fields to display when hovering over a point, keyed by display name.
# A dict literal replaces dict([...]) so no throwaway list of tuples is built.
hover_fields = {'alleles': gwas.alleles}

# collect_all=True plots every point instead of downsampling.
p = hl.plot.manhattan(
    gwas.p_value,
    hover_fields=hover_fields,
    collect_all=True,
)
show(p)