This example was done using Google Colab. You can read and use the Python code in other environments even if you have never used Colab before. For example, we used a path to a file in gs://, but you can also just copy that file to your environment and use a local path instead.
This example is for advanced users who want to understand the data representation. You do NOT need to understand this in order to use DeepVariant.
In [0]:
!pip install -q "tensorflow==1.13.1"
In [0]:
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import tensorflow as tf
from tensorflow.core.example import example_pb2
# Enable eager execution (TF 1.x API) so the tensors handled below can be
# converted to Python/NumPy values with `.numpy()`.
tf.enable_eager_execution()
In [0]:
def get_bytes_list(example, key):
    """Looks up the bytes-list stored under a key of a tf.Example.

    Args:
      example: A tf.Example proto (any object exposing `features.feature`).
      key: Name of the feature to read.

    Returns:
      The `bytes_list.value` of the feature stored under `key`.
    """
    feature = example.features.feature[key]
    return feature.bytes_list.value
def get_label(example):
    """Fetches the label (class) of a parsed example.

    Expected values are 0 (HOM-REF), 1 (HET), 2 (HOM-ALT).

    Args:
      example: Mapping with a 'label' entry whose value exposes `.numpy()`.

    Returns:
      Whatever `example['label'].numpy()` yields.
    """
    label_tensor = example['label']
    return label_tensor.numpy()
def imshows(images, labels=None, n=None, scale=10, axis='off', **kwargs):
    """Plots a list of images in a row.

    Args:
      images: List of image arrays.
      labels: List of str or None. Image titles; when given, it needs at least
        one entry per displayed image.
      n: int or None. How many images in the list to display. If this is None,
        display all of them. Values larger than len(images) are clamped.
      scale: int. How big each image is.
      axis: str. How to plot each image.
      **kwargs: Keyword arguments for Axes.imshow.

    Returns:
      None.
    """
    n = len(images) if n is None else min(n, len(images))
    if n <= 0:
        # A subplot grid needs at least one column; there is nothing to draw.
        return
    with sns.axes_style('white'):
        # squeeze=False keeps `axs` a 2-D array even when n == 1. With the
        # default, a single subplot comes back as a bare Axes object that
        # cannot be indexed.
        _, axs = plt.subplots(
            1, n, figsize=(n * scale, scale), squeeze=False)
        for i, (ax, image) in enumerate(zip(axs[0], images)):
            ax.imshow(image, **kwargs)
            ax.axis(axis)
            if labels:
                ax.set_title(labels[i])
        plt.show()
def visualize_example(example, shape=(100, 221, 6)):
    """Plots each channel of an example's image plus a combined RGB view.

    Args:
      example: Parsed example providing 'image/encoded' (raw uint8 bytes) and
        'label' entries whose values expose `.numpy()`.
      shape: (height, width, channels) used to reshape the raw image bytes.
        Defaults to (100, 221, 6). The last dimension must be 6, because
        channels_to_rgb and the panel titles below assume six channels.

    Returns:
      None.
    """
    encoded = example['image/encoded'].numpy()
    # The bytestring is a flat uint8 buffer with no dimensions of its own, so
    # the layout comes from `shape`.
    image = np.frombuffer(encoded, np.uint8).reshape(shape)
    # Split the tensor by its channels dimension.
    channels = [image[:, :, i] for i in range(shape[-1])]
    # Prepend an image: RGB image reconstructed from the 6 channels.
    channels.insert(0, channels_to_rgb(channels))
    titles = ["reconstructed RGBA (label=%s)" % get_label(example),
              "read base", "base quality", "mapping quality", "strand",
              "supports variant", "supports reference"]
    imshows(channels, titles, axis="image", scale=5)
def channels_to_rgb(channels):
    """Combines six channel planes into one 3-channel uint8 image.

    Args:
      channels: Sequence of at least six 2-D arrays of identical shape.

    Returns:
      A uint8 array of shape (height, width, 3). The three planes are
      channels[0], the element-wise minimum of channels[1] and channels[2],
      and channels[3]; each is scaled by
      (channels[4] / 254) * (channels[5] / 254).
    """
    # Per-pixel scaling factor built from the last two channels.
    alpha = (channels[4] / 254.0) * (channels[5] / 254.0)
    # Stack the three colour planes along a new leading axis.
    planes = np.stack([
        channels[0],
        np.minimum(channels[1], channels[2]),
        channels[3],
    ])
    scaled = planes * alpha
    # Move the plane axis last so the result is (height, width, 3).
    return scaled.astype(np.uint8).transpose(1, 2, 0)
The following cell reads tf.train.Example protos from a source. This source is in gzipped TFRecord format. We show a few examples, including:
- Visualizing the six channels of the image tensor, each of which represents a different feature.
- Reconstructing the RGBA image representation, as defined in the supplementary materials (linked from the original notebook), and rendering it as a single image.
In [0]:
!gsutil -q cp gs://deepvariant/datalab-testdata/make_examples_datalab.tfrecord.gz /tmp/make_examples_colab.tfrecord.gz
In [0]:
# Feature spec passed to tf.io.parse_single_example. Every entry is a scalar
# (shape []) feature of the given dtype.
example_description = {
# Class label of the example (see get_label).
'label': tf.io.FixedLenFeature([], tf.int64),
# Raw image bytes; visualize_example decodes them as uint8.
'image/encoded': tf.io.FixedLenFeature([], tf.string),
# Serialized variant proto; parsed further below with nucleus' variants_pb2.
'variant/encoded': tf.io.FixedLenFeature([], tf.string),
}
def _parse_example_function(example_proto):
    """Decodes one serialized tf.Example using `example_description`.

    Args:
      example_proto: A serialized tf.Example, e.g. one record of the TFRecord
        dataset this function is mapped over.

    Returns:
      The result of tf.io.parse_single_example for the features listed in
      `example_description`.
    """
    return tf.io.parse_single_example(
        serialized=example_proto, features=example_description)
In [0]:
# This tfrecord comes from HG002 PFDA data. make_examples was run in training
# mode, so the examples also carry labels.
src = ['/tmp/make_examples_colab.tfrecord.gz']
# The file is gzipped, so the compression type must be stated explicitly.
examples = tf.data.TFRecordDataset(src, compression_type="GZIP")
# Parse every serialized record with _parse_example_function.
parsed_examples = examples.map(_parse_example_function)
In [0]:
# Running this cell should display one row of images for each of the first
# five parsed examples.
for example in parsed_examples.take(count=5):
    visualize_example(example)
In [0]:
!pip install -qq "google-nucleus==0.2.2"
In [0]:
# Show which feature keys a parsed example exposes (first record only).
for example in parsed_examples.take(count=1):
    print(example.keys())
In [0]:
from nucleus.protos import variants_pb2
# Decode the serialized variant of the first record into a nucleus Variant
# proto and print it.
for example in parsed_examples.take(count=1):
    variant = variants_pb2.Variant.FromString(
        example['variant/encoded'].numpy())
    print(variant)