This notebook demonstrates how to use the haplotype clustering utilities.
In [1]:
%run setup.ipynb
%matplotlib inline
import hapclust
# %reload_ext autoreload
# %autoreload 1
# %aimport hapclust
In [2]:
h = allel.HaplotypeArray([[0, 0, 1, 1, 0],
[0, 0, 1, 0, 1],
[0, 0, 0, 1, 1],
[0, 0, 0, 1, 0],
[0, 0, 0, 0, 1]])
In [3]:
graph = hapclust.graph_haplotype_network(
h[:, :-1], network_method='msn', debug=False,
show_node_labels=True, node_size_factor=.2, anon_width=.4,
variant_labels=['A', 'B', 'C', 'D', 'E'])
graph
Out[3]:
In [4]:
graph = hapclust.graph_haplotype_network(
h[:, :-1], network_method='mjn', debug=False,
show_node_labels=True, node_size_factor=.2, anon_width=.4,
variant_labels=['A', 'B', 'C', 'D', 'E'])
graph
Out[4]:
In [5]:
hapclust.graph_haplotype_network(
h, network_method='msn', debug=False,
show_node_labels=True, node_size_factor=.2, anon_width=.4,
variant_labels=['A', 'B', 'C', 'D', 'E'])
Out[5]:
In [6]:
hapclust.graph_haplotype_network(
h, network_method='mjn', debug=False,
show_node_labels=True, node_size_factor=.2, anon_width=.4,
variant_labels=['A', 'B', 'C', 'D', 'E'])
Out[6]:
In [7]:
idx_rec = hapclust.locate_recombinants(h, debug=False)
idx_rec
Out[7]:
In [8]:
# how many possible solutions?
len(idx_rec)
Out[8]:
In [9]:
# pick a solution, locate non-recombinants
idx_norec = [i for i in range(h.shape[1]) if i not in idx_rec[0]]
idx_norec
Out[9]:
In [10]:
hapclust.graph_haplotype_network(
h[:, idx_norec], network_method='mjn', debug=False,
show_node_labels=True, node_size_factor=.2, anon_width=.4,
variant_labels=['A', 'B', 'C', 'D', 'E'])
Out[10]:
In [11]:
# callset = h5py.File('../data/ag1000g.phase1.AR3.1.haplotypes.specific_regions.2L_2358158_2431617.h5',
# mode='r')
callset = phase1_ar31.callset_phased
region_vgsc = SeqFeature('2L', 2358158, 2431617)
genotypes = allel.GenotypeDaskArray(callset['2L/calldata/genotype'])
haplotypes = genotypes.to_haplotypes()
pos = allel.SortedIndex(callset['2L/variants/POS'])
loc = pos.locate_range(region_vgsc.start, region_vgsc.end)
h_vgsc = haplotypes[loc].compute()
pos_995S = 2422651
pos_995F = 2422652
loc_995S = haplotypes[pos.locate_key(pos_995S)] == 1
loc_995F = haplotypes[pos.locate_key(pos_995F)] == 1
h_vgsc_995F = h_vgsc.compress(loc_995F, axis=1)
h_vgsc_995S = h_vgsc.compress(loc_995S, axis=1)
sample_ids = callset['2L']['samples'][:]
hap_ids = np.array(list(itertools.chain(*[[s + b'a', s + b'b'] for s in sample_ids])))
hap_ids_995F = hap_ids[loc_995F]
hap_ids_995S = hap_ids[loc_995S]
# tbl_haplotypes = etl.fromtsv('../data/ag1000g.phase1.AR3.1.haplotypes.meta.txt')
tbl_haplotypes = phase1_ar31.tbl_haplotypes
hap_pops = np.array(tbl_haplotypes.values('population'))
hap_pops_995S = hap_pops[loc_995S]
hap_pops_995F = hap_pops[loc_995F]
# need to use named colors for graphviz
pop_colors = {
'AOM': 'brown',
'BFM': 'firebrick1',
'GWA': 'goldenrod1',
'GNS': 'cadetblue1',
'BFS': 'deepskyblue',
'CMS': 'dodgerblue3',
'UGS': 'palegreen',
'GAS': 'olivedrab',
'KES': 'grey47',
'colony': 'black'
}
hap_colors = np.array([pop_colors[p] for p in hap_pops])
hap_colors_995S = np.array([pop_colors[p] for p in hap_pops_995S])
hap_colors_995F = np.array([pop_colors[p] for p in hap_pops_995F])
In [12]:
tbl_variant_labels = (
etl
.frompickle('../data/tbl_variants_phase1.pkl')
.eq('num_alleles', 2)
.cut('POS', 'AGAP004707-RA')
.convert('AGAP004707-RA', lambda v: v[1] if v[0] == 'NON_SYNONYMOUS_CODING' else '')
.rename('AGAP004707-RA', 'label')
)
tbl_variant_labels
Out[12]:
In [13]:
pos2label = tbl_variant_labels.lookupone('POS', 'label')
pos2label[pos_995F]
Out[13]:
In [14]:
variant_labels = np.array([pos2label.get(p, '') for p in pos], dtype=object)
variant_labels_vgsc = variant_labels[loc]
variant_labels_vgsc[:5]
Out[14]:
In [15]:
# Default plot...
# cuts the tree at height 2 (so max distance within each cluster is 1)...
# highlights all clusters...
# labels all clusters.
hapclust.fig_haplotypes_clustered(h_vgsc_995S, dpi=150);
In [16]:
# Change the orientation...
hapclust.fig_haplotypes_clustered(h_vgsc_995S, orientation='left', dpi=150);
In [17]:
# Try a different cut height...
hapclust.fig_haplotypes_clustered(h_vgsc_995S, cut_height=5, dpi=150);
In [18]:
# Choose to highlight only clusters above a certain size...
hapclust.fig_haplotypes_clustered(h_vgsc_995S, dpi=150, highlight_clusters=5);
In [19]:
# Manually choose which clusters to highlight...
hapclust.fig_haplotypes_clustered(h_vgsc_995S, dpi=150, highlight_clusters=[2, 9]);
In [20]:
# Turn off cluster labels...
hapclust.fig_haplotypes_clustered(h_vgsc_995S, dpi=150, highlight_clusters=5, label_clusters=False);
In [21]:
# Use your favourite colors...
hapclust.fig_haplotypes_clustered(
h_vgsc_995S, dpi=150, highlight_clusters=5, label_clusters=False,
highlight_colors=['red', 'green', 'blue', 'cyan', 'magenta', 'yellow'],
highlight_alpha=.8);
In [22]:
# What does this function return?
fig, ax_dend, ax_freq, cluster_spans, leaf_obs = hapclust.fig_haplotypes_clustered(
h_vgsc_995S, dpi=150, highlight_clusters=5, label_clusters=5)
In [23]:
# E.g., use returned axes objects to customise labels etc. ...
cut_height = 2
fig, ax_dend, ax_freq, cluster_spans_995S, leaf_obs_995S = hapclust.fig_haplotypes_clustered(
h_vgsc_995S, cut_height=cut_height, dpi=150,
highlight_clusters=5, label_clusters=5)
ax_dend.set_title('Haplotype structure (L995S)')
ax_dend.set_ylabel('Distance (no. SNPs)')
ax_freq.set_ylabel('Haplotype frequency');
In [24]:
# cluster_spans is useful for accessing information about each cluster...
cluster_spans_995S
Out[24]:
In [25]:
# E.g., cluster labelled "17" in the plot:
cluster_idx = 17
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995S[cluster_idx]
In [26]:
# These are positions in the dendrogram where the cluster starts and stops:
dend_start, dend_stop
Out[26]:
In [27]:
# These are the indices of the haplotypes in the cluster
cluster_hap_indices
Out[27]:
In [28]:
# How many haplotypes in the cluster?
len(cluster_hap_indices)
Out[28]:
In [29]:
# N.B., these are relative to the haplotype array passed into the function.
# To extract only haplotypes in this cluster...
cluster_haps = h_vgsc_995S.take(cluster_hap_indices, axis=1)
cluster_haps
Out[29]:
In [30]:
cluster_hap_ids = hap_ids_995S.take(cluster_hap_indices)
In [31]:
sequences = cluster_haps.astype('S1').T
In [32]:
allel.io.write_fasta('../data/demo.hapclust.995S.cut{}.cluster{}.fasta'.format(cut_height,
cluster_idx),
sequences=list(sequences),
names=cluster_hap_ids,
mode='w',
width=80)
In [33]:
# leaf_obs can also be useful, it maps the leaves of the dendrogram onto indices of original observations...
# need this if ever you want to plot the haplotypes themselves
# E.g., the first leaf of the dendrogram contains these haplotypes:
leaf_obs_995S[0]
Out[33]:
In [34]:
# E.g., the 8th leaf of the dendrogram contains these haplotypes:
leaf_obs_995S[7]
Out[34]:
In [35]:
# To extract a haplotype array matching the leaves of the dendrogram...
# only need one index per leaf, as all haplotypes per leaf are identical
indices = [l[0] for l in leaf_obs_995S]
# take unique haplotypes in order shown in dendrogram
h_vgsc_995S_dend = h_vgsc_995S.take(indices, axis=1)
h_vgsc_995S_dend
Out[35]:
In [36]:
# make a network of all L995S haplotypes
graph = hapclust.graph_haplotype_network(h_vgsc_995S, network_method='mst')
graph
Out[36]:
In [37]:
# add some color
graph = hapclust.graph_haplotype_network(h_vgsc_995S, hap_colors=hap_colors_995S, network_method='mst')
graph
Out[37]:
In [38]:
# try a different network building method
graph = hapclust.graph_haplotype_network(h_vgsc_995S, hap_colors=hap_colors_995S, network_method='msn')
graph
Out[38]:
In [39]:
# try a different network building method
graph = hapclust.graph_haplotype_network(h_vgsc_995S, hap_colors=hap_colors_995S, network_method='mjn')
graph
Out[39]:
In [40]:
# change the maximum connection distance
graph = hapclust.graph_haplotype_network(
h_vgsc_995S, hap_colors=hap_colors_995S, max_dist=10, network_method='mst')
graph
Out[40]:
In [41]:
graph = hapclust.graph_haplotype_network(
h_vgsc_995S, hap_colors=hap_colors_995S, network_method='msn', max_dist=10)
graph
Out[41]:
In [42]:
graph = hapclust.graph_haplotype_network(
h_vgsc_995S, hap_colors=hap_colors_995S, network_method='mjn', max_dist=10)
graph
Out[42]:
In [43]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 2
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995S[cluster_idx]
cluster_haps = h_vgsc_995S.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995S[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[43]:
In [44]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[44]:
In [45]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 9
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995S[cluster_idx]
cluster_haps = h_vgsc_995S.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995S[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[45]:
In [46]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[46]:
In [47]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 11
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995S[cluster_idx]
cluster_haps = h_vgsc_995S.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995S[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[47]:
In [48]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[48]:
In [49]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 13
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995S[cluster_idx]
cluster_haps = h_vgsc_995S.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995S[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[49]:
In [50]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[50]:
In [51]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 17
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995S[cluster_idx]
cluster_haps = h_vgsc_995S.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995S[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[51]:
In [52]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[52]:
In [53]:
# let's do L995F as well...
cut_height = 4
fig, ax_dend, ax_freq, cluster_spans_995F, leaf_obs_995F = hapclust.fig_haplotypes_clustered(
h_vgsc_995F, orientation='left', cut_height=cut_height, dpi=150,
highlight_clusters=5, label_clusters=5)
In [54]:
graph = hapclust.graph_haplotype_network(
h_vgsc_995F, hap_colors=hap_colors_995F, max_dist=4, network_method='mst')
graph
Out[54]:
In [55]:
graph = hapclust.graph_haplotype_network(
h_vgsc_995F, hap_colors=hap_colors_995F, max_dist=4, network_method='msn')
graph
Out[55]:
In [56]:
graph = hapclust.graph_haplotype_network(
h_vgsc_995F, hap_colors=hap_colors_995F, max_dist=4, network_method='mjn')
graph
Out[56]:
In [57]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 15
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995F[cluster_idx]
cluster_haps = h_vgsc_995F.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995F[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[57]:
In [58]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[58]:
In [59]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 16
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995F[cluster_idx]
cluster_haps = h_vgsc_995F.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995F[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[59]:
In [60]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[60]:
In [61]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 20
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995F[cluster_idx]
cluster_haps = h_vgsc_995F.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995F[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[61]:
In [62]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[62]:
In [63]:
# plot a network for two of the clusters together
cluster_hap_indices = []
for cluster_idx in 15, 16:
_, _, cidx = cluster_spans_995F[cluster_idx]
cluster_hap_indices.extend(cidx)
cluster_haps = h_vgsc_995F.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995F[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[63]:
In [64]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[64]:
In [65]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 24
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995F[cluster_idx]
cluster_haps = h_vgsc_995F.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995F[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='msn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[65]:
In [66]:
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors,
network_method='mjn', variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[66]:
In [67]:
# plot a network for just a single cluster that we extracted earlier from the dendrogram
cluster_idx = 33
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995F[cluster_idx]
cluster_haps = h_vgsc_995F.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995F[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors, network_method='mst',
edge_weight=10, overlap=False,
show_node_labels=True, variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[67]:
In [68]:
# does the different network method really matter?
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors, network_method='msn',
edge_weight=9, overlap=False,
show_node_labels=True, variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[68]:
In [69]:
# does the different network method really matter?
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors, network_method='mjn',
edge_weight=8, overlap=False,
show_node_labels=True, variant_labels=variant_labels_vgsc, fontsize=8)
graph
Out[69]:
In [70]:
# ...yes I think it does.
In [71]:
# just for fun, graph the whole lot...
graph = hapclust.graph_haplotype_network(
h_vgsc, hap_colors=hap_colors, network_method='msn',
variant_labels=variant_labels_vgsc, fontsize=7)
graph
Out[71]:
In [72]:
graph = hapclust.graph_haplotype_network(
h_vgsc, hap_colors=hap_colors, network_method='mjn',
variant_labels=variant_labels_vgsc, fontsize=7)
graph
Out[72]:
In [73]:
cluster_idx = 33
dend_start, dend_stop, cluster_hap_indices = cluster_spans_995F[cluster_idx]
cluster_haps = h_vgsc_995F.take(cluster_hap_indices, axis=1)
cluster_hap_pops = hap_pops_995F[cluster_hap_indices]
cluster_hap_colors = np.array([pop_colors[p] for p in cluster_hap_pops])
graph = hapclust.graph_haplotype_network(
cluster_haps, hap_colors=cluster_hap_colors, network_method='mjn',
edge_weight=12, overlap=False,
show_node_labels=2, fontsize='8', variant_labels=variant_labels_vgsc)
graph
Out[73]:
In [74]:
fig = plt.figure(figsize=(12, 10), dpi=120)
ax = fig.add_subplot(2, 2, 1)
ax.set_axis_off()
ax.set_title('default options')
hapclust.plot_graphviz(graph, ax)
ax = fig.add_subplot(2, 2, 2)
ax.set_axis_off()
ax.set_title('higher res, default interpolation (bilinear)')
hapclust.plot_graphviz(graph, ax, dpi=600)
ax = fig.add_subplot(2, 2, 3)
# leave frame around to allow comparison with ratio='fill' below
ax.set_xticks([])
ax.set_yticks([])
ax.set_title('higher res, lanczos interpolation')
hapclust.plot_graphviz(graph, ax, dpi=600, interpolation='lanczos')
# using ratio='fill' means graphviz will scale the graph to fill the available space
ax = fig.add_subplot(2, 2, 4)
ax.set_xticks([])
ax.set_yticks([])
ax.set_title('ratio=fill')
hapclust.plot_graphviz(graph, ax, dpi=600, ratio='fill', interpolation='lanczos')
fig.tight_layout()
In [ ]: