In [1]:
%run setup.ipynb
%matplotlib inline
import hapclust
In [2]:
# load data
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
In [3]:
tbl_variants_selected = etl.frompickle('../data/tbl_variants_missense_selected.pkl')
tbl_variants_selected
Out[3]:
In [4]:
tbl_selected_redux = (
tbl_variants_selected
.cut('POS', 'REF', 'ALT', 'AGAP004707-RA')
.mergeduplicates(key=('POS'))
.convert('ALT', lambda v: ','.join(v) if len(v) > 1 else v)
.addfield('label', lambda rec: '%s:%s>%s %s' % (rec.POS, rec.REF, rec.ALT.ljust(3), rec['AGAP004707-RA'].rjust(6)))
.sort('POS')
)
tbl_selected_redux.display(vrepr=repr)
In [5]:
lbl_vgsc_missense = tbl_selected_redux.values('label').list()
lbl_vgsc_missense
Out[5]:
In [6]:
pos_selected = tbl_selected_redux.values('POS').array()
pos_selected
Out[6]:
In [7]:
loc_vgsc_missense = pos.locate_keys(pos_selected)
np.count_nonzero(loc_vgsc_missense)
Out[7]:
In [8]:
haps_vgsc_missense = haps[loc_vgsc_missense]
haps_vgsc_missense
Out[8]:
In [9]:
ds_pw_haplotype_age = np.load('../data/pairwise_haplotype_age.npz')
pw_t_hat = ds_pw_haplotype_age['t_hat']
In [10]:
with open('../data/clust_dict.pickle', mode='rb') as f:
clust_dict = pickle.load(f)
In [11]:
# read in haplotype metadata to get population
df_haplotypes = phase1_ar31.df_haplotypes
df_haplotypes = df_haplotypes[df_haplotypes.population != 'colony']
df_haplotypes.head()
Out[11]:
In [12]:
populations = phase1_ar3.pop_ids
pop_colors = phase1_ar3.pop_colors
pop_labels = phase1_ar3.pop_labels
In [13]:
lbl_vgsc_missense
Out[13]:
In [14]:
# N.B., split colors of multiallelics
def plot_haplotypes_split(h, mut_labels, ax=None):
if ax is None:
fig, ax = plt.subplots()
# copy haplotypes for display
h = h.copy()
# colors for colormap
mycol = ['r', 'w', 'k', 'b']
# make colormap
cake = mpl.colors.ListedColormap(mycol, name='mymap', N=4)
# plot
ax.pcolormesh(np.asarray(h[::-1]), cmap=cake, vmin=-1, vmax=2)
ax.set_yticks(np.arange(h.shape[0])+.5)
ax.set_yticklabels(mut_labels[::-1],family = 'monospace', fontsize=7)
ax.hlines(np.arange(h.shape[0]+1), 0, h.shape[1], color='k', lw=.5)
ax.set_xlim(0, h.shape[1])
ax.set_ylim(0, h.shape[0])
ax.yaxis.tick_left()
ax.set_xlabel('haplotypes')
ax.set_xticks(list(range(0, h.shape[1], 200)) + [h.shape[1]])
ax.xaxis.tick_bottom()
In [15]:
plot_haplotypes_split(haps_vgsc_missense, lbl_vgsc_missense)
In [16]:
cluster_names = {
1 : 'F4',
2 : 'F3',
3 : 'F1',
4 : 'L2',
5 : 'L1',
7 : 'S4/5',
8 : 'F2',
9 : 'F5',
10 : 'S2',
12 : 'S1',
13 : 'S3'
}
In [34]:
def plot_cladogram(dist, yscale='symlog', yscale_kws=None, ylim=(10, 1e6), count_sort=True, cut_height=2e3,
linkage_method='average', n_clusters=14, fill_threshold=0, leaf_height=0, linewidth=2,
figsize=(9, 6), dpi=300):
# perform hierarchical clustering
z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)
# needed for getting leaves in right order
r = scipy.cluster.hierarchy.dendrogram(
z, count_sort=count_sort, no_plot=True)
# setup figure
fig = plt.figure(figsize=figsize, dpi=120)
gs = mpl.gridspec.GridSpec(nrows=3, ncols=1, height_ratios=[6, 5, 0.4], hspace=0)
# setup vspans - find clusters
f = scipy.cluster.hierarchy.fcluster(z, cut_height, criterion='distance')
# compute cluster sizes
fsz = np.bincount(f)
# sort largest first
fsort = np.argsort(fsz)[::-1]
# take largest n
fsort = fsort[:n_clusters]
# get haplotype indices for each cluster
clusters = [set(np.nonzero(f == i)[0]) for i in fsort]
clusters_leaves = [sorted([r['leaves'].index(i) for i in cluster]) for cluster in clusters]
ixs = np.argsort([min(cl) for cl in clusters_leaves])
clusters = [clusters[i] for i in ixs]
clusters_leaves = [clusters_leaves[i] for i in ixs]
#------------------------------------------------------------------------------------------
# plot cladogram
ax = fig.add_subplot(gs[0])
sns.despine(ax=ax, offset=3, bottom=True, top=False)
colors = [phase1_ar3.pop_colors[p] for p in df_haplotypes.population]
hapclust.cladogram(z, count_sort=True, colors=colors,
fill_threshold=fill_threshold, leaf_height=leaf_height,
plot_kws=dict(linewidth=linewidth),
fill_kws=dict(linewidth=linewidth),
ax=ax)
ax.set_ylim(*ylim)
if yscale_kws is None:
yscale_kws = dict()
ax.set_yscale(yscale, **yscale_kws)
# ax.set_ylim(bottom=-1000)
xmin, xmax = ax.xaxis.get_data_interval()
xticklabels = np.array(list(range(0, len(r['leaves']), 200)) + [len(r['leaves'])])
xticks = xticklabels / len(r['leaves'])
xticks = (xticks * (xmax - xmin)) + xmin
#ax.set_xticks([])
ax.set_xticklabels(xticklabels)
ax.set_xlabel('Haplotypes')
ax.xaxis.set_label_position('top')
ax.set_ylabel('Haplotype age (generations)')
ax.grid(axis='y', linestyle='--', zorder=-1e9, linewidth=.5)
ax.axhline(cut_height, linestyle='--', linewidth=1, color='r')
#legend
allpop = list(phase1_ar3.pop_colors.keys())
ninepop = allpop.remove('colony')
#ninecol = [phase1_ar3.pop_colors[p] for p in ninepop]
handles = [mpl.patches.Patch(color=pop_colors[pop], label=pop_labels[pop]) for pop in populations]
ax.legend(handles=handles, loc='upper right', bbox_to_anchor=(1, 1), ncol=5, prop={'size':7.5}, frameon=True, framealpha=1)
#------------------------------------------------------------------------------------------
# plot display haplotypes
ax = fig.add_subplot(gs[1])
plot_haplotypes_split(haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense, ax=ax)
ax.set_xlim(0, len(r['leaves']))
ax.set_xticks([])
ax.set_xlabel('')
ax.set_ylabel('Non-synonymous SNPs')
if cut_height:
for i, cluster_leaves in enumerate(clusters_leaves):
if i in [1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13]:
x1, x2 = min(cluster_leaves), max(cluster_leaves)
ax.axvline(x1, color='k', linestyle='--', zorder=20)
ax.axvline(x2, color='k', linestyle='--', zorder=20)
ax.axvspan(x1, x2, color='k', zorder=20, alpha=.1)
#------------------------------------------------------------------------------------------
# KDR haplotype clusters
ax_clu = fig.add_subplot(gs[2])
sns.despine(ax=ax_clu, bottom=True, left=True)
ax_clu.set_xlim(0, 1530)
ax_clu.set_ylim(0, 1)
if cut_height:
for i, cluster_leaves in enumerate(clusters_leaves):
if i in [1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13]:
xmin, xmax = min(cluster_leaves), max(cluster_leaves)
fraction = -20 / (xmax - xmin)
ax_clu.annotate("", ha='left', va='center',
xy=(xmin, 1), xycoords='data',
xytext=(xmax, 1), textcoords='data',
arrowprops=dict(arrowstyle="-",
connectionstyle="bar,fraction=%.4f" % fraction,
),
)
ax_clu.text((xmax + xmin)/2, 0, cluster_names[i], va='top', ha='center', fontsize=8)
ax_clu.set_xticks([])
ax_clu.set_yticks([])
#------------------------------------------------------------------------------------------
gs.tight_layout(fig, h_pad=-0.2)
fn = '../artwork/fig_hap_tree.png'
fig.savefig(fn, jpeg_quality=100, dpi=dpi, bbox_inches='tight')
In [35]:
plot_cladogram(pw_t_hat, fill_threshold=0, linewidth=0.7, leaf_height=0,
figsize=(10, 6), dpi=300,
ylim=(0, 2e6), yscale='symlog', yscale_kws=dict(linthreshy=100, linscaley=1.0, subsy=list(range(1, 10))))
In [ ]: