In [1]:
%run setup.ipynb
%matplotlib inline
import hapclust
In [2]:
# load phased haplotype data for the VGSC region (prepared upstream)
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])  # variants x haplotypes
pos = allel.SortedIndex(callset_haps['POS'])  # sorted variant positions
ann = callset_haps['ANN']  # per-variant annotations (snpEff fields, bytes dtype)
In [3]:
# positions of the two core kdr SNPs (adjacent sites); _s and _f presumably
# denote the serine and phenylalanine resistance variants — confirm nomenclature
pos_kdr_s = 2422651
pos_kdr_f = 2422652
In [4]:
collections.Counter(ann['Annotation'])
Out[4]:
In [5]:
# perform allele count - needed to locate singletons
ac = haps.count_alleles(max_allele=3)
# show the first row as a sanity check
ac[:1]
Out[5]:
In [6]:
# load the table of selected missense variants (petl table, prepared upstream)
tbl_variants_selected = etl.frompickle('../data/tbl_variants_missense_selected.pkl')
tbl_variants_selected
Out[6]:
In [7]:
# reduce to one row per position with a compact display label:
# "POS:REF>ALT  <AGAP004707-RA amino-acid change>"
tbl_selected_redux = (
    tbl_variants_selected
    .cut('POS', 'REF', 'ALT', 'AGAP004707-RA')
    # collapse rows sharing a position (multi-allelic sites)
    .mergeduplicates(key=('POS'))
    # merged ALT values come back as a sequence; join for display
    .convert('ALT', lambda v: ','.join(v) if len(v) > 1 else v)
    .addfield('label', lambda rec: '%s:%s>%s %s' % (rec.POS, rec.REF, rec.ALT.ljust(3), rec['AGAP004707-RA'].rjust(6)))
    .sort('POS')
)
tbl_selected_redux.display(vrepr=repr)
In [8]:
# display labels for the selected missense variants, in position order
lbl_vgsc_missense = tbl_selected_redux.values('label').list()
lbl_vgsc_missense
Out[8]:
In [9]:
# positions of the selected missense variants
pos_selected = tbl_selected_redux.values('POS').array()
pos_selected
Out[9]:
In [10]:
# boolean mask locating the selected missense variants within the full callset
loc_vgsc_missense = pos.locate_keys(pos_selected)
np.count_nonzero(loc_vgsc_missense)
Out[10]:
In [11]:
# haplotypes at the selected missense variants only (used for display panels)
haps_vgsc_missense = haps[loc_vgsc_missense]
haps_vgsc_missense
Out[11]:
Out[11]:
Here we divide the haplotype data into two sets: an "EHH" set containing only neutral biallelic variants with no singletons, which we'll use for analysis of haplotype sharing, and a "mut" set containing singletons and putatively non-neutral biallelic variants, which we'll use for analysis of mutations on shared haplotypes.
In [12]:
# define types of variants to include in EHH analysis - should be mostly neutral
# (boolean mask over all variants; values are bytes because the annotations
# were loaded from a numpy .npz archive)
loc_type_neutral = ((ann['Annotation'] == b'intergenic_region') |
                    (ann['Annotation'] == b'intron_variant') |
                    (ann['Annotation'] == b'downstream_gene_variant') |
                    (ann['Annotation'] == b'upstream_gene_variant') |
                    (ann['Annotation'] == b'synonymous_variant') |
                    (ann['Annotation'] == b'3_prime_UTR_variant') |
                    (ann['Annotation'] == b'5_prime_UTR_variant')
                    )
np.count_nonzero(loc_type_neutral), loc_type_neutral.shape
Out[12]:
In [13]:
# locate singletons - will exclude from EHH analysis
# NB the EHH analysis doesn't need the multiallelics
# sgl_bi  = biallelic sites where the minor allele is a singleton
# nosgl_bi = biallelic sites where the minor allele count is > 1
loc_sgl_bi = (ac[:, :2].min(axis=1) == 1) & (ac.is_biallelic_01())
loc_nosgl_bi = (ac[:, :2].min(axis=1) > 1) & (ac.is_biallelic_01())
np.count_nonzero(loc_sgl_bi), np.count_nonzero(loc_nosgl_bi), loc_nosgl_bi.shape
Out[13]:
In [14]:
# these are the variants to use for EHH
# (neutral annotation AND biallelic AND no singletons)
loc_ehh = loc_type_neutral & loc_nosgl_bi
np.count_nonzero(loc_ehh), loc_ehh.shape
Out[14]:
In [15]:
# these are the variants to use for mutational distance
# include non-neutral mutations
# (singleton biallelic OR any non-neutral annotation)
loc_mut = loc_sgl_bi | ~loc_type_neutral
np.count_nonzero(loc_mut), loc_mut.shape
Out[15]:
In [16]:
# subset haplotypes and positions for the mutation ("mut") analysis
haps_mut = haps[loc_mut]
pos_mut = pos[loc_mut]
In [17]:
haps_mut
Out[17]:
In [18]:
# subset haplotypes and positions for the EHH analysis
haps_ehh = haps[loc_ehh]
pos_ehh = pos[loc_ehh]
In [19]:
# check: both core kdr SNPs are present in the mutation set
pos_mut.locate_key(pos_kdr_s), pos_mut.locate_key(pos_kdr_f)
Out[19]:
In [20]:
# read in haplotype metadata to get population
df_haplotypes = phase1_ar31.df_haplotypes
# exclude colony samples, keeping wild-caught populations only
df_haplotypes = df_haplotypes[df_haplotypes.population != 'colony']
df_haplotypes.head()
Out[20]:
In [21]:
# set core SNP
# use the kdr 'F' variant position as the core for flank splitting below
core_pos = pos_kdr_f
In [22]:
# split the EHH dataset
# partition variants into the two flanks either side of the core SNP
dist_ehh_right, dist_ehh_left, haps_ehh_right, haps_ehh_left = hapclust.split_flanks(haps_ehh, pos_ehh, core_pos)
In [23]:
# these give the distance from the core position to each downstream variant, moving away from the core
dist_ehh_right
Out[23]:
In [24]:
# sanity check: flank sizes and minimum distances from the core
dist_ehh_right.shape, dist_ehh_left.shape, dist_ehh_right.min(), dist_ehh_left.min()
Out[24]:
In [25]:
# split the mutations dataset
dist_mut_right, dist_mut_left, haps_mut_right, haps_mut_left = hapclust.split_flanks(haps_mut, pos_mut, core_pos)
In [26]:
dist_mut_right
Out[26]:
In [27]:
dist_mut_right.shape, dist_mut_left.shape, dist_mut_right.min(), dist_mut_left.min()
Out[27]:
In [28]:
# all four flank arrays should share the haplotype dimension
haps_ehh_left.shape, haps_ehh_right.shape, haps_mut_left.shape, haps_mut_right.shape
Out[28]:
In [29]:
# genome accessibility mask for chromosome arm 2L
is_accessible = phase1_ar3.accessibility['2L/is_accessible'][:]
In [30]:
# nearest-neighbour haplotype sharing per flank: returns a haplotype sort
# order plus shared prefix lengths, shared physical distances and mutation
# counts — presumably; confirm against the hapclust source
nn_idx_sorted_right, nn_spl_right, nn_spd_right, nn_muts_right = hapclust.neighbour_haplotype_sharing(haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right)
In [31]:
nn_idx_sorted_left, nn_spl_left, nn_spd_left, nn_muts_left = hapclust.neighbour_haplotype_sharing(haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left)
In [32]:
# range checks on the neighbour-sharing outputs
nn_spl_right.min(), nn_spl_right.max()
Out[32]:
In [33]:
nn_spd_right.min(), nn_spd_right.max(), nn_spd_left.min(), nn_spd_left.max()
Out[33]:
In [34]:
nn_muts_right.min(), nn_muts_right.max(), nn_muts_left.min(), nn_muts_left.max()
Out[34]:
In [35]:
# compute accessible lengths - needed for mutation analysis
nn_spd_right_accessible = hapclust.haplotype_accessible_length(nn_spd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
nn_spd_left_accessible = hapclust.haplotype_accessible_length(nn_spd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')
In [36]:
def nn_accessibility_diagnostics():
    """Scatter accessible vs physical neighbour shared-haplotype length, both flanks.

    Points below the y = x reference line show how much of each shared
    haplotype falls in inaccessible sequence; a systematic left/right
    difference would indicate asymmetric accessibility around the core SNP.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(nn_spd_right, nn_spd_right_accessible, linestyle=' ', marker='o',
            markersize=4, label='Right flank')
    ax.plot(nn_spd_left, nn_spd_left_accessible, linestyle=' ', marker='o',
            markersize=4, label='Left flank')
    ax.set_xlabel('Physical length')
    ax.set_ylabel('Accessible length')
    # Fix: axis limits use the maximum over BOTH flanks; previously only the
    # last-plotted (left) flank's max was used, which could clip right-flank points.
    lim = 0, max(nn_spd_right.max(), nn_spd_left.max())
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    # y = x reference (fully accessible)
    ax.plot(lim, lim, 'k--')
    ax.legend()
    ax.set_title('Neighbour shared haplotype length (bp)')
    fig.tight_layout()
nn_accessibility_diagnostics()
In [37]:
# unit-conversion sanity check: 1 cM/Mb expressed in Morgans per bp
1 / (1e2 * 1e6)
Out[37]:
In [38]:
# assume constant recombination rate
rr_right = 1.5e-8  # Morgans per bp
# adjust recombination rate on left flank (factor derived from pairwise analysis below)
rr_left = rr_right * 0.37
# assumed mutation rate
mu_right = 3.5e-9  # per bp — presumably per generation; confirm source
# adjust mutation rate on left flank (factor derived from pairwise analysis below)
mu_left = mu_right * 0.61
In [39]:
def plot_nn_right_flank():
    """Figure: neighbour haplotype sharing on the right (downstream) flank."""
    # population colours in the right-flank neighbour sort order
    pops_right = df_haplotypes.population[nn_idx_sorted_right]
    pop_colors_right = [phase1_ar3.pop_colors[p] for p in pops_right]
    fig = plt.figure(figsize=(20, 10))
    hapclust.fig_neighbour_haplotype_sharing(nspd=nn_spd_right,
                                             nspd_accessible=nn_spd_right_accessible,
                                             muts=nn_muts_right,
                                             haps_display=haps_vgsc_missense[:, nn_idx_sorted_right],
                                             haps_display_vlbl=lbl_vgsc_missense,
                                             pop_colors=pop_colors_right,
                                             nspd_cut=2e4,
                                             nspd_ylim=(1e3, 3e6),
                                             that_ylim=(2e1, 5e5),
                                             muts_ylim=(0, 25),
                                             mu=mu_right, rr=rr_right,
                                             fig=fig)
    fig.suptitle('Right flank')
plot_nn_right_flank()
In [40]:
def plot_nn_left_flank():
    """Figure: neighbour haplotype sharing on the left (upstream) flank.

    Mirrors plot_nn_right_flank; keyword arguments are now passed in the
    same order as the right-flank call for consistency (order has no effect
    on behaviour since all arguments are named).
    """
    # population colours in the left-flank neighbour sort order
    pops_left = df_haplotypes.population[nn_idx_sorted_left]
    pop_colors_left = [phase1_ar3.pop_colors[p] for p in pops_left]
    fig = plt.figure(figsize=(20, 10))
    hapclust.fig_neighbour_haplotype_sharing(nspd=nn_spd_left,
                                             nspd_accessible=nn_spd_left_accessible,
                                             muts=nn_muts_left,
                                             haps_display=haps_vgsc_missense[:, nn_idx_sorted_left],
                                             haps_display_vlbl=lbl_vgsc_missense,
                                             pop_colors=pop_colors_left,
                                             nspd_cut=2e4,
                                             nspd_ylim=(1e3, 3e6),
                                             that_ylim=(2e1, 5e5),
                                             muts_ylim=(0, 25),
                                             mu=mu_left, rr=rr_left,
                                             fig=fig)
    fig.suptitle('Left flank')
plot_nn_left_flank()
In [41]:
def nn_t_hat_diagnostics():
    """Compare t-hat (haplotype age estimate) distributions on the two flanks.

    t_hat = (1 + muts) / (2 * (physical_len * rr + accessible_len * mu)),
    per neighbour pair; shown as overlaid log-scale histograms with medians
    marked as dashed vertical lines.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # left (upstream) flank; histogram excludes pairs with zero shared distance
    t_hat = (1 + nn_muts_left) / (2 * (nn_spd_left * rr_left + nn_spd_left_accessible * mu_left))
    ax.hist(t_hat[(nn_spd_left > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Upstream', color='b')
    ax.axvline(np.median(t_hat), linestyle='--', color='b', lw=2)
    # right (downstream) flank
    t_hat = (1 + nn_muts_right) / (2 * (nn_spd_right * rr_right + nn_spd_right_accessible * mu_right))
    ax.hist(t_hat[(nn_spd_right > 0)], bins=np.logspace(1, 6, 50), histtype='step', lw=2, label='Downstream', color='g')
    ax.axvline(np.median(t_hat), linestyle='--', color='g', lw=2)
    ax.set_xscale('log')
    # raw string: '\h' is an invalid escape sequence in a plain string literal
    ax.set_xlabel(r'$\hat{t}$')
    ax.set_ylabel('Frequency (no. pairs)')
    ax.legend()
nn_t_hat_diagnostics()
In [42]:
# pairwise haplotype sharing (all haplotype pairs), right flank;
# jitter disabled — presumably keeps raw distances deterministic, confirm in hapclust
pw_spl_right, pw_spd_right, pw_muts_right = hapclust.pairwise_haplotype_sharing(haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right, jitter=False)
In [43]:
pw_spl_left, pw_spd_left, pw_muts_left = hapclust.pairwise_haplotype_sharing(haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left, jitter=False)
In [44]:
# condensed vectors: one value per haplotype pair
pw_spl_right.shape, pw_spl_left.shape
Out[44]:
In [45]:
pw_spd_right.min(), pw_spd_right.max(), pw_spd_left.min(), pw_spd_left.max()
Out[45]:
In [46]:
# accessible lengths of the pairwise shared haplotypes
pw_spd_right_accessible = hapclust.haplotype_accessible_length(pw_spd_right, core_pos=core_pos, is_accessible=is_accessible, flank='right')
pw_spd_left_accessible = hapclust.haplotype_accessible_length(pw_spd_left, core_pos=core_pos, is_accessible=is_accessible, flank='left')
In [47]:
def pw_accessibility_diagnostics():
    """Scatter accessible vs physical pairwise shared-haplotype length, both flanks."""
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(pw_spd_right, pw_spd_right_accessible, linestyle=' ', marker='o',
            markersize=4, label='Right flank')
    ax.plot(pw_spd_left, pw_spd_left_accessible, linestyle=' ', marker='o',
            markersize=4, label='Left flank')
    ax.set_xlabel('Physical length')
    ax.set_ylabel('Accessible length')
    # Fix: axis limits use the maximum over BOTH flanks; previously only the
    # last-plotted (left) flank's max was used, which could clip right-flank points.
    lim = 0, max(pw_spd_right.max(), pw_spd_left.max())
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    # y = x reference (fully accessible)
    ax.plot(lim, lim, 'k--')
    ax.legend()
    ax.set_title('Pairwise shared haplotype length')
    fig.tight_layout()
pw_accessibility_diagnostics()
Check the recombination rate on the left vs. the right flank.
In [48]:
# seaborn default palette, used for regression lines in the plots below
palette = sns.color_palette()
sns.palplot(palette);
In [49]:
def pw_haplength_diagnostics():
    """Compare pairwise shared haplotype length (bp) on left vs right flanks.

    Fits a through-origin regression y ~ x; the fitted slope is the basis for
    the left-flank recombination-rate adjustment factor defined above.
    Returns the fitted regression parameters.
    """
    x = pw_spd_left
    y = pw_spd_right
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
    lim = 1e1, 1e7
    df = pandas.DataFrame({'x': x, 'y': y})
    # linear regression through the origin
    r = sfa.ols('y ~ x + 0', data=df).fit()
    # plot the regression line
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    # y = x reference
    ax.plot(lim, lim, 'k--')
    ax.set_xlabel('Left flank')
    ax.set_ylabel('Right flank')
    ax.set_title('Shared haplotype length (bp)')
    ax.set_xscale('log')
    ax.set_yscale('log')
    return r.params
pw_haplength_diagnostics()
Out[49]:
In [50]:
def pw_haplength_diagnostics_adjusted():
    """Re-run the left/right shared-length comparison in genetic units.

    Applies the flank-specific recombination rates (rr_left, rr_right) so
    lengths are in Morgans; after adjustment the through-origin regression
    slope should be close to 1. Returns the fitted regression parameters.
    """
    # compare shared haplotype length after applying adjustment for different
    # recombination rates on left and right flanks
    x = pw_spd_left * rr_left
    y = pw_spd_right * rr_right
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
    lim = 1e-7, 1e-1
    df = pandas.DataFrame({'x': x, 'y': y})
    # regression through the origin
    r = sfa.ols('y ~ x + 0', data=df).fit()
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    # y = x reference
    ax.plot(lim, lim, 'k--')
    ax.set_xlabel('Left flank')
    ax.set_ylabel('Right flank')
    ax.set_title('Shared haplotype length (Morgan)')
    ax.set_xscale('log')
    ax.set_yscale('log')
    return r.params
pw_haplength_diagnostics_adjusted()
Out[50]:
In [51]:
# check where the heterochromatin boundary is supposed to be
# (potentially relevant to the left/right rate asymmetry seen above)
phase1_ar3.tbl_chromatin
Out[51]:
In [52]:
def pw_mutation_diagnostics():
    """Compare per-accessible-bp mutation density on left vs right flanks.

    Fits a through-origin regression y ~ x; the fitted slope is the basis
    for the left-flank mutation-rate adjustment factor defined above.
    Returns the fitted regression parameters.
    """
    # check mutation rate on left versus right flank.
    x = pw_muts_left / pw_spd_left_accessible
    y = pw_muts_right / pw_spd_right_accessible
    # keep only pairs informative on both flanks (also drops NaNs from 0/0,
    # since NaN > 0 is False)
    tst = (x > 0) & (y > 0)
    x = x[tst]
    y = y[tst]
    lim = 1e-6, 1e-1
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.despine(ax=ax, offset=5)
    ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')
    df = pandas.DataFrame({'x': x, 'y': y})
    # regression through the origin
    r = sfa.ols('y ~ x + 0', data=df).fit()
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    # y = x reference
    ax.plot(lim, lim, linestyle='--', color='k')
    ax.set_title('Mutations ($bp^{-1}$)')
    ax.set_xlabel('Left flank')
    # fix typo: label previously read 'Reft flank'
    ax.set_ylabel('Right flank')
    return r.params
pw_mutation_diagnostics()
Out[52]:
In [53]:
def pw_mutation_diagnostics_adjusted():
    """Re-run the left/right mutation comparison after rate adjustment.

    Divides by the flank-specific mutation rates (mu_left, mu_right) so the
    through-origin regression slope should be close to 1 after adjustment.
    Returns the fitted regression parameters.
    """
    # check mutations on left versus right flank after correcting mutation rate
    x = pw_muts_left / (mu_left * pw_spd_left_accessible)
    y = pw_muts_right / (mu_right * pw_spd_right_accessible)
    # keep only pairs informative on both flanks (also drops NaNs from 0/0)
    tst = (x > 0) & (y > 0)
    x = x[tst]
    y = y[tst]
    lim = 1e2, 1e7
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.despine(ax=ax, offset=5)
    ax.plot(x, y, marker='o', markersize=2, mfc='none', mec='k', linestyle=' ')
    df = pandas.DataFrame({'x': x, 'y': y})
    # regression through the origin
    r = sfa.ols('y ~ x + 0', data=df).fit()
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color=palette[2], lw=2)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    # y = x reference
    ax.plot(lim, lim, linestyle='--', color='k')
    ax.set_title('Mutations (TODO units?)')
    ax.set_xlabel('Left flank')
    # fix typo: label previously read 'Reft flank'
    ax.set_ylabel('Right flank')
    return r.params
pw_mutation_diagnostics_adjusted()
Out[53]:
In [54]:
def pw_t_hat_diagnostics():
    """Compare estimates of t-hat on left vs right flanks, using all adjustments.

    Computes per-pair t_hat on each flank, plots them against each other in
    log10 space with a through-origin regression, and returns the fitted
    regression parameters.
    """
    pw_t_hat_left = (1 + pw_muts_left) / (2 * (pw_spd_left * rr_left + pw_spd_left_accessible * mu_left))
    pw_t_hat_right = (1 + pw_muts_right) / (2 * (pw_spd_right * rr_right + pw_spd_right_accessible * mu_right))
    # work in log10 space for plotting and regression
    x = np.log10(pw_t_hat_left)
    y = np.log10(pw_t_hat_right)
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(x, y, marker='o', markersize=1, mfc='none', mec='k', linestyle=' ', alpha=.1)
    lim = 1, 6.5
    df = pandas.DataFrame({'x': x, 'y': y})
    # regression through the origin (in log space)
    r = sfa.ols('y ~ x + 0', data=df).fit()
    ax.plot(lim, [l * r.params['x'] for l in lim], linestyle='--', color='r', lw=2)
    ax.set_xlim(*lim)
    ax.set_ylim(*lim)
    # y = x reference
    ax.plot(lim, lim, 'k--')
    ax.set_xlabel('Left flank')
    ax.set_ylabel('Right flank')
    # raw string: '\h' is an invalid escape sequence in a plain string literal
    ax.set_title(r'$\hat{t}$')
    # relabel integer log ticks as powers of ten
    ticks = np.arange(lim[0], int(lim[1]) + 1)
    ticklabels = ['$10^{%s}$' % t for t in ticks]
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels)
    ax.set_yticks(ticks)
    ax.set_yticklabels(ticklabels)
    return r.params
pw_t_hat_diagnostics()
Out[54]:
In [55]:
def compute_combined_t_hat():
    """Combine both flanks into a single pairwise age estimate.

    Sets the module-level array ``pw_t_hat_both``: for each haplotype pair,
    t_hat = (1 + total mutations) / (2 * (genetic length + mutational
    opportunity)), using the flank-specific rates rr_* and mu_*.
    """
    global pw_t_hat_both
    # total mutations observed on the shared haplotype, both flanks
    pw_muts_both = pw_muts_left + pw_muts_right
    # genetic length (Morgans) of the shared haplotype, both flanks
    pw_sgl_both = pw_spd_left * rr_left + pw_spd_right * rr_right
    # expected mutational opportunity (accessible bp x mutation rate), both flanks
    pw_sml_both = pw_spd_left_accessible * mu_left + pw_spd_right_accessible * mu_right
    # NB: removed unused local pw_spd_both (was computed but never read)
    pw_t_hat_both = (1 + pw_muts_both) / (2 * (pw_sgl_both + pw_sml_both))
compute_combined_t_hat()
In [56]:
# save pairwise age estimates for use by downstream notebooks
np.savez_compressed('../data/pairwise_haplotype_age.npz',
                    t_hat=pw_t_hat_both)
In [57]:
# quick check of the output file on disk
!ls -lh ../data/pairwise_haplotype_age.npz
In [58]:
def plot_dendrogram(dist, cut_height=1e3, yscale='log', ylim=(10, 1e6), linkage_method='average',
                    n_clusters=14):
    """Plot a three-panel annotated dendrogram and return the clusters.

    Panels: (1) hierarchical-clustering dendrogram of all haplotypes, with
    the n_clusters largest clusters below cut_height shaded and numbered;
    (2) population colour bar in dendrogram leaf order; (3) missense
    genotype display (hapclust) in leaf order.

    Parameters
    ----------
    dist : condensed pairwise distance vector (e.g. pw_t_hat_both)
    cut_height : height at which to cut the tree; falsy disables cluster
        annotation (an empty list is then returned)
    yscale, ylim : dendrogram y-axis scale and limits
    linkage_method : passed to scipy.cluster.hierarchy.linkage
    n_clusters : number of largest clusters to annotate

    Returns
    -------
    list of sets of haplotype indices, ordered left to right on the figure.
    """
    z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)
    fig = plt.figure(figsize=(16, 8), )
    gs = mpl.gridspec.GridSpec(nrows=3, ncols=1, height_ratios=[6, .5, 4], hspace=0)
    # panel 1: dendrogram
    ax = fig.add_subplot(gs[0])
    sns.despine(ax=ax, offset=3, bottom=True, top=False)
    r = scipy.cluster.hierarchy.dendrogram(
        z, no_labels=True, count_sort=True,
        color_threshold=0,
        above_threshold_color='k',
        ax=ax)
    ax.set_ylim(*ylim)
    ax.set_yscale(yscale)
    # map x ticks from dendrogram leaf coordinates to haplotype counts
    xmin, xmax = ax.xaxis.get_data_interval()
    xticklabels = np.array(list(range(0, len(df_haplotypes), 200)) + [len(df_haplotypes)])
    xticks = xticklabels / len(df_haplotypes)
    xticks = (xticks * (xmax - xmin)) + xmin
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlabel('Haplotypes')
    ax.xaxis.set_label_position('top')
    # raw string: '\h' is an invalid escape sequence in a plain string literal
    ax.set_ylabel(r'$\hat{t}$', rotation=0, ha='right')
    cluster_palette = sns.color_palette('Set3', n_colors=12)
    # fix: initialise here so the final `return clusters` is safe (previously a
    # NameError) when cut_height is falsy and the annotation block is skipped
    clusters = []
    clusters_leaves = []
    if cut_height:
        ax.axhline(cut_height, linestyle='--', color='k')
        # find clusters
        f = scipy.cluster.hierarchy.fcluster(z, cut_height, criterion='distance')
        # compute cluster sizes
        fsz = np.bincount(f)
        # sort largest first
        fsort = np.argsort(fsz)[::-1]
        # take largest n
        fsort = fsort[:n_clusters]
        # get haplotype indices for each cluster
        clusters = [set(np.nonzero(f == i)[0]) for i in fsort]
        # leaf positions of each cluster's members within the dendrogram ordering
        clusters_leaves = [sorted([r['leaves'].index(i) for i in cluster])
                           for cluster in clusters]
        # re-order clusters left to right as drawn
        ixs = np.argsort([min(cl) for cl in clusters_leaves])
        clusters = [clusters[i] for i in ixs]
        clusters_leaves = [clusters_leaves[i] for i in ixs]
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            # scipy spaces dendrogram leaves 10 coordinate units apart, hence *10
            ax.fill_between([x1*10, x2*10], 0, cut_height, color=color, alpha=.4, zorder=20)
            ax.text((x1*10 + x2*10) / 2, ylim[0], str(i), ha='center', va='top')
    # panel 2: population colour bar in dendrogram leaf order
    ax = fig.add_subplot(gs[1], )
    sns.despine(ax=ax, left=True, bottom=True)
    pops = df_haplotypes.population[r['leaves']]
    pop_colors = [phase1_ar3.pop_colors[p] for p in pops]
    ax.broken_barh([(i, 1) for i in range(len(df_haplotypes))], yrange=(0, 1), color=pop_colors)
    ax.set_xlim(0, len(df_haplotypes))
    ax.set_yticks([])
    ax.set_ylabel('Population', rotation=0, ha='right', va='center')
    ax.set_xticks([])
    if cut_height:
        # mark cluster boundaries
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            ax.axvline(x1, color='k', linestyle='--', zorder=20)
            ax.axvline(x2, color='k', linestyle='--', zorder=20)
    # panel 3: missense genotypes in dendrogram leaf order
    ax = fig.add_subplot(gs[2])
    hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
    ax.set_xlim(0, len(df_haplotypes))
    ax.set_xlabel('Haplotypes')
    if cut_height:
        # mark and shade cluster extents
        for i, cluster_leaves in enumerate(clusters_leaves):
            color = cluster_palette[i % len(cluster_palette)]
            x1, x2 = min(cluster_leaves), max(cluster_leaves)
            ax.axvline(x1, color='k', linestyle='--', zorder=20)
            ax.axvline(x2, color='k', linestyle='--', zorder=20)
            ax.axvspan(x1, x2, color=color, zorder=20, alpha=.4)
    gs.tight_layout(fig, h_pad=0)
    return clusters
In [59]:
# canonical dendrogram we'll use for cluster analyses
clusters = plot_dendrogram(pw_t_hat_both, cut_height=2e3)
In [60]:
# number of clusters annotated on the figure
len(clusters)
Out[60]:
In [61]:
# manually assigned names for clusters of interest, keyed by the index number
# printed on the dendrogram; presumably F*/S* denote kdr F and S backgrounds
# and L* others — confirm. Unlisted indices (e.g. 0, 6, 11) are dropped.
cluster_names = {
1 : 'F4',
2 : 'F3',
3 : 'F1',
4 : 'L2',
5 : 'L1',
7 : 'S4/5',
8 : 'F2',
9 : 'F5',
10 : 'S2',
12 : 'S1',
13 : 'S3'
}
In [62]:
# map cluster name -> set of haplotype indices, for the named clusters only
clust_dict = {
    n: clusters[k] for k, n in cluster_names.items()
}
sorted(clust_dict)
Out[62]:
In [63]:
# save as pickle
import pickle
with open('../data/clust_dict.pickle', 'wb') as handle:
    pickle.dump(clust_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
In [64]:
# check pickle
# round-trip: reload and compare against the in-memory dict (expect True)
with open('../data/clust_dict.pickle', 'rb') as handle:
    b = pickle.load(handle)
print(clust_dict == b)
In [65]:
# build parallel lists: the cluster name and haplotype index for every
# haplotype belonging to a named cluster
all_idx = []
all_clu = []
for cluster_name, members in clust_dict.items():
    members_sorted = sorted(members)
    all_clu.extend([cluster_name] * len(members_sorted))
    all_idx.extend(members_sorted)
len(all_idx), len(all_clu)
Out[65]:
In [66]:
# write cluster membership out as a CSV text file
clus_df = pandas.DataFrame({'cluster': all_clu, 'haplotype': all_idx})
clus_df.to_csv('../data/clusters.csv', index=False)